From 644d57d53191b94d9e50a4765891c498790d924b Mon Sep 17 00:00:00 2001 From: CSWYF3634076 <58356743+CSWYF3634076@users.noreply.github.com> Date: Wed, 27 Aug 2025 12:02:55 +0800 Subject: [PATCH 001/125] [Model] Add Ernie4.5 VL Model Support (#22514) Signed-off-by: wangyafeng --- docs/models/supported_models.md | 1 + examples/offline_inference/vision_language.py | 32 + requirements/test.in | 1 + requirements/test.txt | 3 + .../multimodal/processing/test_common.py | 1 + tests/models/registry.py | 2 + .../rotary_embedding/ernie45_vl_rope.py | 72 + .../layers/rotary_embedding/mrope.py | 123 ++ vllm/model_executor/models/ernie45_vl.py | 1504 +++++++++++++++++ vllm/model_executor/models/ernie45_vl_moe.py | 723 ++++++++ vllm/model_executor/models/registry.py | 1 + 11 files changed, 2463 insertions(+) create mode 100644 vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py create mode 100644 vllm/model_executor/models/ernie45_vl.py create mode 100644 vllm/model_executor/models/ernie45_vl_moe.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 74f3a9d1cdb5..19ce8c06724f 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -616,6 +616,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I+ | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | ✅︎ | | `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | ✅︎ | | `DonutForConditionalGeneration`^ | Donut | T + I | `ByteDance/Dolphin`, `naver-clova-ix/donut-base-finetuned-docvqa`, etc. | | | | +| `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | ✅︎ | | `Florence2ForConditionalGeneration` | Florence-2 | T + I | `microsoft/Florence-2-base`, `microsoft/Florence-2-large`, etc. | | | | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. 
| ✅︎ | ✅︎ | ⚠️ | diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 8d97ba266826..4e879666f61d 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -173,6 +173,37 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: ) +# Ernie4.5-VL +def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData: + model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT" + + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={modality: 1}, + trust_remote_code=True, + ) + + if modality == "image": + placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>" + elif modality == "video": + placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>" + + prompts = [ + ( + f"<|begin_of_sentence|>User: {question}{placeholder}\n" + "Assistant: " + ) + for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + + # Florence2 def run_florence2(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" @@ -1602,6 +1633,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData: "chameleon": run_chameleon, "command_a_vision": run_command_a_vision, "deepseek_vl_v2": run_deepseek_vl2, + "ernie45_vl": run_ernie45_vl, "florence2": run_florence2, "fuyu": run_fuyu, "gemma3": run_gemma3, diff --git a/requirements/test.in b/requirements/test.in index 098a9242bc3a..92c577c50163 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -54,3 +54,4 @@ runai-model-streamer-s3==0.11.0 fastsafetensors>=0.1.10 pydantic>=2.10 # 2.9 leads to error on python 3.10 terratorch==1.1rc2 # required for PrithviMAE test +decord==0.6.0 diff --git a/requirements/test.txt b/requirements/test.txt index 8b872752d875..0c27c9bb67e8 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -156,6 +156,8 @@ datasets==3.0.2 # mteb decorator==5.1.1 # via librosa +decord==0.6.0 + # via -r requirements/test.in dill==0.3.8 # via # datasets @@ -493,6 +495,7 @@ numpy==1.26.4 # contourpy # cupy-cuda12x # datasets + # decord # einx # encodec # evaluate diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 74ca10d32609..6361cb9b5586 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -272,6 +272,7 @@ def _test_processing_correctness_one( "CohereLabs/command-a-vision-07-2025", "deepseek-ai/deepseek-vl2-tiny", "naver-clova-ix/donut-base-finetuned-docvqa", + "baidu/ERNIE-4.5-VL-28B-A3B-PT", "microsoft/Florence-2-base", "adept/fuyu-8b", "google/gemma-3-4b-it", diff --git a/tests/models/registry.py b/tests/models/registry.py index 20c7c3af6776..f2c09d3e8452 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -396,6 +396,8 @@ def check_available_online( transformers_version_reason="HF model is not compatible.", # noqa: E501 hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"), + "Ernie4_5_VLMoeForConditionalGeneration": _HfExamplesInfo("baidu/ERNIE-4.5-VL-28B-A3B-PT", # noqa: E501 + trust_remote_code=True), "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"), "Gemma3nForConditionalGeneration": 
_HfExamplesInfo("google/gemma-3n-E2B-it", # noqa: E501 diff --git a/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py new file mode 100644 index 000000000000..05322e56f262 --- /dev/null +++ b/vllm/model_executor/layers/rotary_embedding/ernie45_vl_rope.py @@ -0,0 +1,72 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import Optional + +import torch + +from .common import apply_rotary_emb_dispatch +from .mrope import MRotaryEmbedding + + +class Ernie4_5_VLRotaryEmbedding(MRotaryEmbedding): + """3D rotary positional embedding. 3D is t:time h:height w:width""" + + def forward( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + assert positions.ndim == 1 or positions.ndim == 2 + assert key is not None + + num_tokens = positions.shape[-1] + cos_sin = self.cos_sin_cache[positions] + cos, sin = cos_sin.chunk(2, dim=-1) + if positions.ndim == 2: + assert self.mrope_section + + section_h = self.mrope_section[0] # 22 + section_w = self.mrope_section[1] # 22 + section_t = self.mrope_section[2] # 20 + assert section_h == section_w + # Split according to [h w h w h w h w... t t t...] + section_cos_t = cos[..., -section_t:] + section_cos_h = cos[..., :section_h + section_w:2] + section_cos_w = cos[..., 1:section_h + section_w:2] + + cos_t, cos_h, cos_w = section_cos_t[0], section_cos_h[ + 1], section_cos_w[2] + cos_hw = torch.stack([cos_h, cos_w], + dim=-1).reshape(cos_h.shape[:-1] + + (cos_h.shape[-1] * 2, )) + cos = torch.cat([cos_hw, cos_t], dim=-1) + + section_sin_t = sin[..., -section_t:] + section_sin_h = sin[..., :section_h + section_w:2] + section_sin_w = sin[..., 1:section_h + section_w:2] + + sin_t, sin_h, sin_w = section_sin_t[0], section_sin_h[ + 1], section_sin_w[2] + sin_hw = torch.stack([sin_h, sin_w], + dim=-1).reshape(sin_h.shape[:-1] + + (sin_h.shape[-1] * 2, )) + sin = torch.cat([sin_hw, sin_t], dim=-1) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_emb_dispatch(query_rot, cos, sin, + self.is_neox_style) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_emb_dispatch(key_rot, cos, sin, + self.is_neox_style) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key diff --git a/vllm/model_executor/layers/rotary_embedding/mrope.py b/vllm/model_executor/layers/rotary_embedding/mrope.py index a091cfb74329..e374aa9bebf9 100644 --- a/vllm/model_executor/layers/rotary_embedding/mrope.py +++ b/vllm/model_executor/layers/rotary_embedding/mrope.py @@ -393,6 +393,15 @@ def get_input_positions_tensor( context_len=context_len, seq_len=seq_len, ) + elif hf_config.model_type in ["ernie4_5_moe_vl", "ernie4_5_vl"]: + return cls._ernie_get_input_positions_tensor( + input_tokens=input_tokens, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + context_len=context_len, + seq_len=seq_len, + ) else: return cls._vl_get_input_positions_tensor( input_tokens=input_tokens, @@ -513,6 +522,120 @@ def _glm4v_get_input_positions_tensor( 
len(input_tokens)).item() return llm_positions, mrope_position_delta + @classmethod + def _ernie_get_input_positions_tensor( + cls, + input_tokens: list[int], + hf_config: PretrainedConfig, + image_grid_thw: Union[list[list[int]], torch.Tensor], + video_grid_thw: Union[list[list[int]], torch.Tensor], + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> tuple[torch.Tensor, int]: + """Get mrope input positions and delta value for Ernie VL.""" + + image_token_id = hf_config.im_patch_id + video_start_token_id = hf_config.video_start_token_id + video_end_token_id = hf_config.video_end_token_id + spatial_conv_size = hf_config.spatial_conv_size + temporal_conv_size = hf_config.temporal_conv_size + llm_pos_ids_list: list = [] + + if not (image_grid_thw is None and video_grid_thw is None): + if isinstance(image_grid_thw, torch.Tensor): + image_grid_thw = image_grid_thw.tolist() + + input_token_type: list[str] = [] + video_check_flg = False + for token in input_tokens: + if token == video_start_token_id: + video_check_flg = True + elif token == video_end_token_id: + video_check_flg = False + + if (token == image_token_id) and (video_check_flg is False): + input_token_type.append("image") + elif (token == image_token_id) and (video_check_flg is True): + input_token_type.append("video") + else: + input_token_type.append("text") + + input_type_group: list[tuple[str, int, int]] = [] + for key, group_iter in itertools.groupby( + enumerate(input_token_type), lambda x: x[1]): + group_list = list(group_iter) + start_index = group_list[0][0] + end_index = group_list[-1][0] + 1 + input_type_group.append((key, start_index, end_index)) + + video_frame_num = 1 + mm_data_idx = 0 + for modality_type, start_idx, end_idx in input_type_group: + st_idx = llm_pos_ids_list[-1].max() + 1 if len( + llm_pos_ids_list) > 0 else 0 + if modality_type == "image": + t, h, w = ( + image_grid_thw[mm_data_idx][0], + image_grid_thw[mm_data_idx][1], + image_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = \ + t, h // spatial_conv_size, w // spatial_conv_size + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand( + llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand( + llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx) + mm_data_idx += 1 + + elif modality_type == "video": + t, h, w = ( + video_grid_thw[mm_data_idx][0], + video_grid_thw[mm_data_idx][1], + video_grid_thw[mm_data_idx][2], + ) + llm_grid_t, llm_grid_h, llm_grid_w = (t // + temporal_conv_size, + h // + spatial_conv_size, + w // + spatial_conv_size) + + for t_idx in range(llm_grid_t): + t_index = torch.tensor(t_idx).view(-1, 1).expand( + -1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view( + 1, -1, 1).expand(1, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view( + 1, 1, -1).expand(1, llm_grid_h, -1).flatten() + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + st_idx) + + mm_data_idx += 1 + video_frame_num += 1 + + else: + text_len = end_idx - start_idx + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + + st_idx) + video_frame_num = 1 + + else: + text_len = len(input_tokens) + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1)) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + llm_positions 
= llm_positions[:, context_len:seq_len] + mrope_position_delta = (llm_positions.max() + 1 - + len(input_tokens)).item() + return llm_positions, mrope_position_delta + @classmethod def _vl_get_input_positions_tensor( cls, diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py new file mode 100644 index 000000000000..d880fc434e20 --- /dev/null +++ b/vllm/model_executor/models/ernie45_vl.py @@ -0,0 +1,1504 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Baidu team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Erine VL model compatible with HuggingFace weights.""" +import math +from collections.abc import Iterable, Mapping, Sequence +from functools import partial +from typing import Any, Callable, Literal, Optional, TypedDict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat +from transformers import BatchFeature + +from vllm.config import VllmConfig +from vllm.distributed import parallel_state +from vllm.distributed import utils as dist_utils +from vllm.logger import init_logger +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.activation import QuickGELU +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalKwargsItems) +from vllm.multimodal.parse import ImageSize, MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptUpdate) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.platforms import _Backend, current_platform +from vllm.sequence import IntermediateTensors + +from .ernie45_vl_moe import Ernie4_5_VLMoeForCausalLM +from .interfaces import (MultiModalEmbeddings, SupportsLoRA, + SupportsMultiModal, SupportsPP) +from .utils import (AutoWeightsLoader, WeightsMapper, maybe_prefix, + merge_multimodal_embeddings) +from .vision import get_vit_attn_backend + +logger = init_logger(__name__) + +_MAX_FRAMES_PER_VIDEO = 16 + +# === Vision Transformer === # + + +def rotate_half(x: torch.Tensor, interleaved: bool = False) -> 
torch.Tensor: + if not interleaved: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1, x2 = x[..., ::2], x[..., 1::2] + return rearrange(torch.stack((-x2, x1), dim=-1), + "... d two -> ... (d two)", + two=2) + + +def apply_rotary_emb_torch(x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + interleaved: bool = False) -> torch.Tensor: + """ + x: (batch_size, seqlen, nheads, headdim) + cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) + """ + ro_dim = cos.shape[-1] * 2 + assert ro_dim <= x.shape[-1] + cos = repeat( + cos, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + sin = repeat( + sin, + "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)") + return torch.cat( + [ + x[..., :ro_dim] * cos + + rotate_half(x[..., :ro_dim], interleaved) * sin, x[..., ro_dim:] + ], + dim=-1, + ) + + +def apply_rotary_pos_emb_vision(t: torch.Tensor, + freqs: torch.Tensor) -> torch.Tensor: + t_ = t.float() + cos = freqs.cos() + sin = freqs.sin() + apply_rotary_emb = apply_rotary_emb_torch + if current_platform.is_cuda(): + from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb + output = apply_rotary_emb(t_, cos, sin).type_as(t) + return output + + +def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int): + """All-gather the input tensor interleavely across model parallel group.""" + import torch.distributed as dist + gathered_tensors = [torch.zeros_like(local_tensor) for _ in range(tp_size)] + dist.all_gather(gathered_tensors, + local_tensor, + group=parallel_state.get_tp_group().device_group) + + gathered_tensors_split = [ + torch.split(tensor, hidden_size // tp_size, -1) + for tensor in gathered_tensors + ] + ordered_tensors = [ + tensor for pair in zip(*gathered_tensors_split) for tensor in pair + ] + result_tensor = torch.cat(ordered_tensors, dim=-1) + return result_tensor + + +class Ernie4_5_VisionAttention(nn.Module): + """VisionAttention using VLLM framework APIs""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + projection_size: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + # Per attention head and per partition values. + self.tp_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() + self.hidden_size_per_attention_head = dist_utils.divide( + projection_size, num_heads) + self.num_attention_heads_per_partition = dist_utils.divide( + num_heads, self.tp_size) + + self.qkv = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.hidden_size_per_attention_head, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv") + self.proj = RowParallelLinear(input_size=projection_size, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.proj") + + # Detect attention implementation. + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + if self.attn_backend not in { + _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS, + _Backend.ROCM_AITER_FA + }: + raise RuntimeError( + f"Ernie45-VL does not support {self.attn_backend} backend now." 
+ ) + self.is_flash_attn_backend = self.attn_backend in { + _Backend.FLASH_ATTN, _Backend.ROCM_AITER_FA + } + + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = all_gather_interleave(qkv, self.qkv.hidden_size, + self.tp_size) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] + q, k, v = qkv.chunk(3, dim=2) + + # 3 * [s, b, head * head_dim] + if self.tp_size > 1: + splitter = partial(dist_utils.split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + v = splitter(v)[self.tp_rank] + + # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] + new_shape = (seq_len, bs, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = None, # Only used for xFormers + ) -> torch.Tensor: + # [s, b, c] --> [s, b, head * 3 * head_dim] + x, _ = self.qkv(x) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) + batch_size = q.shape[1] + + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) + if rotary_pos_emb is not None: + q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) + k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) + + if self.is_flash_attn_backend: + # from vllm_flash_attn.flash_attn_interface import ( + # flash_attn_varlen_func) + if self.attn_backend == _Backend.ROCM_AITER_FA: + from aiter import flash_attn_varlen_func + else: + from flash_attn import flash_attn_varlen_func + + q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) + + output = flash_attn_varlen_func(q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + dropout_p=0.0, + causal=False) + + context_layer = rearrange(output, + "(b s) ... -> b s ...", + b=batch_size) + elif self.attn_backend == _Backend.TORCH_SDPA: + # Execute attention entry by entry for speed & less VRAM. 
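# Editor's note -- illustrative sketch, not part of this patch. The SDPA branch
# below relies on `cu_seqlens` (cumulative patch counts per image/frame): each
# slice [cu_seqlens[i-1]:cu_seqlens[i]] is attended to on its own, which avoids
# materializing a block-diagonal mask over all visual tokens. A minimal,
# self-contained version of that idea with made-up shapes:

import torch
import torch.nn.functional as F

def sdpa_varlen(q, k, v, cu_seqlens):
    """q/k/v: (1, total_tokens, num_heads, head_dim); cu_seqlens: (n_items+1,)."""
    outs = []
    for i in range(1, len(cu_seqlens)):
        s, e = cu_seqlens[i - 1], cu_seqlens[i]
        # (1, seq, heads, dim) -> (1, heads, seq, dim), the layout SDPA expects
        qi, ki, vi = (x[:, s:e].transpose(1, 2) for x in (q, k, v))
        oi = F.scaled_dot_product_attention(qi, ki, vi)
        outs.append(oi.transpose(1, 2))
    return torch.cat(outs, dim=1)

q = k = v = torch.randn(1, 6, 2, 8)                   # two items: 4 + 2 tokens
out = sdpa_varlen(q, k, v, torch.tensor([0, 4, 6]))   # same shape as q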
+ outputs = [] + for i in range(1, len(cu_seqlens)): + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = (rearrange(x, "b s h d -> b h s d") + for x in [q_i, k_i, v_i]) + output_i = F.scaled_dot_product_attention(q_i, + k_i, + v_i, + dropout_p=0.0) + output_i = rearrange(output_i, "b h s d -> b s h d ") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) + elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + + attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, + kv_seqlen=None, + device=q.device) + + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None) + context_layer = rearrange(context_layer, + "b s h d -> s b (h d)").contiguous() + + output, _ = self.proj(context_layer) + return output + + +class Ernie4_5_VisionMLP(nn.Module): + + def __init__( + self, + in_features: int, + hidden_features: int, + act_layer: type[nn.Module] = QuickGELU, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.fc1 = ColumnParallelLinear(in_features, + hidden_features, + quant_config=quant_config, + prefix=f"{prefix}.fc1") + self.act = act_layer() + self.fc2 = RowParallelLinear(hidden_features, + in_features, + quant_config=quant_config, + prefix=f"{prefix}.fc2") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_parallel, _ = self.fc1(x) + x_parallel = self.act(x_parallel) + x, _ = self.fc2(x_parallel) + return x + + +class Ernie4_5_VisionBlock(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float, + act_layer: type[nn.Module] = QuickGELU, + norm_layer: Optional[Callable[[int], nn.Module]] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + + if norm_layer is None: + norm_layer = partial(nn.LayerNorm, eps=1e-6) + self.norm1 = norm_layer(dim) + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + + self.attn = Ernie4_5_VisionAttention(embed_dim=dim, + num_heads=num_heads, + projection_size=dim, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + self.mlp = Ernie4_5_VisionMLP(dim, + mlp_hidden_dim, + act_layer=act_layer, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb: torch.Tensor, + max_seqlen: Optional[int] = None, # Only used for Flash Attention + seqlens: Optional[list[int]] = None, # Only used for xFormers + ) -> torch.Tensor: + + hidden_states = hidden_states + self.attn( + self.norm1(hidden_states), + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + hidden_states = hidden_states + self.mlp(self.norm2(hidden_states)) + return hidden_states + + +class Ernie4_5_VisionPatchEmbed(nn.Module): + + def __init__( + self, + patch_size: int = 14, + in_channels: int = 3, + embed_dim: int = 1280, + prefix="", + ) -> None: + + super().__init__() + self.patch_size = patch_size + self.in_channels = in_channels + self.embed_dim = embed_dim + + self.proj = nn.Linear(in_channels * patch_size * patch_size, + embed_dim, + bias=False) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + target_dtype = self.proj.weight.dtype + hidden_states = hidden_states.to(target_dtype) + 
hidden_states = self.proj(hidden_states) + + return hidden_states + + +class Ernie4_5_VisionRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.inv_freq = 1.0 / theta**( + torch.arange(start=0, end=dim, step=2, dtype=torch.float32) / dim) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange(seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype) + freqs = torch.outer(input=seq, vec2=self.inv_freq) + return freqs + + +class Ernie4_5_VisionTransformer(nn.Module): + + def __init__( + self, + vision_config, + norm_eps: float = 1e-6, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + + super().__init__() + patch_size = vision_config.patch_size + spatial_merge_size = vision_config.spatial_merge_size + in_channels = vision_config.in_channels + hidden_size = vision_config.hidden_size + embed_dim = vision_config.embed_dim + depth = vision_config.depth + num_heads = vision_config.num_heads + mlp_ratio = vision_config.mlp_ratio + + self.spatial_merge_size = spatial_merge_size + self.num_heads = num_heads + self.embed_dim = embed_dim + + self.patch_embed = Ernie4_5_VisionPatchEmbed( + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim, + prefix=f"{prefix}.patch_embed", + ) + + norm_layer = partial(nn.LayerNorm, eps=norm_eps) + head_dim = embed_dim // num_heads + self.rotary_pos_emb = Ernie4_5_VisionRotaryEmbedding(head_dim // 2) + + self.blocks = nn.ModuleList([ + Ernie4_5_VisionBlock(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(depth) + ]) + + assert (hidden_size == embed_dim + ), "vit's config.hidden must be equal to config.embed_dim" + self.ln = nn.LayerNorm(hidden_size, eps=1e-6) + + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + pos_ids = [] + for t, h, w in grid_thw: + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids.append( + torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) + pos_ids = torch.cat(pos_ids, dim=0) + max_grid_size = grid_thw[:, 1:].max() + rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1) + return rotary_pos_emb + + def compute_attn_mask_seqlen( + self, cu_seqlens: torch.Tensor + ) -> tuple[Optional[int], Optional[list[int]]]: + max_seqlen, seqlens = None, None + if self.attn_backend == _Backend.FLASH_ATTN: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + elif self.attn_backend == _Backend.XFORMERS: + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + return max_seqlen, seqlens + + def forward(self, + hidden_states: torch.Tensor, + grid_thw: torch.Tensor, + num_pad=0) -> torch.Tensor: + + 
hidden_states = self.patch_embed(hidden_states) + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + rotary_pos_emb = rotary_pos_emb.to(hidden_states.device) + + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], + grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32) + + if num_pad > 0: + cu_seqlens = F.pad(cu_seqlens, (1, 1), value=0) + cu_seqlens[-1] = cu_seqlens[-2] + num_pad + else: + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + # add batch size + if hidden_states.ndim == 2: + hidden_states = hidden_states.unsqueeze(dim=1) + + # pre-compute seqlens for attn mask to reduce cuMemcpy operations + max_seqlen, seqlens = self.compute_attn_mask_seqlen(cu_seqlens) + + for i, blk in enumerate(self.blocks): + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + max_seqlen=max_seqlen, + seqlens=seqlens, + ) + + final_output = self.ln(hidden_states) + + if final_output.ndim == 3: + final_output = final_output.squeeze(dim=1) + + return final_output + + def load_weights(self, weights) -> set[str]: + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +# === Vision Inputs === # + + +class Ernie4_5_VLImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + pixel_values: torch.Tensor + """Shape: + `(num_patches, num_channels * patch_size * patch_size)` + """ + + grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Ernie4_5_VLImageInputs = Ernie4_5_VLImagePixelInputs + + +class Ernie4_5_VLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] + pixel_values_videos: torch.Tensor + """Shape: + `(num_patches, + num_channels * temporal_patch_size * patch_size * patch_size)` + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + + This should be in `(grid_t, grid_h, grid_w)` format. 
+ """ + + +Ernie4_5_VLVideoInputs = Ernie4_5_VLImagePixelInputs + +# === Vision Processor === # + + +def round_by_factor(number: Union[int, float], factor: int) -> int: + return round(number / factor) * factor + + +def ceil_by_factor(number: Union[int, float], factor: int) -> int: + return math.ceil(number / factor) * factor + + +def floor_by_factor(number: Union[int, float], factor: int) -> int: + return math.floor(number / factor) * factor + + +def smart_resize( + height: int, + width: int, + factor: int = 28, + min_pixels: int = 4 * 28 * 28, + max_pixels: int = 16384 * 28 * 28, +): + MAX_RATIO = 200 + if max(height, width) / min(height, width) > MAX_RATIO: + if height > width: + new_width = max(factor, round_by_factor(width, factor)) + new_height = floor_by_factor(new_width * MAX_RATIO, factor) + else: + new_height = max(factor, round_by_factor(height, factor)) + new_width = floor_by_factor(new_height * MAX_RATIO, factor) + + height = new_height + width = new_width + + h_bar = max(factor, round_by_factor(height, factor)) + w_bar = max(factor, round_by_factor(width, factor)) + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = floor_by_factor(height / beta, factor) + w_bar = floor_by_factor(width / beta, factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = ceil_by_factor(height * beta, factor) + w_bar = ceil_by_factor(width * beta, factor) + + if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels: + raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}") + + return h_bar, w_bar + + +class VariableResolutionResamplerModel(nn.Module): + + def __init__(self, + in_dim, + out_dim, + spatial_conv_size, + temporal_conv_size, + config, + prefix: str = "") -> None: + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.config = config + self.spatial_conv_size = spatial_conv_size + self.temporal_conv_size = temporal_conv_size + self.use_temporal_conv = config.use_temporal_conv + + # compress 2d conv(picture) to 1d + self.spatial_dim = (self.in_dim * self.spatial_conv_size * + self.spatial_conv_size) + # compress 3d conv(video) to 1d + self.temporal_dim = (self.in_dim * self.spatial_conv_size * + self.spatial_conv_size * self.temporal_conv_size) + + self.spatial_linear1 = ColumnParallelLinear( + self.spatial_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.spatial_linear1", + ) + + self.spatial_gelu = nn.GELU() + + self.spatial_linear2 = ColumnParallelLinear( + self.spatial_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.spatial_linear2", + ) + + self.spatial_norm = nn.LayerNorm(self.spatial_dim, eps=1e-6) + + if self.use_temporal_conv: + self.temporal_linear1 = ColumnParallelLinear( + self.temporal_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.temporal_linear1", + ) + + self.temporal_gelu = nn.GELU() + + self.temporal_linear2 = ColumnParallelLinear( + self.spatial_dim, + self.spatial_dim, + bias=True, + gather_output=True, + quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.temporal_linear2", + ) + + self.temporal_norm = nn.LayerNorm(self.spatial_dim, eps=1e-6) + + self.mlp = ColumnParallelLinear( + self.spatial_dim, + self.out_dim, + bias=True, + gather_output=True, + 
quant_config=getattr(config, 'quant_config', None), + prefix=f"{prefix}.mlp", + ) + + self.after_norm = RMSNorm(hidden_size=out_dim, + eps=getattr(config, 'rms_norm_eps', 1e-6)) + + def spatial_conv_reshape(self, x, spatial_conv_size): + S, C = x.shape + x = x.reshape([-1, C * (spatial_conv_size**2)]) + return x + + def forward(self, x, grid_thw): + + def fwd_spatial(x): + x = self.spatial_conv_reshape(x, self.spatial_conv_size) + + x, _ = self.spatial_linear1(x) + x = self.spatial_gelu(x) + x, _ = self.spatial_linear2(x) + x = self.spatial_norm(x) + + return x + + def fwd_placeholder(x, grid_thw, to_tensor=False): + + grid_thw_cpu = grid_thw.cpu().numpy() + grid_t, grid_hw = grid_thw_cpu[:, 0], grid_thw_cpu[:, 1:] + grid_hw_after_conv = grid_hw.prod(-1) // (self.spatial_conv_size** + 2) + + tokens_per_img_or_vid = grid_thw_cpu.prod(-1) // ( + self.spatial_conv_size**2) + batch_offset = np.empty(tokens_per_img_or_vid.size, + dtype=tokens_per_img_or_vid.dtype) + batch_offset[0] = 0 + batch_offset[1:] = tokens_per_img_or_vid.cumsum()[:-1] + + slice_offsets = [] + for temporoal_size, spatial_size, b_offset in zip( + grid_t, grid_hw_after_conv, batch_offset): + for temp_offset in range(0, temporoal_size, 2): + slice_offsets.append( + np.arange( + b_offset + (temp_offset) * spatial_size, + b_offset + (temp_offset + 1) * spatial_size, + )) + slice_offsets = torch.tensor(np.concatenate(slice_offsets, + axis=-1)).to(x.device) + + slice_offsets2 = [] + for temporoal_size, spatial_size, b_offset in zip( + grid_t, grid_hw_after_conv, batch_offset): + for temp_offset in range(1 if temporoal_size > 1 else 0, + temporoal_size, 2): + slice_offsets2.append( + np.arange( + b_offset + (temp_offset) * spatial_size, + b_offset + (temp_offset + 1) * spatial_size, + )) + slice_offsets2 = torch.tensor( + np.concatenate(slice_offsets2, axis=-1)).to(x.device) + + x_timestep_1 = torch.index_select(x, dim=0, index=slice_offsets) + x_timestep_2 = torch.index_select(x, dim=0, index=slice_offsets2) + x = torch.concat([x_timestep_1, x_timestep_2], dim=-1) + return x + + def fwd_temporal(x): + x, _ = self.temporal_linear1(x) + x = self.temporal_gelu(x) + x, _ = self.temporal_linear2(x) + x = self.temporal_norm(x) + return x + + def fwd_mlp(x): + x, _ = self.mlp(x) + x = self.after_norm(x) + return x + + x = fwd_spatial(x) + if self.use_temporal_conv: + x = fwd_placeholder(x, grid_thw) + x = fwd_temporal(x) + x = fwd_mlp(x) + return x + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Ernie4_5_VLProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.model_config.hf_config + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(use_fast=True, **kwargs) + + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def _get_vision_info( + self, + *, + image_width: int, + image_height: int, + num_frames: int = 1, + do_resize: bool = True, + image_processor: Optional[Any], + ) -> tuple[ImageSize, 
int]: + if image_processor is None: + image_processor = self.get_image_processor() + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + + patch_size = vision_config.patch_size + spatial_conv_size = hf_config.spatial_conv_size + temporal_conv_size = hf_config.temporal_conv_size + + if do_resize: + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * spatial_conv_size, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + ) + preprocessed_size = ImageSize(width=resized_width, + height=resized_height) + else: + preprocessed_size = ImageSize(width=image_width, + height=image_height) + + grid_t = max(num_frames // temporal_conv_size, 1) + grid_h = preprocessed_size.height // patch_size + grid_w = preprocessed_size.width // patch_size + + num_patches = grid_t * grid_h * grid_w + num_vision_tokens = num_patches // (spatial_conv_size**2) + + return preprocessed_size, num_vision_tokens + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + image_processor: Optional[Any], + ) -> int: + _, num_image_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + image_processor=image_processor, + ) + return num_image_tokens + + def get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + image_processor: Optional[Any], + ) -> int: + _, num_video_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + num_frames=num_frames, + image_processor=image_processor, + ) + return num_video_tokens + + def get_image_size_with_most_features(self) -> ImageSize: + max_image_size, _ = self._get_vision_info( + image_width=9999999, + image_height=9999999, + image_processor=None, + ) + return max_image_size + + def get_max_image_tokens(self) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + num_image_tokens = self.get_num_image_tokens( + image_width=target_width, + image_height=target_height, + image_processor=None, + ) + return num_image_tokens + + def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + num_frames = 0 + + while True: + next_num_frames = num_frames + 1 + next_max_tokens = self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + image_processor=None, + ) + + if next_max_tokens > max_tokens: + break + + num_frames = next_num_frames + + # If the number of frames is odd, discard one frame. 
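# Editor's note -- illustrative sketch, not part of this patch. The loop above
# grows the frame count until the projected video token count would exceed the
# remaining token budget, and the check below then rounds an odd result down,
# since frames are consumed in temporal pairs (temporal_conv_size). A standalone
# model of the same search with a made-up per-pair token cost:

def largest_even_frame_count(budget: int, tokens_per_frame_pair: int = 256) -> int:
    frames = 0
    # keep adding frames while one more frame would still fit in the budget
    while (frames + 2) // 2 * tokens_per_frame_pair <= budget:
        frames += 1
    return frames - (frames % 2)  # discard one frame if the count is odd

assert largest_even_frame_count(1024) == 8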
+ if num_frames % 2 != 0: + num_frames -= 1 + + return num_frames + + def get_num_frames_with_most_features( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> int: + max_images = mm_counts.get("image", 0) + max_videos = mm_counts.get("video", 0) + + max_image_tokens = self.get_max_image_tokens() * max_images + max_total_frames = self._get_max_video_frames(seq_len - + max_image_tokens) + max_frames_per_video = min(max_total_frames // max(max_videos, 1), + _MAX_FRAMES_PER_VIDEO) + + return max(max_frames_per_video, 2) + + def get_max_video_tokens( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> int: + target_width, target_height = self.get_image_size_with_most_features() + + return self.get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self.get_num_frames_with_most_features( + seq_len, mm_counts), + image_processor=None, + ) + + +class Ernie4_5VLMultiModalProcessor( + BaseMultiModalProcessor[Ernie4_5_VLProcessingInfo]): + + def _pixel_values_norm( + self, + pixel_values: torch.Tensor, + mm_kwargs: object, + ) -> torch.Tensor: + hf_config = self.info.get_hf_config() + vision_config = hf_config.vision_config + image_processor = self.info.get_image_processor(**mm_kwargs) + image_mean_tensor = torch.tensor(image_processor.image_mean, + dtype=torch.float32).reshape( + [1, 3, 1, 1]) + image_std_tensor = torch.tensor(image_processor.image_std, + dtype=torch.float32).reshape( + [1, 3, 1, 1]) + rescale_factor = torch.tensor(image_processor.rescale_factor, + dtype=torch.float32) + patch_size_squared = vision_config.patch_size**2 + + image_mean_tensor = (image_mean_tensor.squeeze( + [-2, -1]).repeat_interleave(patch_size_squared, -1)) + image_std_tensor = (image_std_tensor.squeeze( + [-2, -1]).repeat_interleave(patch_size_squared, -1)) + + if not image_mean_tensor.is_contiguous(): + image_mean_tensor = image_mean_tensor.contiguous() + if not image_std_tensor.is_contiguous(): + image_std_tensor = image_std_tensor.contiguous() + + pixel_values = (rescale_factor * pixel_values.to(torch.float32) - + image_mean_tensor) / image_std_tensor + pixel_values = pixel_values.to(hf_config.torch_dtype) + return pixel_values + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + # when the prompt is not empty but the multimodal data is empty, + # directly invoke the tokenizer. + if "images" not in mm_data and "videos" not in mm_data and prompt != "": + tokenizer = self.info.get_tokenizer() + prompt_ids = tokenizer.encode(prompt) + tokenizer_output = BatchFeature(dict(input_ids=[prompt_ids]), + tensor_type="pt") + return tokenizer_output + + if "images" not in mm_data: + mm_data["images"] = [] + if "videos" not in mm_data: + mm_data["videos"] = [] + processor_output = self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=[prompt], + images=mm_data["images"], + videos=mm_data["videos"]), + dict(**mm_kwargs, **tok_kwargs), + ) + + # Divide the processor_output into two modalities: image and video. 
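# Editor's note -- illustrative sketch, not part of this patch. The HF processor
# returns a single flat `images` tensor plus one `grid_thw` covering all items;
# rows with grid_t > 1 are treated as video, the rest as images, and the flat
# pixel tensor is split at the total number of image patches (image patches are
# assumed to come first, matching the slicing below). A small standalone
# illustration with dummy shapes:

import torch

grid_thw = torch.tensor([[1, 4, 4], [2, 4, 4]])    # one image, one 2-frame video
pixel_values = torch.randn(int(grid_thw.prod(dim=1).sum()), 588)  # dummy feature dim

video_mask = grid_thw[:, 0] > 1                    # grid_t > 1  -> video modality
image_grid_thw, video_grid_thw = grid_thw[~video_mask], grid_thw[video_mask]
num_image_patches = int(image_grid_thw.prod(dim=1).sum())
pixel_values_images = pixel_values[:num_image_patches]       # (16, 588)
pixel_values_videos = pixel_values[num_image_patches:]       # (32, 588)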
+ if processor_output is not None: + pixel_values = processor_output['images'] + if pixel_values is not None: + processor_output['images'] = self._pixel_values_norm( + pixel_values, mm_kwargs) + for key in list(processor_output.keys()): + if processor_output[key] is None: + del processor_output[key] + continue + if key == "grid_thw": + grid_thw = processor_output['grid_thw'] + pixel_values_all = processor_output['images'] + # Identify elements where the first + # dimension is greater than 1 and + # treat them as the video modality + mask = grid_thw[:, 0] > 1 + processor_output["video_grid_thw"] = grid_thw[mask] + processor_output["image_grid_thw"] = grid_thw[~mask] + image_patch_num = processor_output["image_grid_thw"].prod( + dim=1).sum() + processor_output[ + 'pixel_values'] = pixel_values_all[:image_patch_num] + processor_output['pixel_values_videos'] = pixel_values_all[ + image_patch_num:] + del processor_output['images'] + + return processor_output + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + before_placeholder = { + "image": "<|image@placeholder|>", + "video": "<|video@placeholder|>" + } + + after_placeholder = { + # image and video have same placeholder + "image": "<|IMAGE_PLACEHOLDER|>", + "video": "<|IMAGE_PLACEHOLDER|>" + } + + merge_length = hf_processor.spatial_conv_size**2 + + def get_replacement_ernie45vl(item_idx: int, modality: str): + out_item = out_mm_kwargs[modality][item_idx] + grid_thw = out_item[f"{modality}_grid_thw"].data + assert isinstance(grid_thw, torch.Tensor) + if modality == "video": + num_tokens = int(grid_thw.prod( + )) // hf_processor.temporal_conv_size // merge_length + else: + num_tokens = int(grid_thw.prod()) // merge_length + return after_placeholder[modality] * num_tokens + + return [ + PromptReplacement( + modality=modality, + target=before_placeholder[modality], + replacement=partial(get_replacement_ernie45vl, + modality=modality), + ) for modality in ("image", "video") + ] + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) + image_grid_sizes = image_grid_thw.prod(-1) + + video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) + video_grid_sizes = video_grid_thw.prod(-1) + + return dict( + pixel_values=MultiModalFieldConfig.flat_from_sizes( + "image", image_grid_sizes), + image_grid_thw=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.flat_from_sizes( + "video", video_grid_sizes), + video_grid_thw=MultiModalFieldConfig.batched("video"), + ) + + +class Ernie4_5_VLDummyInputsBuilder( + BaseDummyInputsBuilder[Ernie4_5_VLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + prompt = "" + for i in range(num_images): + prompt += (f"Picture {i+1}:" + "<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>") + + for i in range(num_videos): + prompt += (f"Video {i+1}:" + "<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>") + return prompt + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + num_videos = 
mm_counts.get("video", 0) + + target_width, target_height = \ + self.info.get_image_size_with_most_features() + target_num_frames = \ + self.info.get_num_frames_with_most_features(seq_len, mm_counts) + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos(width=target_width, + height=target_height, + num_frames=target_num_frames, + num_videos=num_videos) + } + + +@MULTIMODAL_REGISTRY.register_processor( + Ernie4_5VLMultiModalProcessor, + info=Ernie4_5_VLProcessingInfo, + dummy_inputs=Ernie4_5_VLDummyInputsBuilder) +class Ernie4_5_VLMoeForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA, SupportsPP): + + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # To ensure correct weight loading and mapping. + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + # model.resampler_model.-> language_model.model.resampler_model. + # language_model.model.resampler_model. -> resampler_model. + "language_model.model.resampler_model.": "resampler_model.", + }, + # resampler_weight_mappings + orig_to_new_substr={ + "spatial_linear.0.": "spatial_linear1.", + "spatial_linear.2.": "spatial_linear2.", + "spatial_linear.3.": "spatial_norm.", + "temporal_linear.0.": "temporal_linear1.", + "temporal_linear.2.": "temporal_linear2.", + "temporal_linear.3.": "temporal_norm.", + }) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return "<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>" + if modality.startswith("video"): + return "<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>" + + raise ValueError("Only image or video modality is supported") + + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.vision_model = Ernie4_5_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "vision_model"), + ) + + self.language_model = Ernie4_5_VLMoeForCausalLM( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model"), + ) + + self.resampler_model = VariableResolutionResamplerModel( + self.config.pixel_hidden_size, + self.config.hidden_size, + self.config.spatial_conv_size, + self.config.temporal_conv_size, + config=self.config, + prefix=maybe_prefix(prefix, "resampler_model")) + + self.visual_token_mask = None + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + """compute logits""" + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def _vision_forward( + self, + pixel_values: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + if grid_thw is not None: + grid_thw = grid_thw[grid_thw > 0] + if grid_thw.numel() % 3 != 0: + raise ValueError( + f"grid_thw has {grid_thw.numel()} elements after filtering," + "which is not divisible by 3.") + grid_thw = 
grid_thw.reshape(-1, 3) + # example: [[1,64,64],[2,80,80]] -> [[1,64,64],[1,80,80],[1,80,80]] + grid_thw = F.pad( + torch.repeat_interleave(grid_thw[:, 1:], grid_thw[:, 0], 0), + [1, 0, 0, 0], + value=1, + ) + image_features = self.vision_model(pixel_values, grid_thw) + return image_features + + def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None: + if getattr(self.config, "im_patch_id", None) is not None: + self.visual_token_mask = ( + input_ids == self.config.im_patch_id).reshape(-1, 1) + else: + self.visual_token_mask = None + + def get_language_model(self) -> torch.nn.Module: + return self.language_model + + def _validate_and_reshape_mm_tensor(self, mm_input: object, + name: str) -> torch.Tensor: + if not isinstance(mm_input, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of {name}. " + f"Got type: {type(mm_input)}") + if isinstance(mm_input, torch.Tensor): + if mm_input.ndim == 2: + return mm_input + if mm_input.ndim != 3: + raise ValueError(f"{name} should be 2D or batched 3D tensor. " + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") + return torch.concat(list(mm_input)) + else: + return torch.concat(mm_input) + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[Ernie4_5_VLImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_grid_thw = kwargs.pop("image_grid_thw", None) + + if pixel_values is None: + return None + + if pixel_values is not None: + pixel_values = self._validate_and_reshape_mm_tensor( + pixel_values, "image pixel values") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image pixel values. " + f"Got type: {type(pixel_values)}") + + return Ernie4_5_VLImagePixelInputs(type="pixel_values", + pixel_values=pixel_values, + image_grid_thw=image_grid_thw) + + def _parse_and_validate_video_input( + self, **kwargs: object) -> Optional[Ernie4_5_VLVideoInputs]: + pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_grid_thw = kwargs.pop("video_grid_thw", None) + + if pixel_values_videos is None: + return None + + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Ernie4_5_VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + def _process_image_input( + self, + image_input: Ernie4_5_VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert grid_thw.ndim == 2 + + pixel_values = image_input["pixel_values"].type( + self.vision_model.dtype) + image_features = self._vision_forward(pixel_values=pixel_values, + grid_thw=grid_thw) + image_embeds = self.resampler_model(image_features, grid_thw) + + merge_size = self.vision_model.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, + video_input: Ernie4_5_VLVideoInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 + + pixel_values_videos = video_input["pixel_values_videos"].type( + self.vision_model.dtype) + video_features = self._vision_forward(pixel_values=pixel_values_videos, + grid_thw=grid_thw) + video_embeds = 
self.resampler_model(video_features, grid_thw) + + merge_size = self.vision_model.spatial_merge_size + sizes = (grid_thw.prod(-1) // + self.config.temporal_conv_size) // merge_size // merge_size + + return video_embeds.split(sizes.tolist()) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + + return modalities + + def get_multimodal_embeddings( + self, **kwargs: object) -> Optional[MultiModalEmbeddings]: + + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + # The result multimodal_embeddings is tuple of tensors, with each + # tensor correspoending to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings + + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[MultiModalEmbeddings] = None, + ) -> torch.Tensor: + + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + if multimodal_embeddings is None: + return inputs_embeds + + self._set_visual_token_mask(input_ids) + inputs_embeds = merge_multimodal_embeddings(input_ids, inputs_embeds, + multimodal_embeddings, + [self.config.im_patch_id]) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ): + + forward_kwargs = { + "input_ids": input_ids, + "positions": positions, + "intermediate_tensors": intermediate_tensors, + "inputs_embeds": inputs_embeds, + } + + if self.visual_token_mask is not None: + + if self.visual_token_mask.shape[0] != inputs_embeds.shape[0]: + padding_len = inputs_embeds.shape[ + 0] - self.visual_token_mask.shape[0] + # right pad False + pad = torch.zeros( + (padding_len, self.visual_token_mask.shape[1]), + dtype=self.visual_token_mask.dtype, + device=self.visual_token_mask.device) + self.visual_token_mask = torch.cat( + [self.visual_token_mask, pad], dim=0) + + forward_kwargs.update( + {"visual_token_mask": self.visual_token_mask}) + self.visual_token_mask = None + + hidden_states = self.language_model.model( + **forward_kwargs, + **kwargs, + ) + + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/ernie45_vl_moe.py b/vllm/model_executor/models/ernie45_vl_moe.py new file mode 100644 index 
000000000000..f56c09843515 --- /dev/null +++ b/vllm/model_executor/models/ernie45_vl_moe.py @@ -0,0 +1,723 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The Baidu team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Ernie VL model compatible with HuggingFace weights.""" +from collections.abc import Iterable +from typing import Any, Optional, Union + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention import Attention +# from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding.ernie45_vl_rope import ( + Ernie4_5_VLRotaryEmbedding) +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .ernie45_moe import Ernie4_5_MoeMLP +from .interfaces import SupportsPP +from .utils import (PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + +logger = init_logger(__name__) + + +class Ernie4_5_VLMoeMLP(Ernie4_5_MoeMLP): + pass + + +class Ernie4_5_VLMoeAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: Optional[int] = None, + rope_theta: float = 500000, + rope_scaling: Optional[dict[str, Any]] = None, + freq_allocation: int = 20, + max_position_embeddings: int = 131072, + rms_norm_eps: float = 1e-05, + qkv_bias: bool = False, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + layer_idx = extract_layer_index(prefix) if len(prefix) > 0 else 0 + self.layer_idx = layer_idx + self.hidden_size = hidden_size + tp_size =
get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear(hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") + + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") + + t_rope = freq_allocation + h_rope = (self.head_dim // 2 - freq_allocation) // 2 + w_rope = (self.head_dim // 2 - freq_allocation) // 2 + + self.rotary_emb = Ernie4_5_VLRotaryEmbedding( + head_size=self.head_dim, + rotary_dim=self.head_dim, + max_position_embeddings=max_position_embeddings, + base=rope_theta, + is_neox_style=False, + dtype=torch.get_default_dtype(), + mrope_section=[h_rope, w_rope, t_rope]) + + self.attn = Attention(self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn") + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + + qkv, _ = self.qkv_proj(hidden_states) + + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + q, k = self.rotary_emb(positions, q, k) + + # Attention + attn_output = self.attn(q, k, v) + # Output projection + output, _ = self.o_proj(attn_output) + return output + + +class Ernie4_5_VLMoeMoE(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + + layer_idx = extract_layer_index(prefix) + self.layer_idx = layer_idx + self.tp_size = get_tensor_model_parallel_world_size() + self.has_shared_experts = (getattr(config, "moe_num_shared_experts", 0) + > 0) + self.hidden_size = config.hidden_size + + moe_num_experts = config.moe_num_experts + max_moe_num_experts = max(moe_num_experts) + + if self.tp_size > max_moe_num_experts: + raise ValueError( + f"Tensor parallel size {self.tp_size} is greater than " + f"the number of experts {moe_num_experts}.") + + moe_layer_start_index = config.moe_layer_start_index + text_moe_layer_start_index = moe_layer_start_index[0] + vision_moe_layer_start_index = moe_layer_start_index[1] + moe_layer_end_index = config.moe_layer_end_index + moe_layer_end_index = getattr( + config, "moe_layer_end_index", + [config.num_hidden_layers - 1, config.num_hidden_layers - 1]) + text_moe_layer_end_index = moe_layer_end_index[0] + vision_moe_layer_end_index = moe_layer_end_index[1] + + assert config.moe_num_experts[0] == config.moe_num_experts[1] + 
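# moe_statics in the checkpoint provides a single (2, num_experts)
 + # routing-score correction bias: row 0 feeds the text-expert router and
 + # row 1 the vision-expert router (see the FusedMoE blocks below). +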
self.e_score_correction_bias = nn.Parameter( + torch.empty(2, config.moe_num_experts[0])) + + assert text_moe_layer_start_index <= text_moe_layer_end_index + + if layer_idx >= text_moe_layer_start_index and \ + layer_idx <= text_moe_layer_end_index: + self.text_experts_gate = ReplicatedLinear( + config.hidden_size, + config.moe_num_experts[0], + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.text_experts_gate") + + self.text_experts = FusedMoE( + num_experts=config.moe_num_experts[0], + top_k=config.moe_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size[0], + reduce_results=False, + renormalize=True, + quant_config=quant_config, + e_score_correction_bias=self.e_score_correction_bias[0], + prefix=f"{prefix}.text_experts") + else: + self.text_experts = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + use_bias=getattr(config, 'use_bias', False), + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + assert vision_moe_layer_start_index <= vision_moe_layer_end_index + if layer_idx >= vision_moe_layer_start_index and \ + layer_idx <= vision_moe_layer_end_index: + self.vision_experts_gate = ReplicatedLinear( + config.hidden_size, + config.moe_num_experts[1], + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.vision_experts_gate") + + self.vision_experts = FusedMoE( + num_experts=config.moe_num_experts[1], + top_k=config.moe_k, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size[1], + reduce_results=False, + renormalize=True, + quant_config=quant_config, + e_score_correction_bias=self.e_score_correction_bias[1], + prefix=f"{prefix}.vision_experts") + else: + self.vision_experts = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + use_bias=getattr(config, 'use_bias', False), + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + if self.has_shared_experts: + intermediate_size = (config.moe_intermediate_size[0] * + config.moe_num_shared_experts) + self.shared_experts = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.shared_experts", + reduce_results=self.text_experts. 
+ must_reduce_shared_expert_outputs()) + + def forward( + self, + hidden_states: torch.Tensor, + visual_token_mask: torch.Tensor, + **kwargs: object, + ) -> torch.Tensor: + + orig_shape = hidden_states.shape + hidden_dim = hidden_states.shape[-1] + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.has_shared_experts: + shared_output = self.shared_experts(hidden_states) + + if visual_token_mask is not None and visual_token_mask.any(): + # assert visual_token_mask.shape[0] != hidden_states.shape[0] + visual_token_mask = visual_token_mask.repeat( + 1, self.hidden_size).bool() + text_token_mask = ~visual_token_mask + final_hidden_states = torch.zeros_like(hidden_states) + + text_hidden_states = hidden_states[text_token_mask].reshape( + -1, self.hidden_size) + vision_hidden_states = hidden_states[visual_token_mask].reshape( + -1, self.hidden_size) + + text_router_logits, _ = self.text_experts_gate(text_hidden_states) + final_hidden_states[text_token_mask] = self.text_experts( + hidden_states=text_hidden_states, + router_logits=text_router_logits).flatten() + + vision_router_logits, _ = self.vision_experts_gate( + vision_hidden_states) + final_hidden_states[visual_token_mask] = self.vision_experts( + hidden_states=vision_hidden_states, + router_logits=vision_router_logits).flatten() + else: + # text modal input processing directly + text_router_logits, _ = self.text_experts_gate(hidden_states) + + final_hidden_states = self.text_experts( + hidden_states=hidden_states, router_logits=text_router_logits) + + if self.has_shared_experts and \ + shared_output is not None: + final_hidden_states = final_hidden_states + shared_output + + if self.tp_size > 1: + final_hidden_states = ( + self.text_experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states)) + + return final_hidden_states.view(orig_shape) + + +class Ernie4_5_VLMoeDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 500000) + rope_scaling = getattr(config, "rope_scaling", None) + freq_allocation = getattr(config, "freq_allocation", 20) + max_position_embeddings = getattr(config, "max_position_embeddings", + 131072) + + self.self_attn = Ernie4_5_VLMoeAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + head_dim=getattr(config, 'head_dim', None), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + freq_allocation=freq_allocation, + max_position_embeddings=max_position_embeddings, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=getattr(config, 'use_bias', False), + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + layer_idx = extract_layer_index(prefix) + self.layer_idx = layer_idx + + # MoE + moe_layer_start_index = config.moe_layer_start_index + min_moe_layer_start_index = min(moe_layer_start_index) + moe_layer_end_index = getattr( + config, "moe_layer_end_index", + [config.num_hidden_layers - 1, config.num_hidden_layers - 1]) + max_moe_layer_end_index = max(moe_layer_end_index) + assert min_moe_layer_start_index <= max_moe_layer_end_index + moe_num_experts = config.moe_num_experts + max_moe_num_experts = max(moe_num_experts) + moe_layer_interval = getattr(config, "moe_layer_interval", 1) + use_moe = getattr(config, "use_moe", 
max_moe_num_experts > 0) + + if (use_moe and ((layer_idx + 1) % moe_layer_interval == 0) + and layer_idx >= min_moe_layer_start_index + and layer_idx <= max_moe_layer_end_index): + self.mlp = Ernie4_5_VLMoeMoE(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + else: + self.mlp = Ernie4_5_VLMoeMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + use_bias=getattr(config, 'use_bias', False), + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + visual_token_mask: Optional[torch.Tensor], + **kwargs: object, + ) -> torch.Tensor: + + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + + if isinstance(self.mlp, Ernie4_5_VLMoeMoE): + hidden_states = self.mlp(hidden_states, visual_token_mask, + **kwargs) + else: + hidden_states = self.mlp(hidden_states) + + return hidden_states, residual + + +# Since Ernie VL distinguishes between text experts and vision experts, +# enabling torch.compile will cause errors. +# @support_torch_compile( +# dynamic_arg_dims={ +# "input_ids": 0, +# "positions": -1, +# "intermediate_tensors": 0, +# "inputs_embeds": 0, +# "visual_token_mask": 0, +# }) +class Ernie4_5_VLMoeModel(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + + self.im_patch_id = config.im_patch_id + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: Ernie4_5_VLMoeDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + visual_token_mask: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + 
hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, residual, + visual_token_mask, **kwargs) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + + return hidden_states + + +# only used as text backbone for ernie4.5-vl +class Ernie4_5_VLMoeForCausalLM(nn.Module, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = Ernie4_5_VLMoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + else: + self.lm_head = PPMissingLayer() + + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds, **kwargs) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=max(self.config.moe_num_experts)) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if self.config.tie_word_embeddings and name.endswith( + "lm_head.weight"): + loaded_params.add("lm_head.weight") + continue + # MTP will be supported soon. 
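+ # Vision tower (vision_model) and resampler weights belong to the
+ # multimodal wrapper in ernie45_vl.py, so this text-only backbone skips them.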
+ if "mtp" in name or \ + "vision_model" in name or \ + "resampler_model" in name: + continue + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + + if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Distinguish between vision experts and text experts + if "mlp.experts" in name: + moe_offset = int(name.split(".")[-3]) + vision_expert_start_idx = self.config.moe_num_experts[0] + is_text_expert = \ + moe_offset <= vision_expert_start_idx - 1 + if is_text_expert: + name = name.replace(".experts.", ".text_experts.") + else: + name = name.replace( + f".experts.{moe_offset}", + f".vision_experts.{moe_offset-vision_expert_start_idx}" + ) + + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + + if weight_name not in name: + continue + + # Distinguish between vision experts and text experts + moe_offset = int(name.split(".")[-3]) + is_text_expert = \ + moe_offset <= self.config.moe_num_experts[0] - 1 + + name = name.replace(weight_name, param_name) + if is_text_expert: + name = name.replace(".experts.", ".text_experts.") + else: + name = name.replace(".experts.", ".vision_experts.") + + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + param = params_dict[name] + + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id) + break + else: + # Distinguish between vision expert gate + # and text expert gate + if name.endswith("mlp.gate.weight"): + name = name.replace("gate.weight", + "text_experts_gate.weight") + loaded_weight = loaded_weight.T + elif name.endswith("mlp.gate.weight_1"): + name = name.replace("gate.weight_1", + "vision_experts_gate.weight") + loaded_weight = loaded_weight.T + + if "e_score_correction_bias" in name: + name = name.replace(".moe_statics.", ".") + + # Skip loading extra bias for GPTQ models. + if ((name.endswith(".bias") or name.endswith("_bias")) + and name not in params_dict): + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + param = params_dict[name] + + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index ebf78771e40a..c65c58d4a047 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -206,6 +206,7 @@ "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "Cohere2VisionForConditionalGeneration": ("cohere2_vision", "Cohere2VisionForConditionalGeneration"), # noqa: E501 "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), + "Ernie4_5_VLMoeForConditionalGeneration": ("ernie45_vl", "Ernie4_5_VLMoeForConditionalGeneration"), # noqa: E501 "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "Gemma3ForConditionalGeneration": ("gemma3_mm", "Gemma3ForConditionalGeneration"), # noqa: E501 "Gemma3nForConditionalGeneration": ("gemma3n_mm", "Gemma3nForConditionalGeneration"), # noqa: E501 From 32102644213a6367d10ec3a92ae76fb0004f3a52 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 26 Aug 2025 21:58:59 -0700 Subject: [PATCH 002/125] [Frontend] Add --log-error-stack to print stack trace for error response (#22960) Signed-off-by: Chen Zhang --- vllm/entrypoints/openai/api_server.py | 10 ++++++++++ vllm/entrypoints/openai/cli_args.py | 2 ++ vllm/entrypoints/openai/serving_chat.py | 4 +++- vllm/entrypoints/openai/serving_classification.py | 2 ++ vllm/entrypoints/openai/serving_completion.py | 2 ++ vllm/entrypoints/openai/serving_embedding.py | 4 +++- vllm/entrypoints/openai/serving_engine.py | 9 +++++++++ vllm/entrypoints/openai/serving_pooling.py | 4 +++- vllm/entrypoints/openai/serving_responses.py | 2 ++ vllm/entrypoints/openai/serving_score.py | 4 +++- vllm/entrypoints/openai/serving_tokenization.py | 4 +++- vllm/entrypoints/openai/serving_transcription.py | 8 ++++++-- vllm/entrypoints/openai/speech_to_text.py | 4 +++- 13 files changed, 51 insertions(+), 8 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index db02767fdfd7..9a2470649c8d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1749,6 +1749,7 @@ async def init_app_state( enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, + log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks else None state.openai_serving_chat = OpenAIServingChat( engine_client, @@ -1767,6 +1768,7 @@ async def init_app_state( enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, + log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, @@ -1776,6 +1778,7 @@ async def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, + log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks else None state.openai_serving_pooling = OpenAIServingPooling( engine_client, @@ -1784,6 +1787,7 @@ async def 
init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, + log_error_stack=args.log_error_stack, ) if "encode" in supported_tasks else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, @@ -1792,12 +1796,14 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, + log_error_stack=args.log_error_stack, ) if "embed" in supported_tasks else None state.openai_serving_classification = ServingClassification( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if "classify" in supported_tasks else None enable_serving_reranking = ("classify" in supported_tasks and getattr( @@ -1807,6 +1813,7 @@ async def init_app_state( model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if ("embed" in supported_tasks or enable_serving_reranking) else None state.openai_serving_tokenization = OpenAIServingTokenization( @@ -1816,18 +1823,21 @@ async def init_app_state( request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, + log_error_stack=args.log_error_stack, ) state.openai_serving_transcription = OpenAIServingTranscription( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if "transcription" in supported_tasks else None state.openai_serving_translation = OpenAIServingTranslation( engine_client, model_config, state.openai_serving_models, request_logger=request_logger, + log_error_stack=args.log_error_stack, ) if "transcription" in supported_tasks else None state.enable_server_load_tracking = args.enable_server_load_tracking diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 6e4eff5c8024..d0b5d013eb9e 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -180,6 +180,8 @@ class FrontendArgs: h11_max_header_count: int = H11_MAX_HEADER_COUNT_DEFAULT """Maximum number of HTTP headers allowed in a request for h11 parser. Helps mitigate header abuse. 
Default: 256.""" + log_error_stack: bool = envs.VLLM_SERVER_DEV_MODE + """If set to True, log the stack trace of error responses""" @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e0e62778097..1c0ffdfb9189 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -76,13 +76,15 @@ def __init__( enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, enable_log_outputs: bool = False, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - enable_force_include_usage=enable_force_include_usage) + enable_force_include_usage=enable_force_include_usage, + log_error_stack=log_error_stack) self.response_role = response_role self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_classification.py b/vllm/entrypoints/openai/serving_classification.py index 377f7f684717..1d510d0b60a2 100644 --- a/vllm/entrypoints/openai/serving_classification.py +++ b/vllm/entrypoints/openai/serving_classification.py @@ -129,12 +129,14 @@ def __init__( models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], + log_error_stack: bool = False, ) -> None: super().__init__( engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, + log_error_stack=log_error_stack, ) async def create_classify( diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a0ce65409403..b81fd63ece7a 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -59,6 +59,7 @@ def __init__( return_tokens_as_token_ids: bool = False, enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, + log_error_stack: bool = False, ): super().__init__( engine_client=engine_client, @@ -67,6 +68,7 @@ def __init__( request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, enable_force_include_usage=enable_force_include_usage, + log_error_stack=log_error_stack, ) self.enable_prompt_tokens_details = enable_prompt_tokens_details self.default_sampling_params = ( diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 9dcad8e391c6..45c1932f1873 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -593,11 +593,13 @@ def __init__( request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 0f4a7c0186b6..a97935e109ef 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -5,6 +5,7 @@ import json import sys import time +import traceback from collections.abc import AsyncGenerator, 
Iterable, Mapping, Sequence from concurrent.futures import ThreadPoolExecutor from http import HTTPStatus @@ -205,6 +206,7 @@ def __init__( request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, enable_force_include_usage: bool = False, + log_error_stack: bool = False, ): super().__init__() @@ -222,6 +224,7 @@ def __init__( self._async_tokenizer_pool: dict[AnyTokenizer, AsyncMicrobatchTokenizer] = {} + self.log_error_stack = log_error_stack def _get_async_tokenizer(self, tokenizer) -> AsyncMicrobatchTokenizer: """ @@ -412,6 +415,12 @@ def create_error_response( message: str, err_type: str = "BadRequestError", status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: + if self.log_error_stack: + exc_type, _, _ = sys.exc_info() + if exc_type is not None: + traceback.print_exc() + else: + traceback.print_stack() return ErrorResponse(error=ErrorInfo( message=message, type=err_type, code=status_code.value)) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 38745d001ade..e8cb1aed8459 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -58,11 +58,13 @@ def __init__( request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 67eec2d523e3..899cb07b2b37 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -88,6 +88,7 @@ def __init__( enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, enable_log_outputs: bool = False, + log_error_stack: bool = False, ) -> None: super().__init__( engine_client=engine_client, @@ -96,6 +97,7 @@ def __init__( request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, enable_force_include_usage=enable_force_include_usage, + log_error_stack=log_error_stack, ) self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index c246274514db..37838e22a400 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -47,11 +47,13 @@ def __init__( models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], + log_error_stack: bool = False, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) async def _embedding_score( self, diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 58d720474768..2f258255d5f1 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -39,11 +39,13 @@ def __init__( request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, + log_error_stack: bool = False, ) -> None: 
super().__init__(engine_client=engine_client, model_config=model_config, models=models, - request_logger=request_logger) + request_logger=request_logger, + log_error_stack=log_error_stack) self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format diff --git a/vllm/entrypoints/openai/serving_transcription.py b/vllm/entrypoints/openai/serving_transcription.py index 0d6989fe91bf..9ba58d442522 100644 --- a/vllm/entrypoints/openai/serving_transcription.py +++ b/vllm/entrypoints/openai/serving_transcription.py @@ -32,13 +32,15 @@ def __init__( *, request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - task_type="transcribe") + task_type="transcribe", + log_error_stack=log_error_stack) async def create_transcription( self, audio_data: bytes, request: TranscriptionRequest, @@ -88,13 +90,15 @@ def __init__( *, request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, + log_error_stack: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids, - task_type="translate") + task_type="translate", + log_error_stack=log_error_stack) async def create_translation( self, audio_data: bytes, request: TranslationRequest, diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index de2619a78f8e..1cbd7dba393f 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -53,12 +53,14 @@ def __init__( request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, task_type: Literal["transcribe", "translate"] = "transcribe", + log_error_stack: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, models=models, request_logger=request_logger, - return_tokens_as_token_ids=return_tokens_as_token_ids) + return_tokens_as_token_ids=return_tokens_as_token_ids, + log_error_stack=log_error_stack) self.default_sampling_params = ( self.model_config.get_diff_sampling_param()) From 142ac0803045b3a3edcd7aa58fe079872903a30c Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 26 Aug 2025 21:59:14 -0700 Subject: [PATCH 003/125] [Frontend] Optimize beam search performance by limiting concurrency (#23599) Signed-off-by: Chen Zhang --- benchmarks/benchmark_throughput.py | 1 - tests/conftest.py | 8 +- tests/samplers/test_beam_search.py | 53 ++++++++++ vllm/entrypoints/llm.py | 152 ++++++++++++++++------------- 4 files changed, 143 insertions(+), 71 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c7f290e1eb88..6b24b8c8f3c6 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -96,7 +96,6 @@ def run_vllm( end = time.perf_counter() else: assert lora_requests is None, "BeamSearch API does not support LoRA" - prompts = [request.prompt for request in requests] # output_len should be the same for all requests. 
output_len = requests[0].expected_output_len for request in requests: diff --git a/tests/conftest.py b/tests/conftest.py index 2bf88abb0f6c..f8bfdfc8e625 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1022,15 +1022,17 @@ def generate_beam_search( images: Optional[PromptImageInput] = None, videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, + concurrency_limit: Optional[int] = None, ) -> list[tuple[list[list[int]], list[str]]]: inputs = self.get_inputs(prompts, images=images, videos=videos, audios=audios) - outputs = self.llm.beam_search( - inputs, - BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) + outputs = self.llm.beam_search(inputs, + BeamSearchParams(beam_width=beam_width, + max_tokens=max_tokens), + concurrency_limit=concurrency_limit) returned_outputs = [] for output in outputs: token_ids = [x.tokens for x in output.sequences] diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index bdf48c7687b2..cc9a88a255f9 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -67,6 +67,59 @@ def test_beam_search_single_input( f"vLLM: {vllm_output_ids}") +@pytest.mark.skip_v1 # FIXME: This fails on V1 right now. +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", MAX_TOKENS) +@pytest.mark.parametrize("beam_width", BEAM_WIDTHS) +def test_beam_search_with_concurrency_limit( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + beam_width: int, +) -> None: + # example_prompts[1]&[3]&[7] fails due to unknown reason even without + # concurency limit. skip them for now. + example_prompts = (example_prompts[:8]) + concurrency_limit = 2 + assert len(example_prompts) > concurrency_limit + with vllm_runner(model, dtype=dtype) as vllm_model: + outputs_with_limit = vllm_model.generate_beam_search( + example_prompts, + beam_width, + max_tokens, + concurrency_limit=concurrency_limit) + outputs_without_limit = [] + + for i in range(0, len(example_prompts), concurrency_limit): + outputs_without_limit.extend( + vllm_model.generate_beam_search( + example_prompts[i:i + concurrency_limit], beam_width, + max_tokens)) + + correct = True + for i in range(len(example_prompts)): + output_ids_with_limit, output_texts_with_limit = outputs_with_limit[i] + output_ids_without_limit, output_texts_without_limit = ( + outputs_without_limit[i]) + for j, (text_with_limit, text_without_limit) in enumerate( + zip(output_texts_with_limit, output_texts_without_limit)): + print(f">>>{j}-th with limit output:") + print(text_with_limit) + print(f">>>{j}-th without limit output:") + print(text_without_limit) + assert len(output_ids_with_limit) == len(output_ids_without_limit) + for j in range(len(output_ids_with_limit)): + if output_ids_with_limit[j] != output_ids_without_limit[j]: + print(f"Test{i} output{j}:\n+limit: {output_ids_with_limit}\n" + f"-limit: {output_ids_without_limit}") + correct = False + assert correct + + @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", MAX_TOKENS) @pytest.mark.parametrize("beam_width", MM_BEAM_WIDTHS) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 8816ff56d684..72b6123670b7 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -523,6 +523,7 @@ def beam_search( params: BeamSearchParams, lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None, use_tqdm: bool = False, + 
concurrency_limit: Optional[int] = None, ) -> list[BeamSearchOutput]: """ Generate sequences using beam search. @@ -533,6 +534,8 @@ def beam_search( params: The beam search parameters. lora_request: LoRA request to use for generation, if any. use_tqdm: Whether to use tqdm to display the progress bar. + concurrency_limit: The maximum number of concurrent requests. + If None, the number of concurrent requests is unlimited. """ # TODO: how does beam search work together with length penalty, # frequency, penalty, and stopping criteria, etc.? @@ -551,6 +554,15 @@ def beam_search( length_penalty, ) + if use_tqdm and concurrency_limit is not None: + logger.warning( + "Progress bar is not supported when using concurrency_limit. " + "Disabling progress bar.") + use_tqdm = False + + if concurrency_limit is None: + concurrency_limit = len(prompts) + def create_tokens_prompt_from_beam( beam: BeamSearchSequence) -> TokensPrompt: token_prompt_kwargs: TokensPrompt = { @@ -595,73 +607,79 @@ def create_tokens_prompt_from_beam( **mm_kwargs, ), ) - token_iter = range(max_tokens) - if use_tqdm: - token_iter = tqdm(token_iter, - desc="Beam search", - unit="token", - unit_scale=False) - logger.warning( - "The progress bar shows the upper bound on token steps and " - "may finish early due to stopping conditions. It does not " - "reflect instance-level progress.") - - for _ in token_iter: - all_beams: list[BeamSearchSequence] = list( - sum((instance.beams for instance in instances), [])) - pos = [0] + list( - itertools.accumulate( - len(instance.beams) for instance in instances)) - instance_start_and_end: list[tuple[int, int]] = list( - zip(pos[:-1], pos[1:])) - - if len(all_beams) == 0: - break - - # create the corresponding batch entries for prompt & optional lora - prompts_batch, lora_req_batch = zip( - *[(create_tokens_prompt_from_beam(beam), beam.lora_request) - for beam in all_beams]) - - # only runs for one step - # we don't need to use tqdm here - output = self.generate(prompts_batch, - sampling_params=beam_search_params, - use_tqdm=False, - lora_request=lora_req_batch) - - for (start, end), instance in zip(instance_start_and_end, - instances): - instance_new_beams = [] - for i in range(start, end): - current_beam = all_beams[i] - result = output[i] - - if result.outputs[0].logprobs is not None: - # if `result.outputs[0].logprobs` is None, it means - # the sequence is completed because of the max-model-len - # or abortion. we don't need to add it to the new beams. - logprobs = result.outputs[0].logprobs[0] - for token_id, logprob_obj in logprobs.items(): - new_beam = BeamSearchSequence( - tokens=current_beam.tokens + [token_id], - logprobs=current_beam.logprobs + [logprobs], - lora_request=current_beam.lora_request, - cum_logprob=current_beam.cum_logprob + - logprob_obj.logprob, - multi_modal_data=current_beam.multi_modal_data, - mm_processor_kwargs=current_beam. 
- mm_processor_kwargs) - - if token_id == tokenizer.eos_token_id and \ - not ignore_eos: - instance.completed.append(new_beam) - else: - instance_new_beams.append(new_beam) - sorted_beams = sorted(instance_new_beams, - key=sort_beams_key, - reverse=True) - instance.beams = sorted_beams[:beam_width] + for prompt_start in range(0, len(prompts), concurrency_limit): + instances_batch = instances[prompt_start:prompt_start + + concurrency_limit] + + token_iter = range(max_tokens) + if use_tqdm: + token_iter = tqdm(token_iter, + desc="Beam search", + unit="token", + unit_scale=False) + logger.warning( + "The progress bar shows the upper bound on token steps and " + "may finish early due to stopping conditions. It does not " + "reflect instance-level progress.") + for _ in token_iter: + all_beams: list[BeamSearchSequence] = list( + sum((instance.beams for instance in instances_batch), [])) + pos = [0] + list( + itertools.accumulate( + len(instance.beams) for instance in instances_batch)) + instance_start_and_end: list[tuple[int, int]] = list( + zip(pos[:-1], pos[1:])) + + if len(all_beams) == 0: + break + + # create corresponding batch entries for prompt & optional lora + prompts_batch, lora_req_batch = zip( + *[(create_tokens_prompt_from_beam(beam), beam.lora_request) + for beam in all_beams]) + + # only runs for one step + # we don't need to use tqdm here + output = self.generate(prompts_batch, + sampling_params=beam_search_params, + use_tqdm=False, + lora_request=lora_req_batch) + + for (start, end), instance in zip(instance_start_and_end, + instances_batch): + instance_new_beams = [] + for i in range(start, end): + current_beam = all_beams[i] + result = output[i] + + if result.outputs[0].logprobs is not None: + # if `result.outputs[0].logprobs` is None, it means + # the sequence is completed because of the + # max-model-len or abortion. we don't need to add + # it to the new beams. + logprobs = result.outputs[0].logprobs[0] + for token_id, logprob_obj in logprobs.items(): + new_beam = BeamSearchSequence( + tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + + [logprobs], + lora_request=current_beam.lora_request, + cum_logprob=current_beam.cum_logprob + + logprob_obj.logprob, + multi_modal_data=current_beam. + multi_modal_data, + mm_processor_kwargs=current_beam. 
+ mm_processor_kwargs) + + if token_id == tokenizer.eos_token_id and \ + not ignore_eos: + instance.completed.append(new_beam) + else: + instance_new_beams.append(new_beam) + sorted_beams = sorted(instance_new_beams, + key=sort_beams_key, + reverse=True) + instance.beams = sorted_beams[:beam_width] outputs = [] for instance in instances: From d272415e57c95da63c798c22c7d87cc5c0cda21f Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 27 Aug 2025 01:00:21 -0400 Subject: [PATCH 004/125] [Quantization] Expand compressed-tensors MoE matching logic to support NFP4 + FP8 MoEs (#22674) Signed-off-by: Dipika Sikka Signed-off-by: Dipika --- .../compressed_tensors/compressed_tensors.py | 13 +++---- .../compressed_tensors_moe.py | 36 +++++++++++++++++-- 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ce74375aab42..245cf122ebab 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -425,6 +425,10 @@ def _get_scheme_from_parts( weight_quant: BaseModel, input_quant: BaseModel, format: Optional[str] = None) -> "CompressedTensorsScheme": + + # use the per-layer format if defined, otherwise, use global format + format = format if format is not None else self.quant_format + # Detect If Mixed Precision if self._is_fp4a16_nvfp4(weight_quant, input_quant): return CompressedTensorsW4A16Fp4() @@ -437,14 +441,14 @@ def _get_scheme_from_parts( actorder=weight_quant.actorder) if self._is_wNa16_group_channel(weight_quant, input_quant): - if (self.quant_format == CompressionFormat.marlin_24.value + if (format == CompressionFormat.marlin_24.value and weight_quant.num_bits in W4A16SPARSE24_SUPPORTED_BITS): assert weight_quant.symmetric return CompressedTensorsW4A16Sparse24( strategy=weight_quant.strategy, num_bits=weight_quant.num_bits, group_size=weight_quant.group_size) - if (self.quant_format == CompressionFormat.pack_quantized.value + if (format == CompressionFormat.pack_quantized.value and weight_quant.num_bits in WNA16_SUPPORTED_BITS): return CompressedTensorsWNA16( num_bits=weight_quant.num_bits, @@ -453,10 +457,7 @@ def _get_scheme_from_parts( group_size=weight_quant.group_size, actorder=weight_quant.actorder) - act_quant_format = is_activation_quantization_format( - format - ) if format is not None else is_activation_quantization_format( - self.quant_format) + act_quant_format = is_activation_quantization_format(format) if act_quant_format: if self._is_fp4a4_nvfp4(weight_quant, input_quant): if cutlass_fp4_supported( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 1ee3478aa4f4..6279bb8b6057 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -22,6 +22,8 @@ is_valid_flashinfer_cutlass_fused_moe) from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import ( # noqa WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP) +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + find_matched_target) from vllm.model_executor.layers.quantization.utils import 
replace_parameter from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import ( build_flashinfer_fp4_cutlass_moe_prepare_finalize, reorder_w1w3_to_w3w1, @@ -65,12 +67,40 @@ def __init_(self, moe: FusedMoEConfig): @staticmethod def get_moe_method( quant_config: "CompressedTensorsConfig", # type: ignore # noqa E501 - layer: torch.nn.Module, + layer: torch.nn.Module ) -> "CompressedTensorsMoEMethod": # TODO: @dsikka: refactor this to use schemes as other kernels # are supported + check if the layer is being ignored. - weight_quant = quant_config.target_scheme_map["Linear"].get("weights") - input_quant = quant_config.target_scheme_map["Linear"].get( + # Check if a using "Linear" to select scheems + if "Linear" in quant_config.target_scheme_map: + matched_target = "Linear" + else: + # May have instead defined the linear layers in the fused model + + fused_layers = [ + "re:.*down_proj.*", "re:.*gate_proj.*", "re:.*up_proj.*" + ] + current_scheme = None + for fused_layer in fused_layers: + # Check if one of the fused layers are defined in quant_config + matched_target = find_matched_target( + layer_name=fused_layer, + module=layer, + targets=quant_config.target_scheme_map.keys(), + fused_mapping=quant_config.packed_modules_mapping) + + # Only valid if down_proj, gate_proj, and up_proj + # are mapped to the same quant scheme in the quant_config + if current_scheme is None: + current_scheme = quant_config.target_scheme_map.get( + matched_target) + else: + assert current_scheme == quant_config.target_scheme_map.get( + matched_target) + + weight_quant = quant_config.target_scheme_map[matched_target].get( + "weights") + input_quant = quant_config.target_scheme_map[matched_target].get( "input_activations") if quant_config._is_wNa16_group_channel(weight_quant, input_quant): From fce10dbed5441b4f918b23a2b63aae72bc00a2f6 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 27 Aug 2025 13:33:27 +0800 Subject: [PATCH 005/125] [XPU] Add xpu torch.compile support (#22609) Signed-off-by: Kunshang Ji --- .buildkite/scripts/hardware_ci/run-xpu-test.sh | 1 + vllm/attention/layer.py | 3 +-- vllm/compilation/fix_functionalization.py | 8 ++++++++ vllm/platforms/cpu.py | 4 ++++ vllm/platforms/cuda.py | 4 ++++ vllm/platforms/interface.py | 8 ++++++++ vllm/platforms/rocm.py | 4 ++++ vllm/platforms/xpu.py | 15 ++++++--------- 8 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 445cd2735c19..73f3e63fbf5f 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -31,6 +31,7 @@ docker run \ set -e echo $ZE_AFFINITY_MASK VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager + VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp cd tests diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 2d288bcbe0c9..237802afccde 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -190,8 +190,7 @@ def __init__( # torch.compile works by 
registering the attention as one giant # opaque custom op. For other platforms, we directly call them # and let torch.compile handle them. - self.use_direct_call = not current_platform.is_cuda_alike( - ) and not current_platform.is_cpu() + self.use_direct_call = not current_platform.opaque_attention_op() self.use_output = self.attn_backend.accept_output_buffer compilation_config = get_current_vllm_config().compilation_config diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py index 286221d32c1e..60ae14331879 100644 --- a/vllm/compilation/fix_functionalization.py +++ b/vllm/compilation/fix_functionalization.py @@ -9,6 +9,7 @@ from torch._higher_order_ops.auto_functionalize import auto_functionalized from vllm.logger import init_logger +from vllm.platforms import current_platform from .fx_utils import is_func from .vllm_inductor_pass import VllmInductorPass @@ -26,6 +27,13 @@ class FixFunctionalizationPass(VllmInductorPass): """ def __call__(self, graph: torch.fx.Graph): + # XPU does not support auto-functionalization yet. + # Will enable this when switching to vllm-xpu-kernels. + if current_platform.is_xpu(): + logger.debug("XPU platform does not support fix functionalization " + "pass currently.") + return + self.begin() self.dump_graph(graph, "before_fix_functionalization") diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index c748595a7153..5686fae5cd7d 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -335,3 +335,7 @@ def default_v1(cls, model_config) -> bool: return (cls.supports_v1(model_config) and arch in (CpuArchEnum.X86, CpuArchEnum.POWERPC, CpuArchEnum.ARM, CpuArchEnum.S390X)) + + @classmethod + def opaque_attention_op(cls) -> bool: + return True diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index c0e0fe35e402..5cbb7346436e 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -442,6 +442,10 @@ def supports_v1(cls, model_config: "ModelConfig") -> bool: def use_custom_allreduce(cls) -> bool: return True + @classmethod + def opaque_attention_op(cls) -> bool: + return True + @classmethod def get_static_graph_wrapper_cls(cls) -> str: return "vllm.compilation.cuda_graph.CUDAGraphWrapper" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index f6c17de86d05..01f3e2d977bc 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -509,6 +509,14 @@ def use_custom_allreduce(cls) -> bool: """ return False + @classmethod + def opaque_attention_op(cls) -> bool: + """ + Returns True if we register attention as one giant opaque custom op + on the current platform + """ + return False + @classmethod def validate_request( cls, diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 85b2fe2e480c..c6d14aa87c7f 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -411,6 +411,10 @@ def use_custom_allreduce(cls) -> bool: supported_archs = ['gfx94', 'gfx95'] return any(gfx in gcn_arch for gfx in supported_archs) + @classmethod + def opaque_attention_op(cls) -> bool: + return True + @classmethod def get_cu_count(cls, device_id: int = 0) -> int: return torch.cuda.get_device_properties( diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 235e5d8294e5..84f4cd725646 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -90,21 +90,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if cache_config and cache_config.block_size is None: cache_config.block_size = 64 - # FIXME: Temporarily forcing eager
mode - # remove after t.compile support stabilizes. - if (envs.VLLM_USE_V1 and model_config is not None - and not vllm_config.model_config.enforce_eager): - from vllm.config import CompilationLevel - vllm_config.compilation_config.level = CompilationLevel.NO_COMPILATION # noqa: E501 - # lazy import to avoid circular import from vllm.config import CUDAGraphMode compilation_config = vllm_config.compilation_config if compilation_config.cudagraph_mode is None or \ compilation_config.cudagraph_mode.max_cudagraph_mode() \ != CUDAGraphMode.NONE: - logger.info("[XPU] CUDA graph is not supported on XPU, " - "disabling cudagraphs.") + logger.info("[XPU] CUDA graph is not supported on XPU, disabling " + "cudagraphs. Fallback to cudagraph_mode=NONE") compilation_config.cudagraph_mode = CUDAGraphMode.NONE # check and update parallel config @@ -182,3 +175,7 @@ def check_if_supports_dtype(cls, torch_dtype: torch.dtype): "Intel Arc A770 have bfloat16 accuracy known issue. " "You can use float16 instead by explicitly setting the " "`dtype` flag in CLI, for example: --dtype=half.") + + @classmethod + def opaque_attention_op(cls) -> bool: + return True From 9de25c294b92e42a12d1fbbb3ab3f633fa80291c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 27 Aug 2025 13:51:50 +0800 Subject: [PATCH 006/125] [CI/Build] Remove redundant LoRA model tests (#23706) Signed-off-by: Jee Jee Li --- tests/lora/conftest.py | 5 -- tests/lora/test_baichuan.py | 112 ------------------------------------ tests/lora/test_phi.py | 71 ----------------------- 3 files changed, 188 deletions(-) delete mode 100644 tests/lora/test_baichuan.py delete mode 100644 tests/lora/test_phi.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index cba573b63c04..3475993ff8f0 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -216,11 +216,6 @@ def tinyllama_lora_files(): return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") -@pytest.fixture(scope="session") -def phi2_lora_files(): - return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora") - - @pytest.fixture def reset_default_device(): """ diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py deleted file mode 100644 index 774ebb9db210..000000000000 --- a/tests/lora/test_baichuan.py +++ /dev/null @@ -1,112 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -import vllm -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.lora.request import LoRARequest - -MODEL_PATH = "baichuan-inc/Baichuan-7B" - -PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: - prompts = [ - PROMPT_TEMPLATE.format(query="How many singers do we have?"), - PROMPT_TEMPLATE.format( - query= - "What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 - ), - PROMPT_TEMPLATE.format( - query= - "Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501 - ), - ] - print(prompts) - sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) - # Print the outputs. - generated_texts: list[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -def test_baichuan_lora(baichuan_lora_files): - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True) - - expected_lora_output = [ - "SELECT count(*) FROM singer", - "SELECT avg(age) , min(age) , max(age) FROM singer WHERE Country = 'France'", # noqa: E501 - "SELECT name , country , age FROM singer ORDER BY age ASC", - ] - - output1 = do_sample(llm, baichuan_lora_files, lora_id=1) - for i in range(len(expected_lora_output)): - assert output1[i] == expected_lora_output[i] - output2 = do_sample(llm, baichuan_lora_files, lora_id=2) - for i in range(len(expected_lora_output)): - assert output2[i] == expected_lora_output[i] - - -@pytest.mark.parametrize("fully_sharded", [True, False]) -def test_baichuan_tensor_parallel_equality(baichuan_lora_files, - num_gpus_available, fully_sharded): - if num_gpus_available < 4: - pytest.skip(f"Not enough GPUs for tensor parallelism {4}") - - llm_tp1 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - trust_remote_code=True, - fully_sharded_loras=fully_sharded) - output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) - - del llm_tp1 - cleanup_dist_env_and_memory() - - llm_tp2 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=2, - trust_remote_code=True, - fully_sharded_loras=fully_sharded) - output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) - - del llm_tp2 - cleanup_dist_env_and_memory() - - assert output_tp1 == output_tp2 - - llm_tp4 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - max_lora_rank=64, - tensor_parallel_size=4, - trust_remote_code=True, - fully_sharded_loras=fully_sharded) - output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) - - del llm_tp4 - cleanup_dist_env_and_memory() - - assert output_tp1 == output_tp4 diff --git a/tests/lora/test_phi.py b/tests/lora/test_phi.py deleted file mode 100644 index 3090941e6367..000000000000 --- a/tests/lora/test_phi.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import vllm -from vllm.lora.request import 
LoRARequest - -MODEL_PATH = "microsoft/phi-2" - -PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: - prompts = [ - PROMPT_TEMPLATE.format( - sql_prompt= - "Which catalog publisher has published the most catalogs?", - context="CREATE TABLE catalogs (catalog_publisher VARCHAR);"), - PROMPT_TEMPLATE.format( - sql_prompt= - "Which trip started from the station with the largest dock count? Give me the trip id.", # noqa: E501 - context= - "CREATE TABLE trip (id VARCHAR, start_station_id VARCHAR); CREATE TABLE station (id VARCHAR, dock_count VARCHAR);" # noqa: E501 - ), - PROMPT_TEMPLATE.format( - sql_prompt= - "How many marine species are found in the Southern Ocean?", # noqa: E501 - context= - "CREATE TABLE marine_species (name VARCHAR(50), common_name VARCHAR(50), location VARCHAR(50));" # noqa: E501 - ), - ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=64, - stop="### End") - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None, - ) - # Print the outputs. - generated_texts: list[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text.strip() - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -def test_phi2_lora(phi2_lora_files): - # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI, - # Otherwise, the lora-test will fail due to CUDA OOM. - llm = vllm.LLM(MODEL_PATH, - max_model_len=1024, - enable_lora=True, - max_loras=2, - enforce_eager=True, - enable_chunked_prefill=True) - - expected_lora_output = [ - "SELECT catalog_publisher, COUNT(*) as num_catalogs FROM catalogs GROUP BY catalog_publisher ORDER BY num_catalogs DESC LIMIT 1;", # noqa: E501 - "SELECT trip.id FROM trip JOIN station ON trip.start_station_id = station.id WHERE station.dock_count = (SELECT MAX(dock_count) FROM station);", # noqa: E501 - "SELECT COUNT(*) FROM marine_species WHERE location = 'Southern Ocean';", # noqa: E501 - ] - - output1 = do_sample(llm, phi2_lora_files, lora_id=1) - for i in range(len(expected_lora_output)): - assert output1[i].startswith(expected_lora_output[i]) - output2 = do_sample(llm, phi2_lora_files, lora_id=2) - for i in range(len(expected_lora_output)): - assert output2[i].startswith(expected_lora_output[i]) From 8dbf6ed7be3f8602257ce1879825d4b5e3554d67 Mon Sep 17 00:00:00 2001 From: "rongfu.leng" Date: Wed, 27 Aug 2025 13:54:39 +0800 Subject: [PATCH 007/125] [Bugfix] fix when config.yaml config value is list parse error (#23528) Signed-off-by: rongfu.leng --- tests/utils_/test_utils.py | 41 ++++++++++++++++++++++++++++++++++++++ vllm/utils/__init__.py | 9 +++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/tests/utils_/test_utils.py b/tests/utils_/test_utils.py index 084d82dee11b..04195ea0cf92 100644 --- a/tests/utils_/test_utils.py +++ b/tests/utils_/test_utils.py @@ -5,13 +5,17 @@ import asyncio import hashlib import json +import os import pickle import socket +import tempfile from collections.abc import AsyncIterator +from pathlib import Path from unittest.mock import patch import pytest import torch +import yaml import zmq from transformers import AutoTokenizer from vllm_test_utils.monitor import monitor @@ -991,3 +995,40 @@ def child_thread_func(): child_thread.join(timeout=5) if 
child_thread.is_alive(): pytest.fail("Child thread failed to exit properly") + + +def test_load_config_file(tmp_path): + # Define the configuration data + config_data = { + "enable-logging": True, + "list-arg": ["item1", "item2"], + "port": 12323, + "tensor-parallel-size": 4 + } + + # Write the configuration data to a temporary YAML file + config_file_path = tmp_path / "config.yaml" + with open(config_file_path, "w") as config_file: + yaml.dump(config_data, config_file) + + # Initialize the parser + parser = FlexibleArgumentParser() + + # Call the function with the temporary file path + processed_args = parser.load_config_file(str(config_file_path)) + + # Expected output + expected_args = [ + "--enable-logging", + "--list-arg", + "item1", + "item2", + "--port", + "12323", + "--tensor-parallel-size", + "4", + ] + + # Assert that the processed arguments match the expected output + assert processed_args == expected_args + os.remove(str(config_file_path)) diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index 7c34a858c0a2..60bddc5b500b 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1974,7 +1974,7 @@ def _pull_args_from_config(self, args: list[str]) -> list[str]: file_path = args[index + 1] - config_args = self._load_config_file(file_path) + config_args = self.load_config_file(file_path) # 0th index is for {serve,chat,complete} # optionally followed by model_tag (only for serve) @@ -2005,7 +2005,7 @@ def _pull_args_from_config(self, args: list[str]) -> list[str]: return args - def _load_config_file(self, file_path: str) -> list[str]: + def load_config_file(self, file_path: str) -> list[str]: """Loads a yaml file and returns the key value pairs as a flattened list with argparse like pattern ```yaml @@ -2046,6 +2046,11 @@ def _load_config_file(self, file_path: str) -> list[str]: if isinstance(value, bool) and key not in store_boolean_arguments: if value: processed_args.append('--' + key) + elif isinstance(value, list): + if value: + processed_args.append('--' + key) + for item in value: + processed_args.append(str(item)) else: processed_args.append('--' + key) processed_args.append(str(value)) From 69244e67e6822f1c15816f887659e1ccc18c2632 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 27 Aug 2025 14:19:13 +0800 Subject: [PATCH 008/125] [Core] Use key-only cache for `BaseMultiModalProcessor` (#23018) Signed-off-by: DarkLight1337 --- docs/configuration/conserving_memory.md | 2 +- docs/configuration/optimization.md | 44 +- .../multimodal/processing/test_common.py | 8 +- tests/multimodal/test_cache.py | 182 +++++++- vllm/config/__init__.py | 26 +- vllm/engine/arg_utils.py | 14 +- vllm/engine/llm_engine.py | 15 +- vllm/inputs/preprocess.py | 22 +- vllm/inputs/registry.py | 12 +- .../models/hyperclovax_vision.py | 7 +- vllm/model_executor/models/llava.py | 8 +- vllm/model_executor/models/minicpmv.py | 40 +- vllm/model_executor/models/mistral3.py | 8 +- vllm/model_executor/models/phi3v.py | 20 +- vllm/model_executor/models/phi4mm.py | 21 +- vllm/model_executor/models/tarsier.py | 7 +- vllm/multimodal/cache.py | 405 +++++++++++++++++- vllm/multimodal/inputs.py | 38 +- vllm/multimodal/processing.py | 187 ++++---- vllm/multimodal/profiling.py | 4 +- vllm/multimodal/registry.py | 90 ++-- vllm/v1/engine/async_llm.py | 3 +- vllm/v1/engine/core.py | 17 +- vllm/v1/engine/llm_engine.py | 3 +- vllm/v1/engine/mm_input_cache.py | 121 ------ vllm/v1/engine/processor.py | 29 +- vllm/v1/worker/gpu_model_runner.py | 3 + vllm/v1/worker/tpu_model_runner.py | 3 + 
vllm/v1/worker/utils.py | 9 +- 29 files changed, 954 insertions(+), 394 deletions(-) delete mode 100644 vllm/v1/engine/mm_input_cache.py diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md index 058eba5fe0b1..efda9c8e019e 100644 --- a/docs/configuration/conserving_memory.md +++ b/docs/configuration/conserving_memory.md @@ -86,7 +86,7 @@ llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", If you run out of CPU RAM, try the following options: -- (Multi-modal models only) you can set the size of multi-modal processor cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB per API process + 4 GiB per engine core process) +- (Multi-modal models only) you can set the size of multi-modal cache by setting `mm_processor_cache_gb` engine argument (default 4 GiB). - (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB). ## Multi-modal input limits diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index bb47e1b90f08..3eaf2185a559 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -204,20 +204,33 @@ vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2 to avoid CPU resource exhaustion. !!! note - [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled - because it requires a one-to-one correspondence between API and engine core processes. + API server scale-out disables [multi-modal IPC caching](#ipc-caching) + because it requires a one-to-one correspondance between API and engine core processes. -## Multi-Modal Caching + This does not impact [multi-modal processor caching](#processor-caching). -### Processor Cache +## Multi-Modal Caching -By default, the multi-modal processor cache is enabled to avoid repeatedly processing -the same multi-modal inputs via Hugging Face `AutoProcessor`, +Multi-modal caching avoids repeated transfer or processing of the same multi-modal data, which commonly occurs in multi-turn conversations. -You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` -(default 4 GiB per API process + 4 GiB per engine core process). -If you do not benefit much from the cache, you can disable it completely via `mm_processor_cache_gb=0`. +### Processor Caching + +Multi-modal processor caching is automatically enabled +to avoid repeatedly processing the same multi-modal inputs in `BaseMultiModalProcessor`. + +### IPC Caching + +Multi-modal IPC caching is automatically enabled when +there is a one-to-one correspondance between API (`P0`) and engine core (`P1`) processes, +to avoid repeatedly transferring the same multi-modal inputs between them. + +### Configuration + +You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` (default 4 GiB). + +If you do not benefit much from the cache, you can disable both IPC +and processor caching completely via `mm_processor_cache_gb=0`. Examples: @@ -230,3 +243,16 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct", mm_processor_cache_gb=0) ``` + +### Cache Placement + +Based on the configuration, the content of the multi-modal caches on `P0` and `P1` are as follows: + +| Processor Caching | IPC Caching | `P0` Cache | `P1` Cache | Max. 
Memory | +|-------------------|-------------|------------|------------|-------------| +| ✅ | ✅ | K | K + V | `mm_processor_cache_gb * data_parallel_size` | +| ✅ | ❌ | K + V | N/A | `mm_processor_cache_gb * api_server_count` | +| ❌ | ❌ | N/A | N/A | `0` | + +K: Stores the hashes of multi-modal items +V: Stores the processed tensor data of multi-modal items diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 6361cb9b5586..3ff4360b8334 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -14,8 +14,9 @@ from vllm.config import ModelConfig from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict +from vllm.multimodal.cache import MultiModalProcessorOnlyCache from vllm.multimodal.inputs import MultiModalInputs -from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache +from vllm.multimodal.processing import BaseMultiModalProcessor from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer, cached_tokenizer_from_config, encode_tokens) @@ -63,6 +64,8 @@ def _test_processing_correctness( revision=model_info.revision, trust_remote_code=model_info.trust_remote_code, hf_overrides=model_info.hf_overrides, + # Ensure that the cache can fit all of the data + mm_processor_cache_gb=2048, ) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) @@ -71,8 +74,7 @@ def _test_processing_correctness( model_config, tokenizer=cached_tokenizer_from_config(model_config), ) - # Ensure that it can fit all of the data - cache = ProcessingCache(capacity_gb=2048) + cache = MultiModalProcessorOnlyCache(model_config) processing_info = factories.info(ctx) supported_mm_limits = processing_info.get_supported_mm_limits() diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index 088cd00db2e0..44c05db2278f 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -1,32 +1,64 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import numpy as np import pytest import torch -from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata +from vllm.config import ModelConfig, ParallelConfig, VllmConfig +from vllm.multimodal.cache import (MultiModalCache, + MultiModalProcessorCacheItem, + MultiModalProcessorCacheItemMetadata, + processor_cache_from_config, + receiver_cache_from_config) +from vllm.multimodal.hasher import MultiModalHasher from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargsItem, MultiModalKwargsItems, MultiModalSharedField) +from vllm.multimodal.processing import PromptInsertion +from vllm.multimodal.registry import MultiModalRegistry + +def _dummy_elem( + modality: str, + key: str, + size: int, + *, + rng: Optional[np.random.RandomState] = None, +): + if rng is None: + data = torch.empty((size, ), dtype=torch.int8) + else: + data = torch.from_numpy(rng.randint(4, size=(size, ), dtype=np.int8)) -def _dummy_elem(modality: str, key: str, size: int): return MultiModalFieldElem( modality=modality, key=key, - data=torch.empty((size, ), dtype=torch.int8), + data=data, field=MultiModalSharedField(1), ) -def _dummy_item(modality: str, size_by_key: dict[str, int]): +def _dummy_item( + modality: str, + size_by_key: dict[str, int], + *, + rng: Optional[np.random.RandomState] = None, +): return 
MultiModalKwargsItem.from_elems([ - _dummy_elem(modality, key, size) for key, size in size_by_key.items() + _dummy_elem(modality, key, size, rng=rng) + for key, size in size_by_key.items() ]) -def _dummy_items(size_by_key_modality: dict[str, dict[str, int]]): +def _dummy_items( + size_by_key_modality: dict[str, dict[str, int]], + *, + rng: Optional[np.random.RandomState] = None, +): return MultiModalKwargsItems.from_seq([ - _dummy_item(modality, size_by_key) + _dummy_item(modality, size_by_key, rng=rng) for modality, size_by_key in size_by_key_modality.items() ]) @@ -48,5 +80,139 @@ def test_cache_item_size(item, expected_size): cache[""] = item assert cache.currsize == expected_size - cache[""] = MultiModalCacheItemMetadata.wraps(item) + prompt_update = PromptInsertion("dummy", "target", "insertion") \ + .resolve(0) + + cache[""] = MultiModalProcessorCacheItem(item, [prompt_update]) + assert cache.currsize == expected_size + + cache[""] = MultiModalProcessorCacheItemMetadata(item, [prompt_update]) assert cache.currsize == expected_size + + +def _create_vllm_config( + *, + mm_processor_cache_gb: float, + enable_ipc: bool, +): + return VllmConfig( + model_config=ModelConfig(mm_processor_cache_gb=mm_processor_cache_gb), + parallel_config=ParallelConfig( + data_parallel_size=1 if enable_ipc else 2), + ) + + +def _compare_caches( + config_0: VllmConfig, + config_1: VllmConfig, + *, + item_capacity: int = 8, + hit_rate: float = 0.5, + max_items_per_iter: int = 3, + is_cached_calls_per_iter: int, + n_iter: int = 100, + seed: int = 0, +): + mm_registry = MultiModalRegistry() + cache_0_p0 = processor_cache_from_config(config_0, mm_registry) + cache_0_p1 = receiver_cache_from_config(config_0, mm_registry) + cache_1_p0 = processor_cache_from_config(config_1, mm_registry) + cache_1_p1 = receiver_cache_from_config(config_1, mm_registry) + + cache_size_gb = max( + config_0.model_config.mm_processor_cache_gb, + config_1.model_config.mm_processor_cache_gb, + ) + item_size_gb = int(cache_size_gb / item_capacity) + + rng = np.random.RandomState(seed) + all_items = [ + _dummy_item("item", {"key": item_size_gb}, rng=rng) + for _ in range(int(item_capacity / hit_rate)) + ] + all_hashes = [ + MultiModalHasher.hash_kwargs(item=item.get_data()) + for item in all_items + ] + + # Should not be used since there is nothing to convert to text + prompt_update = PromptInsertion("dummy", "target", "insertion") + + for it in range(n_iter): + num_items_to_select = rng.randint(0, max_items_per_iter) + item_idxs_to_select = rng.choice(len(all_items), num_items_to_select) + + selected_items = [all_items[idx] for idx in item_idxs_to_select] + selected_hashes = [all_hashes[idx] for idx in item_idxs_to_select] + + if cache_0_p0 is None: + cache_0_p0_out = selected_items + else: + for _ in range(is_cached_calls_per_iter): + cache_0_p0.is_cached(selected_hashes) + cache_0_p0_out = [ + item for item, _ in cache_0_p0.get_and_update( + [(item, prompt_update.content) for item in selected_items], + selected_hashes, + ) + ] + + if cache_1_p0 is None: + cache_1_p0_out = selected_items + else: + for _ in range(is_cached_calls_per_iter): + cache_1_p0.is_cached(selected_hashes) + cache_1_p0_out = [ + item for item, _ in cache_1_p0.get_and_update( + [(item, prompt_update.content) for item in selected_items], + selected_hashes, + ) + ] + + if cache_0_p1 is None: + cache_0_p1_out = cache_0_p0_out + else: + cache_0_p1_out = cache_0_p1.get_and_update(cache_0_p0_out, + selected_hashes) + + if cache_1_p1 is None: + cache_1_p1_out = cache_1_p0_out 
+ else: + cache_1_p1_out = cache_1_p1.get_and_update(cache_1_p0_out, + selected_hashes) + + assert cache_0_p1_out == cache_1_p1_out, f"Failed at {it=}" + + +@pytest.mark.parametrize("is_cached_calls_per_iter", [1, 2, 3]) +def test_ipc_enable_disable_consistency(is_cached_calls_per_iter): + cache_size_gb = 1 / (1 << 20) + + vllm_config_ipc_enabled = _create_vllm_config( + mm_processor_cache_gb=cache_size_gb, + enable_ipc=True, + ) + vllm_config_ipc_disabled = _create_vllm_config( + mm_processor_cache_gb=0, + enable_ipc=False, + ) + vllm_config_cache_disabled = _create_vllm_config( + mm_processor_cache_gb=cache_size_gb, + enable_ipc=True, + ) + + _compare_caches( + vllm_config_ipc_enabled, + vllm_config_ipc_disabled, + is_cached_calls_per_iter=is_cached_calls_per_iter, + ) + _compare_caches( + vllm_config_ipc_disabled, + vllm_config_cache_disabled, + is_cached_calls_per_iter=is_cached_calls_per_iter, + ) + _compare_caches( + vllm_config_cache_disabled, + vllm_config_ipc_enabled, + is_cached_calls_per_iter=is_cached_calls_per_iter, + ) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index cd0e17977ede..ac6f51df9549 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -437,7 +437,7 @@ class ModelConfig: from `AutoProcessor.from_pretrained`. The available overrides depend on the model that is being run. For example, for Phi-3-Vision: `{"num_crops": 4}`. """ - mm_processor_cache_gb: int = 4 + mm_processor_cache_gb: float = 4 """The size (in GiB) of the multi-modal processor cache, which is used to avoid re-processing past multi-modal inputs. @@ -884,12 +884,6 @@ def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: return None - def set_mm_processor_cache_gb(self, value: int) -> None: - mm_config = self.get_multimodal_config() - - self.mm_processor_cache_gb = value - mm_config.mm_processor_cache_gb = value - def _get_encoder_config(self): return get_sentence_transformer_tokenizer_config( self.model, self.revision) @@ -1697,22 +1691,6 @@ def uses_mrope(self) -> bool: def is_multimodal_model(self) -> bool: return self.multimodal_config is not None - @property - def enable_mm_processor_cache(self) -> bool: - """Whether the multi-modal processor cache should be enabled.""" - mm_config = self.multimodal_config - if mm_config is None: - return False - - return mm_config.mm_processor_cache_gb > 0 - - def get_mm_input_cache_gb(self) -> int: - mm_config = self.multimodal_config - if mm_config is None: - return 0 - - return envs.VLLM_MM_INPUT_CACHE_GIB - @property def is_cross_encoder(self) -> bool: return (self._model_info.supports_cross_encoding @@ -2561,7 +2539,7 @@ class MultiModalConfig: `{"num_crops": 4}`. 
""" - mm_processor_cache_gb: int = 4 + mm_processor_cache_gb: float = 4 """ The size (in GiB) of the multi-modal processor cache, which is used to diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f24c50ad7326..9e7c95ea5205 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -351,7 +351,7 @@ class EngineArgs: mm_processor_kwargs: Optional[Dict[str, Any]] = \ MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED - mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb + mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb mm_encoder_tp_mode: MMEncoderTPMode = MultiModalConfig.mm_encoder_tp_mode skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling # LoRA fields @@ -1293,18 +1293,6 @@ def create_engine_config( worker_extension_cls=self.worker_extension_cls, ) - if model_config.is_multimodal_model: - dp_supports_mm_processor_cache = (self.data_parallel_size == 1 - or data_parallel_external_lb) - if (not dp_supports_mm_processor_cache - and model_config.mm_processor_cache_gb > 0): - logger.warning( - "Multi-modal processor cache is disabled because " - "it is not compatible with data parallelism when " - "there does not exist a one-to-one correspondance " - "between API and engine core processes.") - model_config.set_mm_processor_cache_gb(0) - speculative_config = self.create_speculative_config( target_model_config=model_config, target_parallel_config=parallel_config, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index cbd714c159eb..03c2f0375da4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -36,6 +36,7 @@ from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.outputs import (PoolingRequestOutput, RequestOutput, RequestOutputFactory) @@ -250,9 +251,13 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.generation_config_fields = ( self.model_config.try_get_generation_config()) - self.input_preprocessor = InputPreprocessor(self.model_config, - self.tokenizer, - mm_registry) + self.input_preprocessor = InputPreprocessor( + self.model_config, + self.tokenizer, + mm_registry, + mm_processor_cache=processor_only_cache_from_config( + self.model_config, mm_registry), + ) self.model_executor = executor_class(vllm_config=vllm_config) @@ -840,8 +845,8 @@ def has_unfinished_requests_for_virtual_engine( def reset_mm_cache(self) -> bool: """Reset the multi-modal cache.""" - return self.input_preprocessor.mm_registry.reset_processor_cache( - self.model_config) + self.input_preprocessor.clear_cache() + return True def reset_prefix_cache(self, device: Optional[Device] = None) -> bool: """Reset prefix cache for all devices.""" diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3f521012e82a..f0d0cab3df3d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -11,6 +11,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalInputs) from vllm.transformers_utils.tokenizer import AnyTokenizer @@ 
-32,12 +33,14 @@ def __init__( model_config: ModelConfig, tokenizer: Optional[TokenizerGroup], mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + mm_processor_cache: Optional[BaseMultiModalProcessorCache] = None, ) -> None: super().__init__() self.model_config = model_config self.tokenizer = tokenizer self.mm_registry = mm_registry + self.mm_processor_cache = mm_processor_cache def get_tokenizer_group(self) -> TokenizerGroup: if self.tokenizer is None: @@ -261,8 +264,11 @@ def _process_multimodal( """ tokenizer = self._get_mm_tokenizer(lora_request) - mm_processor = self.mm_registry.create_processor(self.model_config, - tokenizer=tokenizer) + mm_processor = self.mm_registry.create_processor( + self.model_config, + tokenizer=tokenizer, + cache=self.mm_processor_cache, + ) if mm_processor_kwargs is None: mm_processor_kwargs = {} @@ -286,8 +292,12 @@ async def _process_multimodal_async( """ tokenizer = await self._get_mm_tokenizer_async(lora_request) - mm_processor = self.mm_registry.create_processor(self.model_config, - tokenizer=tokenizer) + mm_processor = self.mm_registry.create_processor( + self.model_config, + tokenizer=tokenizer, + cache=self.mm_processor_cache, + ) + if mm_processor_kwargs is None: mm_processor_kwargs = {} @@ -860,3 +870,7 @@ async def preprocess_async( tokenization_kwargs=tokenization_kwargs, lora_request=lora_request, ) + + def clear_cache(self) -> None: + if self.mm_processor_cache is not None: + self.mm_processor_cache.clear_cache() diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index ef146fdfbf97..f0b392e9767a 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -223,20 +223,26 @@ def dummy_data_for_profiling( The model is identified by ``model_config``. """ # Avoid circular import + from vllm.multimodal.cache import processor_only_cache_from_config from vllm.sequence import SequenceData if not model_config.is_multimodal_model: seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) return DummyData(seq_data=seq_data) + cache = processor_only_cache_from_config(model_config, mm_registry) + # Encoder dummy data does not contain multi-modal data if is_encoder_data: - enc_data = mm_registry.get_encoder_dummy_data( - model_config, seq_len) + enc_data = mm_registry.get_encoder_dummy_data(model_config, + seq_len, + cache=cache) seq_data = SequenceData.from_seqs(enc_data.prompt_token_ids) return DummyData(seq_data=seq_data) - dec_data = mm_registry.get_decoder_dummy_data(model_config, seq_len) + dec_data = mm_registry.get_decoder_dummy_data(model_config, + seq_len, + cache=cache) return DummyData( seq_data=SequenceData.from_seqs(dec_data.prompt_token_ids), diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py index eeb8291c7784..53f0585541b1 100644 --- a/vllm/model_executor/models/hyperclovax_vision.py +++ b/vllm/model_executor/models/hyperclovax_vision.py @@ -33,12 +33,13 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate) + BaseProcessingInfo, PromptReplacement, + 
PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors @@ -367,7 +368,7 @@ def _build_hcxvision_hf_processor( info: HCXVisionProcessingInfo, dummy_inputs: BaseDummyInputsBuilder[HCXVisionProcessingInfo], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: if isinstance(info, HCXVisionProcessingInfo): return HCXVisionMultiModalProcessor( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index bc53982c938c..0ee26b68345c 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -22,14 +22,14 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItems) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate, - PromptUpdateDetails) + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.jsontree import json_map_leaves @@ -394,7 +394,7 @@ def _build_llava_or_pixtral_hf_processor( info: _I, dummy_inputs: BaseDummyInputsBuilder[_I], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: if isinstance(info, PixtralHFProcessingInfo): return PixtralHFMultiModalProcessor( diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index a2a71bdd12b3..c22d871ab20d 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -58,7 +58,8 @@ VideoItem, VideoProcessorItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, - PromptUpdate, PromptUpdateDetails) + PromptUpdate, PromptUpdateDetails, + ResolvedPromptUpdate, _seq2text) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -744,6 +745,43 @@ def get_video_replacement(item_idx: int): for modality, pattern in placeholders ] + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> ResolvedPromptUpdate: + new_update = super()._recompute_cached_prompt_update( + cached_update, + new_item_idx, + ) + + if cached_update.modality == "image": + tokenizer = self.info.get_tokenizer() + image_processor = self.info.get_image_processor() + version = self.info.get_model_version() + + text = _seq2text(tokenizer, cached_update.content.full) + prev_item_idx = cached_update.item_idx + + if version == (2, 0) or version == (2, 5): + im_start = image_processor.im_start_token + im_end = image_processor.im_end_token + else: + im_start = image_processor.im_id_start + im_end = image_processor.im_id_end + + new_update = new_update.with_content( + PromptUpdateDetails.select_text( + text.replace( + f"{im_start}{prev_item_idx}{im_end}", + f"{im_start}{new_item_idx}{im_end}", 
+ 1, + ), + "", + )) + + return new_update + def _get_mm_fields_config( self, hf_inputs: BatchFeature, diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 438513433d3b..08948960b275 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -22,14 +22,14 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalKwargsItems) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate, - PromptUpdateDetails) + BaseProcessingInfo, PromptReplacement, + PromptUpdate, PromptUpdateDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape @@ -322,7 +322,7 @@ def _build_mistral3_processor( info: _I, dummy_inputs: BaseDummyInputsBuilder[_I], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: assert isinstance(info, Mistral3ProcessingInfo) return Mistral3MultiModalProcessor( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 61e09d56046c..4522c7043d01 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -41,7 +41,8 @@ BaseProcessingInfo, MultiModalPromptUpdates, PlaceholderFeaturesInfo, - PromptReplacement, PromptUpdate) + PromptReplacement, PromptUpdate, + ResolvedPromptUpdate) # yapf: enable from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors @@ -440,6 +441,23 @@ def get_replacement_phi3v(item_idx: int): ) ] + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> ResolvedPromptUpdate: + new_update = super()._recompute_cached_prompt_update( + cached_update, + new_item_idx, + ) + + if cached_update.modality == "image": + hf_processor = self.info.get_hf_processor() + image_tokens: list[str] = hf_processor.img_tokens # type: ignore + new_update = new_update.with_target(image_tokens[new_item_idx]) + + return new_update + def _apply_prompt_updates( self, token_ids: list[int], diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 5129770e8d49..211cbd9c819c 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -27,7 +27,7 @@ MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, PromptReplacement, - PromptUpdate) + PromptUpdate, ResolvedPromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -850,6 +850,25 @@ def get_audio_replacement_phi4mm(item_idx: int): ), ] + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> ResolvedPromptUpdate: + new_update = super()._recompute_cached_prompt_update( + cached_update, + new_item_idx, + ) + + if cached_update.modality == "image": + 
image_tokens: list[str] = self.info.image_tokens # type: ignore + new_update = new_update.with_target(image_tokens[new_item_idx]) + elif cached_update.modality == "audio": + audio_tokens: list[str] = self.info.audio_tokens # type: ignore + new_update = new_update.with_target(audio_tokens[new_item_idx]) + + return new_update + @MULTIMODAL_REGISTRY.register_processor( Phi4MMMultiModalProcessor, diff --git a/vllm/model_executor/models/tarsier.py b/vllm/model_executor/models/tarsier.py index 9b9cca8c6bd3..c66867315e55 100644 --- a/vllm/model_executor/models/tarsier.py +++ b/vllm/model_executor/models/tarsier.py @@ -25,12 +25,13 @@ from vllm.model_executor.models.llava import LlavaDummyInputsBuilder from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import BaseMultiModalProcessorCache from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargsItems from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, ProcessingCache, - PromptReplacement, PromptUpdate) + BaseProcessingInfo, PromptReplacement, + PromptUpdate) from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.utils.jsontree import json_map_leaves @@ -332,7 +333,7 @@ def _build_tarsier_hf_processor( info: _I_Tarsier, dummy_inputs: BaseDummyInputsBuilder[_I_Tarsier], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor: if isinstance(info, TarsierProcessingInfo): return TarsierMultiModalProcessor( diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 5cec8e71fb26..0e81cb6d4d19 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -1,11 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import sys -from collections.abc import Mapping -from dataclasses import dataclass -from typing import TypeVar, Union +from abc import ABC, abstractmethod +from collections.abc import Mapping, Sequence +from typing import TYPE_CHECKING, Generic, Optional, TypeVar, Union import torch +from typing_extensions import TypeAlias, override from vllm.logger import init_logger from vllm.utils import GiB_bytes, LRUCache @@ -15,24 +16,67 @@ MultiModalKwargsItem, MultiModalKwargsItems, NestedTensors) +if TYPE_CHECKING: + from vllm.config import ModelConfig, VllmConfig + + from .processing import ResolvedPromptUpdate + from .registry import MultiModalRegistry + logger = init_logger(__name__) -@dataclass -class MultiModalCacheItemMetadata: - size: int +class MultiModalProcessorCacheItem: + """ + The data to store inside `MultiModalProcessorOnlyCache`. - @classmethod - def wraps(cls, value: "MultiModalCacheValue"): - return cls(size=MultiModalCache.get_item_size(value)) + Args: + item: The processed tensor data corresponding to a multi-modal item. + prompt_updates: The prompt updates corresponding to `item`. + """ + + def __init__( + self, + item: MultiModalKwargsItem, + prompt_updates: Sequence["ResolvedPromptUpdate"], + ) -> None: + super().__init__() + + self.item = item + self.prompt_updates = prompt_updates + + +class MultiModalProcessorCacheItemMetadata: + """ + The metadata to store inside `MultiModalProcessorSenderCache`. 
+ + Args: + item: The processed tensor data corresponding to a multi-modal item. + Since P1 already stores the tensor data, we only store its size + metadata in P0 to reduce memory usage. The size metadata is still + needed to keep the same cache eviction policy as P0. + prompt_updates: The prompt updates corresponding to `item`. + This needs to stay on P0 because for some models, they are + dependent on the processed tensor data (cached on P1). + """ + + def __init__( + self, + item: MultiModalKwargsItem, + prompt_updates: Sequence["ResolvedPromptUpdate"], + ) -> None: + super().__init__() + + self.item_size = MultiModalCache.get_item_size(item) + self.prompt_updates = prompt_updates MultiModalCacheValue = Union[ + MultiModalProcessorCacheItem, + MultiModalProcessorCacheItemMetadata, MultiModalKwargsItems, MultiModalKwargsItem, MultiModalKwargs, Mapping[str, NestedTensors], - MultiModalCacheItemMetadata, ] _V = TypeVar("_V", bound=MultiModalCacheValue) @@ -47,8 +91,10 @@ def get_leaf_size( *, debug: bool = False, ) -> int: - if isinstance(leaf, MultiModalFieldElem): - return cls.get_item_size(leaf.data) # type: ignore + if isinstance(leaf, MultiModalProcessorCacheItem): + return cls.get_leaf_size(leaf.item) + if isinstance(leaf, MultiModalProcessorCacheItemMetadata): + return leaf.item_size # These are not subclasses of dict if isinstance(leaf, MultiModalKwargsItems): @@ -58,13 +104,13 @@ def get_leaf_size( if isinstance(leaf, MultiModalKwargs): return cls.get_item_size(leaf.data) # type: ignore + if isinstance(leaf, MultiModalFieldElem): + return cls.get_item_size(leaf.data) # type: ignore + # sys.getsizeof doesn't work for tensors if isinstance(leaf, torch.Tensor): return leaf.nbytes - if isinstance(leaf, MultiModalCacheItemMetadata): - return leaf.size - return sys.getsizeof(leaf) @classmethod @@ -98,3 +144,332 @@ def get_lru_cache( GiB_bytes * capacity_gb, getsizeof=lambda x: cls.get_item_size(x, debug=debug), ) + + +_I = TypeVar("_I", contravariant=True) +_O = TypeVar("_O", covariant=True) + + +class BaseMultiModalCache(ABC, Generic[_I, _O]): + """ + Abstract base class to read/write multi-modal items from cache. + + The idea of multi-modal caching is based on having a client and server + where the client executes in the frontend process (=P0) and + the server in the core process (=P1). The data flow is as follows: + + ``` + is_cached() x N get_and_update() + P0: From API -----------------> -----------------> To P1 + + get_and_update() + P1: From P0 -----------------> To model + ``` + + `is_cached()` can be called any number of times in P0. However, + `get_and_update()` must be called in P0 and P1 one after another + so that their cache eviction order remains the same. + + This ensures that the keys in P0 and P1 caches are mirrored, + allowing us to determine whether a key is cached in P1 by looking + up the P0 cache, without having to communicate with P1. + """ + + @abstractmethod + def get_and_update_item( + self, + mm_item: _I, + mm_hash: str, + ) -> _O: + """ + Possibly update a multi-modal item based on whether it is + in the underlying cache. + + This update is done out-of-place and updates the cache eviction order. + + Args: + mm_item: The multi-modal item to update. + mm_hash: The hash of `mm_item`. + + Returns: + The update multi-modal item. 
+ """ + raise NotImplementedError + + def get_and_update( + self, + mm_items: Sequence[_I], + mm_hashes: list[str], + ) -> list[_O]: + """ + Possibly update a sequence of multi-modal items based on whether they + are in the underlying cache. + + This update is done out-of-place and updates the cache eviction order. + + Args: + mm_items: The multi-modal items to update. + mm_hashes: The hash of each item in `mm_items`. + + Returns: + A new list of updated multi-modal items. + """ + assert len(mm_items) == len(mm_hashes) + + return [ + self.get_and_update_item(mm_item, mm_hash) + for mm_item, mm_hash in zip(mm_items, mm_hashes) + ] + + @abstractmethod + def clear_cache(self) -> None: + """Clear the underlying cache.""" + raise NotImplementedError + + +MultiModalProcessorCacheInItem: TypeAlias = \ + Optional[tuple[MultiModalKwargsItem, Sequence["ResolvedPromptUpdate"]]] + + +MultiModalProcessorCacheOutItem: TypeAlias = \ + tuple[Optional[MultiModalKwargsItem], Sequence["ResolvedPromptUpdate"]] + + +class BaseMultiModalProcessorCache( + BaseMultiModalCache[MultiModalProcessorCacheInItem, + MultiModalProcessorCacheOutItem]): + """The required interface for caches on P0.""" + + @abstractmethod + def is_cached_item(self, mm_hash: str) -> bool: + """ + Check whether a multi-modal item is + in the underlying cache. + + This **DOES NOT** update the cache eviction order. + + Args: + mm_hash: The hash of the item to check. + + Returns: + `True` if the item is cached, otherwise `False`. + """ + raise NotImplementedError + + def is_cached(self, mm_hashes: list[str]) -> list[bool]: + """ + Check whether a sequence of multi-modal items are + in the underlying cache. + + This **DOES NOT** update the cache eviction order. + + Args: + mm_hashes: The hash of each item to check. + + Returns: + For each item, `True` if the item is cached, otherwise `False`. + """ + return [self.is_cached_item(mm_hash) for mm_hash in mm_hashes] + + +class MultiModalProcessorOnlyCache(BaseMultiModalProcessorCache): + """ + The cache which is used on P0 when IPC caching is disabled. + + How to update each item: + + - If the item is in the cache, replace the input with the cached item. + - If the item is not in the cache, store that item (which includes + tensor data and metadata) into the cache, and return the input. + """ + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + mm_config = model_config.get_multimodal_config() + + self._cache = MultiModalCache.get_lru_cache( + mm_config.mm_processor_cache_gb, + MultiModalProcessorCacheItem, + ) + + @override + def is_cached_item(self, mm_hash: str) -> bool: + return mm_hash in self._cache + + @override + def get_and_update_item( + self, + mm_item: MultiModalProcessorCacheInItem, + mm_hash: str, + ) -> MultiModalProcessorCacheOutItem: + if (cached_item := self._cache.get(mm_hash)) is not None: + return cached_item.item, cached_item.prompt_updates + + assert mm_item is not None, f"Expected a cached item for {mm_hash=}" + + self._cache[mm_hash] = MultiModalProcessorCacheItem(*mm_item) + + return mm_item + + @override + def clear_cache(self) -> None: + self._cache.clear() + + +class MultiModalProcessorSenderCache(BaseMultiModalProcessorCache): + """ + The cache which is used on P0 when IPC caching is enabled. + + How to update each item: + + - If the item is already in the cache, clear the input to avoid + unnecessary IPC. 
+ + - If the item is not in the cache, store the metadata of that item so + that the eviction policy remains the same as the cache on P1, + and return the input. + By only storing the metadata, we avoid keeping the data itself in + memory inside P0. + """ + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + mm_config = model_config.get_multimodal_config() + + self._cache = MultiModalCache.get_lru_cache( + mm_config.mm_processor_cache_gb, + MultiModalProcessorCacheItemMetadata, + ) + + @override + def is_cached_item(self, mm_hash: str) -> bool: + return mm_hash in self._cache + + @override + def get_and_update_item( + self, + mm_item: MultiModalProcessorCacheInItem, + mm_hash: str, + ) -> MultiModalProcessorCacheOutItem: + if (cached_item := self._cache.get(mm_hash)) is not None: + return None, cached_item.prompt_updates + + assert mm_item is not None, f"Expected a cached item for {mm_hash=}" + + self._cache[mm_hash] = MultiModalProcessorCacheItemMetadata(*mm_item) + + return mm_item + + @override + def clear_cache(self) -> None: + self._cache.clear() + + +def _enable_processor_cache( + model_config: "ModelConfig", + mm_registry: "MultiModalRegistry", +) -> bool: + if not mm_registry.supports_multimodal_inputs(model_config): + return False + + mm_config = model_config.get_multimodal_config() + return mm_config.mm_processor_cache_gb > 0 + + +def _enable_ipc_cache(vllm_config: "VllmConfig") -> bool: + parallel_config = vllm_config.parallel_config + supports_ipc_cache = (parallel_config.data_parallel_size == 1 + or parallel_config.data_parallel_external_lb) + + return supports_ipc_cache + + +def processor_cache_from_config( + vllm_config: "VllmConfig", + mm_registry: "MultiModalRegistry", +) -> Optional[BaseMultiModalProcessorCache]: + """Return a `BaseMultiModalProcessorCache`, if enabled.""" + model_config = vllm_config.model_config + + if not _enable_processor_cache(model_config, mm_registry): + return None + + if not _enable_ipc_cache(vllm_config): + return MultiModalProcessorOnlyCache(model_config) + + return MultiModalProcessorSenderCache(model_config) + + +def processor_only_cache_from_config( + model_config: "ModelConfig", + mm_registry: "MultiModalRegistry", +): + """Return a `MultiModalProcessorOnlyCache`, if enabled.""" + if not _enable_processor_cache(model_config, mm_registry): + return None + + return MultiModalProcessorOnlyCache(model_config) + + +class BaseMultiModalReceiverCache( + BaseMultiModalCache[Optional[MultiModalKwargsItem], + MultiModalKwargsItem]): + """The required interface for caches on P1.""" + + +class MultiModalReceiverCache(BaseMultiModalReceiverCache): + """ + The cache which is used on P1 when IPC caching is enabled. + + How to update each item: + + - If the item is in the cache, replace the input with the cached item. + - If the item is not in the cache, store that item (which includes tensor + data) into the cache, and return the input. 
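+
+    A minimal usage sketch for the P1 (engine core) side, assuming the
+    hashes were computed on P0 and that items P0 already knows are cached
+    here arrive as `None` so their tensor data is not re-sent
+    (`maybe_items`, `mm_hashes` and `model_config` are illustrative names):
+
+    ```
+    cache = MultiModalReceiverCache(model_config)
+    # `maybe_items` is aligned with `mm_hashes`; entries that P0 skipped
+    # sending (already cached on P1) are None and get filled from the cache.
+    full_items = cache.get_and_update(maybe_items, mm_hashes)
+    ```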
+ """ + + def __init__(self, model_config: "ModelConfig") -> None: + super().__init__() + + mm_config = model_config.get_multimodal_config() + + self._cache = MultiModalCache.get_lru_cache( + mm_config.mm_processor_cache_gb, + MultiModalKwargsItem, + ) + + @override + def get_and_update_item( + self, + mm_item: Optional[MultiModalKwargsItem], + mm_hash: str, + ) -> MultiModalKwargsItem: + if (cached_item := self._cache.get(mm_hash)) is not None: + return cached_item + + assert mm_item is not None, f"Expected a cached item for {mm_hash=}" + + self._cache[mm_hash] = mm_item + return mm_item + + @override + def clear_cache(self) -> None: + self._cache.clear() + + +def receiver_cache_from_config( + vllm_config: "VllmConfig", + mm_registry: "MultiModalRegistry", +) -> Optional[BaseMultiModalReceiverCache]: + """Return a `BaseMultiModalReceiverCache`, if enabled.""" + model_config = vllm_config.model_config + + if not _enable_processor_cache(model_config, mm_registry): + return None + + if not _enable_ipc_cache(vllm_config): + return None + + return MultiModalReceiverCache(model_config) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 581f9a109cce..2c0ebaced67e 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -7,11 +7,11 @@ from dataclasses import dataclass from functools import partial from itertools import accumulate -from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, TypeVar, - Union, cast, final) +from typing import (TYPE_CHECKING, Any, Literal, Optional, TypedDict, Union, + cast, final) import numpy as np -from typing_extensions import NotRequired, TypeAlias, deprecated +from typing_extensions import NotRequired, TypeAlias, TypeVar, deprecated from vllm.utils import LazyLoader, full_groupby, is_list_of from vllm.utils.jsontree import JSONTree, json_map_leaves @@ -668,7 +668,15 @@ def get_data(self) -> dict[str, NestedTensors]: return {key: elem.data for key, elem in self.items()} -class MultiModalKwargsItems(UserDict[str, Sequence[MultiModalKwargsItem]]): +_I = TypeVar( + "_I", + MultiModalKwargsItem, + Optional[MultiModalKwargsItem], + default=MultiModalKwargsItem, +) + + +class MultiModalKwargsItems(UserDict[str, Sequence[_I]]): """ A dictionary of [`MultiModalKwargsItem`][vllm.multimodal.inputs.MultiModalKwargsItem]s @@ -714,27 +722,37 @@ def from_seq(items: Sequence[MultiModalKwargsItem]): items_by_modality = full_groupby(items, key=lambda x: x.modality) return MultiModalKwargsItems(items_by_modality) - def __getitem__(self, modality: str): + def __getitem__(self, modality: str) -> Sequence[_I]: if modality not in self: raise KeyError(f"Modality {modality!r} not found. 
" f"Available modalities: {set(self.keys())}") - return super().__getitem__(modality) + return super().__getitem__(modality) # type: ignore[return-value] def get_data(self, *, pin_memory: bool = False) -> "MultiModalKwargs": elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) - for items in self.values(): - for item in items: + for modality, items in self.items(): + for i, item in enumerate(items): + if item is None: + raise RuntimeError("Cannot build data from empty " + f"mm_items[{modality}][{i}]") + for key, elem in item.items(): elems_by_key[key].append(elem) return MultiModalKwargs({ key: elems[0].field.reduce_data(elems, pin_memory=pin_memory) - for key, elems in elems_by_key.items() if len(elems) > 0 + for key, elems in elems_by_key.items() }) +MultiModalKwargsOptionalItems: TypeAlias = Union[ + MultiModalKwargsItems[MultiModalKwargsItem], + MultiModalKwargsItems[Optional[MultiModalKwargsItem]], +] + + class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to @@ -898,7 +916,7 @@ class MultiModalInputs(TypedDict): token_type_ids: NotRequired[list[int]] """The token type IDs of the prompt.""" - mm_kwargs: MultiModalKwargsItems + mm_kwargs: MultiModalKwargsOptionalItems """Keyword arguments to be directly passed to the model after batching.""" mm_hashes: "MultiModalHashDict" diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 8c225e2a3c08..6ecdf80d4aa6 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -4,7 +4,7 @@ from collections import defaultdict from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, Sequence) -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from enum import Enum from functools import lru_cache from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, @@ -20,12 +20,11 @@ encode_tokens) from vllm.utils import flatten_2d_lists, full_groupby -from .cache import MultiModalCache from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, MultiModalFieldConfig, MultiModalInputs, MultiModalKwargsItem, MultiModalKwargsItems, - PlaceholderRange) + MultiModalKwargsOptionalItems, PlaceholderRange) from .parse import (DictEmbeddingItems, EmbeddingItems, MultiModalDataItems, MultiModalDataParser) @@ -34,6 +33,7 @@ from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin + from .cache import BaseMultiModalProcessorCache from .profiling import BaseDummyInputsBuilder logger = init_logger(__name__) @@ -557,6 +557,15 @@ def iter_matches( return self.iter_token_matches(prompt, tokenizer, start_idx=start_idx) + def with_target(self, target: UpdateTarget): + return replace(self, target=target) + + def with_content(self, content: PromptUpdateInfo): + if not isinstance(content, PromptUpdateDetails): + content = PromptUpdateDetails.from_seq(content) + + return replace(self, content=content) + class _TokenMatch(NamedTuple): start_idx: int @@ -865,21 +874,6 @@ def find_mm_placeholders( return dict(full_groupby_modality(it)) -class ProcessingCache(MultiModalCache): - - def __init__(self, capacity_gb: float) -> None: - super().__init__() - - self._cache = self.get_lru_cache(capacity_gb, MultiModalKwargsItem) - - self.get = self._cache.get - self.put = self._cache.put - self.reset = self._cache.clear - - -_CacheItemOrHash = Union[MultiModalKwargsItem, str] - - class 
BaseProcessingInfo: """Base class to provide the information necessary for data processing.""" @@ -982,7 +976,7 @@ def get_mm_max_tokens_per_item( class MultiModalProcessingInfo(NamedTuple): - kwargs: MultiModalKwargsItems + kwargs: MultiModalKwargsOptionalItems hashes: MultiModalHashes prompt_updates: MultiModalPromptUpdates @@ -994,11 +988,13 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): Not to be confused with `transformers.ProcessorMixin`. """ - def __init__(self, - info: _I, - dummy_inputs: "BaseDummyInputsBuilder[_I]", - *, - cache: Optional[ProcessingCache] = None) -> None: + def __init__( + self, + info: _I, + dummy_inputs: "BaseDummyInputsBuilder[_I]", + *, + cache: Optional["BaseMultiModalProcessorCache"] = None, + ) -> None: super().__init__() self.info = info @@ -1355,32 +1351,6 @@ def _apply_hf_processor_main( return prompt_ids, mm_processed_data, False - def _get_cache_missing_items( - self, - cache: ProcessingCache, - mm_data_items: MultiModalDataItems, - mm_hashes: MultiModalHashes, - ) -> tuple[dict[str, list[_CacheItemOrHash]], MultiModalDataItems]: - mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]] = { - modality: [(h if (v := cache.get(h)) is None else v) - for h in hashes] - for modality, hashes in mm_hashes.items() - } - - mm_missing_idxs = { - modality: [ - idx for idx, item_or_hash in enumerate(items_or_hashes) - if isinstance(item_or_hash, str) - ] - for modality, items_or_hashes in mm_cache_items_or_hashes.items() - } - mm_missing_data = { - modality: [mm_data_items[modality][idx] for idx in idxs] - for modality, idxs in mm_missing_idxs.items() - } - - return mm_cache_items_or_hashes, self._to_mm_items(mm_missing_data) - def _hash_mm_items( self, mm_items: MultiModalDataItems, @@ -1401,28 +1371,92 @@ def _hash_mm_items( for modality, items in mm_items.items() } + def _get_cache_missing_items( + self, + cache: "BaseMultiModalProcessorCache", + mm_data_items: MultiModalDataItems, + mm_hashes: MultiModalHashes, + ) -> MultiModalDataItems: + mm_is_cached = { + modality: cache.is_cached(hashes) + for modality, hashes in mm_hashes.items() + } + + mm_missing_idxs = { + modality: [ + idx for idx, item_is_cached in enumerate(items_is_cached) + if not item_is_cached + ] + for modality, items_is_cached in mm_is_cached.items() + } + mm_missing_data = { + modality: [mm_data_items[modality][idx] for idx in idxs] + for modality, idxs in mm_missing_idxs.items() + } + + return self._to_mm_items(mm_missing_data) + + def _recompute_cached_prompt_update( + self, + cached_update: ResolvedPromptUpdate, + new_item_idx: int, + ) -> ResolvedPromptUpdate: + """ + Override this if other attributes of `ResolvedPromptUpdate` + also need to be recomputed after retrieving from the cache. 
+ """ + return replace(cached_update, item_idx=new_item_idx) + def _merge_mm_kwargs( self, - cache: ProcessingCache, - mm_cache_items_or_hashes: dict[str, list[_CacheItemOrHash]], + cache: "BaseMultiModalProcessorCache", + mm_hashes: MultiModalHashes, mm_missing_kwargs: MultiModalKwargsItems, - ) -> MultiModalKwargsItems: + mm_missing_prompt_updates: MultiModalPromptUpdates, + ) -> tuple[MultiModalKwargsOptionalItems, MultiModalPromptUpdates]: + # Need to calculate this at the beginning to avoid skipping cache logic + # for subsequently repeated items in the same modality + mm_is_cached = { + modality: cache.is_cached(hashes) + for modality, hashes in mm_hashes.items() + } + mm_missing_next_idx = defaultdict[str, int](lambda: 0) - merged_items = defaultdict[str, list[MultiModalKwargsItem]](list) - for modality, items_or_hashes in mm_cache_items_or_hashes.items(): - for item_or_hash in items_or_hashes: - if isinstance(item_or_hash, str): - kw_item = mm_missing_kwargs[modality][ - mm_missing_next_idx[modality]] - cache.put(item_or_hash, kw_item) + merged_kwargs = defaultdict[str, + list[Optional[MultiModalKwargsItem]]](list) + merged_prompt_updates = defaultdict[ + str, list[Sequence[ResolvedPromptUpdate]]](list) + for modality, hashes in mm_hashes.items(): + missing_kwargs = mm_missing_kwargs.get(modality, []) + missing_prompt_updates = mm_missing_prompt_updates.get( + modality, []) + + for item_idx, item_hash in enumerate(hashes): + kwargs: Optional[MultiModalKwargsItem] + if not mm_is_cached[modality][item_idx]: + missing_next_idx = mm_missing_next_idx[modality] + kwargs = missing_kwargs[missing_next_idx] + updates = missing_prompt_updates[missing_next_idx] + mm_missing_next_idx[modality] += 1 + + item = kwargs, updates else: - kw_item = item_or_hash + item = None + + kwargs, updates = cache.get_and_update_item(item, item_hash) + + merged_kwargs[modality].append(kwargs) + merged_prompt_updates[modality].append([ + self._recompute_cached_prompt_update(update, item_idx) + for update in updates + ]) - merged_items[modality].append(kw_item) + mm_kwargs = MultiModalKwargsItems(merged_kwargs) + mm_prompt_updates = dict(merged_prompt_updates) - return MultiModalKwargsItems(merged_items) + return mm_kwargs, mm_prompt_updates def _apply_hf_processor( self, @@ -1490,10 +1524,8 @@ def _cached_apply_hf_processor( mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs, tokenization_kwargs) - ( - mm_cache_items_or_hashes, - mm_missing_data_items, - ) = self._get_cache_missing_items( + + mm_missing_data_items = self._get_cache_missing_items( cache=cache, mm_data_items=mm_data_items, mm_hashes=mm_hashes, @@ -1520,16 +1552,17 @@ def _cached_apply_hf_processor( hf_processor_mm_kwargs), ) - mm_kwargs = self._merge_mm_kwargs( - cache, - mm_cache_items_or_hashes=mm_cache_items_or_hashes, - mm_missing_kwargs=mm_missing_kwargs, + mm_missing_prompt_updates = self._get_mm_prompt_updates( + mm_missing_data_items, + hf_processor_mm_kwargs, + mm_missing_kwargs, ) - mm_prompt_updates = self._get_mm_prompt_updates( - mm_data_items, - hf_processor_mm_kwargs, - mm_kwargs, + mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs( + cache, + mm_hashes=mm_hashes, + mm_missing_kwargs=mm_missing_kwargs, + mm_missing_prompt_updates=mm_missing_prompt_updates, ) mm_info = MultiModalProcessingInfo( @@ -1614,7 +1647,7 @@ def _apply_prompt_updates( def _validate_mm_kwargs( self, - mm_kwargs: MultiModalKwargsItems, + mm_kwargs: MultiModalKwargsOptionalItems, mm_item_counts: Mapping[str, int], ) -> None: for modality, 
item_count in mm_item_counts.items(): @@ -1655,7 +1688,7 @@ def _maybe_apply_prompt_updates( self, mm_items: MultiModalDataItems, prompt_ids: list[int], - mm_kwargs: MultiModalKwargsItems, + mm_kwargs: MultiModalKwargsOptionalItems, mm_prompt_updates: MultiModalPromptUpdates, is_update_applied: bool, ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index ea2efbdd8b52..ffc69a2db60a 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -13,7 +13,7 @@ from vllm.logger import init_logger from .inputs import (MultiModalDataDict, MultiModalEncDecInputs, - MultiModalInputs, MultiModalKwargsItems, + MultiModalInputs, MultiModalKwargsOptionalItems, MultiModalPlaceholderDict) from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, EncDecMultiModalProcessor) @@ -43,7 +43,7 @@ class DummyDecoderData(NamedTuple): """Dummy data used for profiling.""" prompt_token_ids: list[int] - multi_modal_data: MultiModalKwargsItems + multi_modal_data: MultiModalKwargsOptionalItems multi_modal_placeholders: MultiModalPlaceholderDict diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 8cd9e5604872..38adbf8f3536 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Mapping from dataclasses import dataclass -from functools import lru_cache from typing import TYPE_CHECKING, Generic, Optional, Protocol, TypeVar import torch.nn as nn @@ -13,8 +12,9 @@ cached_tokenizer_from_config) from vllm.utils import ClassRegistry -from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, - ProcessingCache) +from .cache import (BaseMultiModalProcessorCache, + processor_only_cache_from_config) +from .processing import BaseMultiModalProcessor, BaseProcessingInfo from .profiling import (BaseDummyInputsBuilder, DummyDecoderData, DummyEncoderData, MultiModalProfiler) @@ -65,7 +65,7 @@ def __call__( info: _I, dummy_inputs: BaseDummyInputsBuilder[_I], *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor[_I]: ... @@ -80,20 +80,13 @@ def build_processor( self, ctx: InputProcessingContext, *, - cache: Optional[ProcessingCache] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ): info = self.info(ctx) dummy_inputs_builder = self.dummy_inputs(info) return self.processor(info, dummy_inputs_builder, cache=cache) -# Make sure a different cache is used for each model config -# NOTE: ModelConfig is not hashable so it cannot be passed directly -@lru_cache(maxsize=1) -def _get_processor_cache(model_id: str, capacity_gb: int): - return ProcessingCache(capacity_gb) if capacity_gb > 0 else None - - class MultiModalRegistry: """ A registry that dispatches data processing according to the model. 
@@ -103,31 +96,6 @@ def __init__(self) -> None: self._processor_factories = ClassRegistry[nn.Module, _ProcessorFactories]() - def _get_processor_cache(self, model_config: "ModelConfig"): - model_id = model_config.model - capacity_gb = model_config.mm_processor_cache_gb - return _get_processor_cache(model_id, capacity_gb) - - def reset_processor_cache(self, model_config: "ModelConfig") -> bool: - """Reset the multi-modal processing cache.""" - if processor_cache := self._get_processor_cache(model_config): - processor_cache.reset() - - return True # Success - - def enable_mm_input_cache(self, model_config: "ModelConfig") -> bool: - """Whether the multi-modal input cache should be enabled. - NOTE: This is put under MultiModalRegistry on purpose to respect - text-only mode for multimodal models. - """ - - if not self.supports_multimodal_inputs(model_config): - return False - - mm_config = model_config.get_multimodal_config() - - return mm_config.mm_processor_cache_gb > 0 - def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: """ Checks if the model supports multimodal inputs. @@ -157,6 +125,8 @@ def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: def get_max_tokens_per_item_by_modality( self, model_config: "ModelConfig", + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> Mapping[str, int]: """ Get the maximum number of tokens per data item from each modality based @@ -165,11 +135,11 @@ def get_max_tokens_per_item_by_modality( if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) seq_len = model_config.max_model_len - mm_limits = self.get_mm_limits_per_prompt(model_config) + mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) return profiler.get_mm_max_contiguous_tokens( seq_len, @@ -182,6 +152,8 @@ def get_max_tokens_per_item_by_modality( def get_max_tokens_per_item_by_nonzero_modality( self, model_config: "ModelConfig", + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> Mapping[str, int]: """ Get the maximum number of tokens per data item from each modality based @@ -192,15 +164,19 @@ def get_max_tokens_per_item_by_nonzero_modality( This is currently directly used only in V1 for profiling the memory usage of a model. """ - mm_limits = self.get_mm_limits_per_prompt(model_config) + mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) + max_tokens_per_item = self.get_max_tokens_per_item_by_modality( + model_config, + cache=cache, + ) return { key: max_tokens_per_mm_item - for key, max_tokens_per_mm_item in - self.get_max_tokens_per_item_by_modality(model_config).items() + for key, max_tokens_per_mm_item in max_tokens_per_item.items() if mm_limits[key] > 0 } + # TODO: Remove once V0 is gone def get_max_tokens_by_modality( self, model_config: "ModelConfig", @@ -209,14 +185,19 @@ def get_max_tokens_by_modality( Get the maximum number of tokens from each modality for profiling the memory usage of a model. 
""" - mm_limits = self.get_mm_limits_per_prompt(model_config) + cache = processor_only_cache_from_config(model_config, self) + mm_limits = self.get_mm_limits_per_prompt(model_config, cache=cache) + max_tokens_per_item = self.get_max_tokens_per_item_by_modality( + model_config, + cache=cache, + ) return { key: mm_limits[key] * max_tokens_per_mm_item - for key, max_tokens_per_mm_item in - self.get_max_tokens_per_item_by_modality(model_config).items() + for key, max_tokens_per_mm_item in max_tokens_per_item.items() } + # TODO: Remove once V0 is gone def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: """ Get the maximum number of multi-modal tokens @@ -227,6 +208,8 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: def get_mm_limits_per_prompt( self, model_config: "ModelConfig", + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> Mapping[str, int]: """ Get the maximum number of multi-modal input instances for each modality @@ -235,7 +218,7 @@ def get_mm_limits_per_prompt( if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) return profiler.get_mm_limits() @@ -303,7 +286,7 @@ def create_processor( model_config: "ModelConfig", *, tokenizer: Optional[AnyTokenizer] = None, - disable_cache: Optional[bool] = None, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> BaseMultiModalProcessor[BaseProcessingInfo]: """ Create a multi-modal processor for a specific model and tokenizer. @@ -311,15 +294,10 @@ def create_processor( if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") - if disable_cache is None: - disable_cache = not model_config.enable_mm_processor_cache - model_cls = self._get_model_cls(model_config) factories = self._processor_factories[model_cls] ctx = self._create_processing_ctx(model_config, tokenizer) - cache = None if disable_cache else self._get_processor_cache( - model_config) return factories.build_processor(ctx, cache=cache) @@ -328,13 +306,15 @@ def get_decoder_dummy_data( model_config: "ModelConfig", seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> DummyDecoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by ``model_config``. """ - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts) @@ -352,13 +332,15 @@ def get_encoder_dummy_data( model_config: "ModelConfig", seq_len: int, mm_counts: Optional[Mapping[str, int]] = None, + *, + cache: Optional[BaseMultiModalProcessorCache] = None, ) -> DummyEncoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by ``model_config``. 
""" - processor = self.create_processor(model_config, disable_cache=False) + processor = self.create_processor(model_config, cache=cache) profiler = MultiModalProfiler(processor) dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 342d7b24f8e9..dbea0b610b31 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -597,8 +597,7 @@ async def stop_profile(self) -> None: await asyncio.gather(*coros) async def reset_mm_cache(self) -> None: - self.processor.mm_registry.reset_processor_cache(self.model_config) - self.processor.mm_input_cache_client.reset() + self.processor.clear_cache() await self.engine_core.reset_mm_cache_async() async def reset_prefix_cache(self, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 32765cda6482..b61482806184 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -22,6 +22,7 @@ from vllm.logging_utils.dump_input import dump_engine_exception from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.cache import receiver_cache_from_config from vllm.tasks import POOLING_TASKS, SupportedTask from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) @@ -38,7 +39,6 @@ EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, UtilityOutput, UtilityResult) -from vllm.v1.engine.mm_input_cache import MultiModalInputCacheServer from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses from vllm.v1.executor.abstract import Executor from vllm.v1.kv_cache_interface import KVCacheConfig @@ -128,8 +128,9 @@ def __init__(self, ) self.use_spec_decode = vllm_config.speculative_config is not None - self.mm_input_cache_server = MultiModalInputCacheServer( - vllm_config.model_config, MULTIMODAL_REGISTRY) + self.mm_registry = mm_registry = MULTIMODAL_REGISTRY + self.mm_receiver_cache = receiver_cache_from_config( + vllm_config, mm_registry) # Setup batch queue for pipeline parallelism. # Batch queue for scheduled batches. This enables us to asynchronously @@ -370,7 +371,8 @@ def reset_mm_cache(self): logger.warning("Resetting the multi-modal cache when requests are " "in progress may lead to desynced internal caches.") - self.mm_input_cache_server.reset() + if self.mm_receiver_cache is not None: + self.mm_receiver_cache.clear_cache() def reset_prefix_cache(self): self.scheduler.reset_prefix_cache() @@ -435,10 +437,11 @@ def preprocess_add_request( assert request.mm_kwargs is not None # Note on thread safety: no race condition. - # `mm_input_cache_server` is reset at the end of LLMEngine init, + # `mm_receiver_cache` is reset at the end of LLMEngine init, # and will only accessed in the input processing thread afterwards. 
- request.mm_kwargs = self.mm_input_cache_server.get_and_update( - request.mm_kwargs, request.mm_hashes) + if self.mm_receiver_cache is not None: + request.mm_kwargs = self.mm_receiver_cache.get_and_update( + request.mm_kwargs, request.mm_hashes) req = Request.from_engine_core_request(request, self.request_block_hasher) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 5a00a930951c..7130f666ef19 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -271,8 +271,7 @@ def stop_profile(self): self.engine_core.profile(False) def reset_mm_cache(self): - self.processor.mm_registry.reset_processor_cache(self.model_config) - self.processor.mm_input_cache_client.reset() + self.processor.clear_cache() self.engine_core.reset_mm_cache() def reset_prefix_cache(self, device: Optional[Device] = None): diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py deleted file mode 100644 index aa7dc62fd4ac..000000000000 --- a/vllm/v1/engine/mm_input_cache.py +++ /dev/null @@ -1,121 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Sequence -from typing import TYPE_CHECKING, Optional - -from vllm.multimodal import MultiModalRegistry -from vllm.multimodal.cache import MultiModalCache, MultiModalCacheItemMetadata -from vllm.multimodal.inputs import MultiModalKwargsItem -from vllm.utils import is_list_of - -if TYPE_CHECKING: - from vllm.config import ModelConfig - -# The idea of multimodal input caching is based on having a client and -# a server, where the client executes in the frontend process (=P0) and the -# server in the core process (=P1). -# -# -- P0: -# - BaseMultiModalProcessor calls MultiModalHasher to get the `mm_hash` of -# each input multi-modal item (e.g. image), -# - BaseMultiModalProcessor processes the input items into `mm_kwargs`, -# which are MultiModalKwargsItem instances that each correspond to an -# input multi-modal item. -# - MultiModalInputCacheClient accepts the `mm_kwargs` and corresponding -# `mm_hash` for each item. It stores the `mm_hash` as keys and the size -# of `mm_kwargs`, but not the `mm_kwargs` themselves, to avoid taking -# up additional memory in P0. -# - The `mm_hash` is always sent to P1. -# - The corresponding `mm_kwargs` are only sent to P1 if they are not cached -# in MultiModalInputCacheServer. -# -# -- P1: -# - If the `mm_hash` is cached (i.e. `mm_kwargs` are not sent from P0), -# MultiModalInputCacheServer retrieves the corresponding `mm_kwargs`. -# - If the `mm_hash` is not cached (i.e. `mm_kwargs` are sent from P0), -# MultiModalInputCacheServer stores `mm_kwargs` under the key `mm_hash`. -# - Either way, the `mm_hash` and corresponding `mm_kwargs` are sent to -# the engine for model execution. -# -# Both Client and Server must perform cache update and eviction based on the -# same item size. This ensures that the keys of MultiModalInputCacheClient -# and MultiModalInputCacheServer are mirrored, allowing us to determine in P0 -# whether a key is cached in MultiModalInputCacheServer by querying -# MultiModalInputCacheClient without having to communicate with P1. 
- - -class MultiModalInputCacheClient: - """Used by P0 to check whether multi-modal kwargs are cached in P1.""" - - def __init__(self, model_config: "ModelConfig", - mm_registry: MultiModalRegistry) -> None: - super().__init__() - - self.enabled = mm_registry.enable_mm_input_cache(model_config) - self.mm_cache = MultiModalCache.get_lru_cache( - model_config.get_mm_input_cache_gb(), - MultiModalCacheItemMetadata, - ) - - def get_and_update( - self, - mm_kwargs: Sequence[MultiModalKwargsItem], - mm_hashes: list[str], - ) -> list[Optional[MultiModalKwargsItem]]: - if not self.enabled: - return list(mm_kwargs) - - assert len(mm_kwargs) == len(mm_hashes) - - out_mm_items = list[Optional[MultiModalKwargsItem]]() - for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): - if self.mm_cache.get(mm_hash) is not None: - out_mm_items.append(None) - else: - self.mm_cache[mm_hash] = \ - MultiModalCacheItemMetadata.wraps(mm_item) - out_mm_items.append(mm_item) - - return out_mm_items - - def reset(self) -> None: - self.mm_cache.clear() - - -class MultiModalInputCacheServer: - """Used by P1 to avoid requiring past multi-modal kwargs from P0.""" - - def __init__(self, model_config: "ModelConfig", - mm_registry: MultiModalRegistry) -> None: - super().__init__() - - self.enabled = mm_registry.enable_mm_input_cache(model_config) - self.mm_cache = MultiModalCache.get_lru_cache( - model_config.get_mm_input_cache_gb(), - MultiModalKwargsItem, - ) - - def get_and_update( - self, - mm_kwargs: Sequence[Optional[MultiModalKwargsItem]], - mm_hashes: list[str], - ) -> list[MultiModalKwargsItem]: - if not self.enabled: - mm_kwargs_lst = list(mm_kwargs) - assert is_list_of(mm_kwargs_lst, MultiModalKwargsItem) - return mm_kwargs_lst - - assert len(mm_kwargs) == len(mm_hashes) - - out_mm_items = list[MultiModalKwargsItem]() - for mm_item, mm_hash in zip(mm_kwargs, mm_hashes): - if mm_item is None: - out_mm_items.append(self.mm_cache[mm_hash]) - else: - self.mm_cache[mm_hash] = mm_item - out_mm_items.append(mm_item) - - return out_mm_items - - def reset(self) -> None: - self.mm_cache.clear() diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 300b0713b2ff..7ed60156626b 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -11,6 +11,7 @@ from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.cache import processor_cache_from_config from vllm.multimodal.inputs import MultiModalKwargsItem, PlaceholderRange from vllm.multimodal.processing import EncDecMultiModalProcessor from vllm.multimodal.utils import argsort_mm_positions @@ -18,7 +19,6 @@ from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient from vllm.v1.structured_output.backend_guidance import ( validate_guidance_grammar) from vllm.v1.structured_output.backend_lm_format_enforcer import ( @@ -47,16 +47,17 @@ def __init__( self.generation_config_fields = ( self.model_config.try_get_generation_config()) - self.input_preprocessor = InputPreprocessor(self.model_config, - self.tokenizer, - mm_registry) - self.mm_input_cache_client = MultiModalInputCacheClient( - self.model_config, mm_registry) + self.mm_registry = mm_registry + self.mm_processor_cache = processor_cache_from_config( + vllm_config, mm_registry) - @property - def 
mm_registry(self): - return self.input_preprocessor.mm_registry + self.input_preprocessor = InputPreprocessor( + self.model_config, + self.tokenizer, + mm_registry, + mm_processor_cache=self.mm_processor_cache, + ) def _validate_logprobs( self, @@ -310,7 +311,7 @@ def process_inputs( # in the input sequence. sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions) - orig_sorted_mm_inputs = [ + sorted_mm_inputs = [ decoder_mm_inputs[modality][idx] for modality, idx in sorted_mm_idxs ] @@ -323,11 +324,6 @@ def process_inputs( for modality, idx in sorted_mm_idxs ] - sorted_mm_inputs = self.mm_input_cache_client.get_and_update( - orig_sorted_mm_inputs, - sorted_mm_hashes, - ) - return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, prompt_token_ids=decoder_inputs["prompt_token_ids"], @@ -415,3 +411,6 @@ def _validate_model_input( # TODO: Find out how many placeholder tokens are there so we can # check that chunked prefill does not truncate them # max_batch_len = self.scheduler_config.max_num_batched_tokens + + def clear_cache(self) -> None: + self.input_preprocessor.clear_cache() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f1ceaaae62a7..053aaf4f968e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2186,10 +2186,13 @@ def _get_mm_dummy_batch( max_items_per_batch: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" + assert self.mm_budget is not None + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, mm_counts={modality: 1}, + cache=self.mm_budget.cache, ) dummy_mm_data = dummy_decoder_data.multi_modal_data diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 4a485b7e077d..d36423660427 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -1813,10 +1813,13 @@ def _get_mm_dummy_batch( max_items_per_batch: int, ) -> BatchedTensorInputs: """Dummy data for profiling and precompiling multimodal models.""" + assert self.mm_budget is not None + dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( model_config=self.model_config, seq_len=self.max_num_tokens, mm_counts={modality: 1}, + cache=self.mm_budget.cache, ) dummy_mm_data = dummy_decoder_data.multi_modal_data diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index b96473e7b164..82ede5ad8eb1 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -10,6 +10,7 @@ from vllm.config import ModelConfig, SchedulerConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index +from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.registry import MultiModalRegistry from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget @@ -33,14 +34,18 @@ def __init__( self.model_config = model_config self.scheduler_config = scheduler_config self.mm_registry = mm_registry + self.cache = cache = processor_only_cache_from_config( + model_config, mm_registry) self.max_model_len = model_config.max_model_len self.max_num_reqs = scheduler_config.max_num_seqs - self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config) + self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, + cache=cache) max_tokens_by_modality = 
mm_registry \ - .get_max_tokens_per_item_by_nonzero_modality(model_config) + .get_max_tokens_per_item_by_nonzero_modality(model_config, + cache=cache) encoder_compute_budget, encoder_cache_size = compute_mm_encoder_budget( scheduler_config, From 64466778397482e0cb9ff9f6b320ca6d9dc567ae Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 27 Aug 2025 15:27:14 +0800 Subject: [PATCH 009/125] [XPU]fix cuda event used in XPU model runner (#23708) Signed-off-by: Kunshang Ji --- vllm/v1/worker/xpu_model_runner.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/xpu_model_runner.py b/vllm/v1/worker/xpu_model_runner.py index 59f8d0fcf5bd..fb892211f19d 100644 --- a/vllm/v1/worker/xpu_model_runner.py +++ b/vllm/v1/worker/xpu_model_runner.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from contextlib import contextmanager from typing import TYPE_CHECKING import torch @@ -22,7 +23,8 @@ def __init__( vllm_config: VllmConfig, device: torch.device, ): - super().__init__(vllm_config, device) + with _torch_cuda_wrapper(): + super().__init__(vllm_config, device) # FIXME: To be verified. self.cascade_attn_enabled = False @@ -31,3 +33,21 @@ def _init_device_properties(self) -> None: def _sync_device(self) -> None: torch.xpu.synchronize() + + +@contextmanager +def _torch_cuda_wrapper(): + + class _EventPlaceholder: + + def __init__(self, *args, **kwargs) -> None: + self.record = lambda: None + self.synchronize = lambda: None + + try: + # replace cuda Event with xpu Event, this should work by default + torch.cuda.Event = torch.xpu.Event + yield + finally: + # if anything goes wrong, just patch it with a placeholder + torch.cuda.Event = _EventPlaceholder From 91e382c935c2905c29f3ca22c658e03e8f02deaa Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 27 Aug 2025 16:11:15 +0800 Subject: [PATCH 010/125] [CI/Build] Remove redundant register in model init tests (#23715) Signed-off-by: DarkLight1337 --- tests/models/test_initialization.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index bbd3da982af8..b4d516233b4b 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -38,11 +38,6 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch, model_arch=model_arch, exist_overrides=model_info.hf_overrides) - if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"): - from vllm.model_executor.models.llama4 import Llama4ForCausalLM - from vllm.model_executor.models.registry import ModelRegistry - ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM) - # Avoid calling model.forward() def _initialize_kv_caches_v0(self) -> None: self.cache_config.num_gpu_blocks = 0 From 5bd9f841581a3a9e9eecdd8764240575bb28e391 Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Wed, 27 Aug 2025 17:50:09 +0800 Subject: [PATCH 011/125] [Docs] Fix an admonition important (#23726) Signed-off-by: windsonsea --- docs/configuration/optimization.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index 3eaf2185a559..a8eab9985c8b 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -164,7 +164,7 @@ llm = LLM( ) ``` -!! important +!!! 
important Batch-level DP is not to be confused with API request-level DP (which is instead controlled by `data_parallel_size`). From 6578e873655859462758c5c51e51f876f2aa24a3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 27 Aug 2025 02:52:45 -0700 Subject: [PATCH 012/125] Optimize input preparation for FlashInfer [2/N] (#23174) Signed-off-by: Woosuk Kwon --- vllm/v1/attention/backends/flashinfer.py | 80 ++++++++++++++++-------- 1 file changed, 54 insertions(+), 26 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 941d2a4d7f1a..f948157c2b57 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -6,6 +6,7 @@ from dataclasses import dataclass from typing import ClassVar, Optional, Union +import numpy as np import torch from flashinfer import (BatchDecodeWithPagedKVCacheWrapper, BatchPrefillWithPagedKVCacheWrapper, @@ -22,6 +23,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, kFp8StaticTensorSym, kNvfp4Quant) from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton from vllm.utils import cdiv, is_pin_memory_available from vllm.utils.flashinfer import (supports_trtllm_attention, use_trtllm_attention) @@ -230,6 +232,7 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], dtype=torch.int32, device="cpu", pin_memory=pin_memory) + self.paged_kv_indptr_np = self.paged_kv_indptr_cpu.numpy() self.paged_kv_indices_cpu = torch.zeros(max_num_pages, dtype=torch.int32, device="cpu", @@ -238,10 +241,8 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], dtype=torch.int32, device="cpu", pin_memory=pin_memory) - - self.block_table_arange = torch.arange(max_num_pages_per_req, - dtype=torch.int32, - device=self.device) + self.paged_kv_last_page_len_np = ( + self.paged_kv_last_page_len_cpu.numpy()) def _get_workspace_buffer(self): if self._workspace_buffer is None: @@ -317,9 +318,10 @@ def build(self, max_seq_len = common_attn_metadata.max_seq_len seq_lens = common_attn_metadata.seq_lens seq_lens_cpu = common_attn_metadata.seq_lens_cpu + seq_lens_np = seq_lens_cpu.numpy() block_table_tensor = common_attn_metadata.block_table_tensor - block_table_bounds_cpu = (seq_lens_cpu + page_size - 1) // page_size + num_blocks_np = (seq_lens_np + (page_size - 1)) // page_size use_cascade = common_prefix_len > 0 if use_cascade: @@ -342,37 +344,41 @@ def build(self, # Remove the blocks of the shared prefix from all requests. 
block_table_tensor = block_table_tensor[:, num_common_kv_blocks:] - block_table_bounds_cpu -= num_common_kv_blocks + num_blocks_np -= num_common_kv_blocks else: shared_qo_indptr_cpu = None shared_kv_page_indptr_cpu = None shared_kv_page_indices_cpu = None shared_kv_last_page_len_cpu = None - max_num_blocks = block_table_bounds_cpu.max().item() - block_table_bounds = block_table_bounds_cpu.to(self.device, - non_blocking=True) - mask = (self.block_table_arange[:max_num_blocks].unsqueeze(0) - < block_table_bounds.unsqueeze(1)) + # write self.paged_kv_indptr_cpu inplace (0-index is always 0) + np.cumsum( + num_blocks_np, + dtype=np.int32, + out=self.paged_kv_indptr_np[1:num_reqs + 1], + ) + paged_kv_indptr = self.paged_kv_indptr[:num_reqs + 1] + paged_kv_indptr.copy_(self.paged_kv_indptr_cpu[:num_reqs + 1], + non_blocking=True) + # write self.paged_kv_indices inplace - num_actual_pages = torch.sum(mask) + num_actual_pages = num_blocks_np.sum().item() paged_kv_indices = self.paged_kv_indices[:num_actual_pages] - torch.masked_select(block_table_tensor[:, :max_num_blocks], - mask, - out=paged_kv_indices) - - # write self.paged_kv_indptr_cpu inplace (0-index is always 0) - torch.cumsum(block_table_bounds_cpu, - dim=0, - dtype=torch.int32, - out=self.paged_kv_indptr_cpu[1:1 + num_reqs]) + _copy_page_indices_kernel[(num_reqs, )]( + paged_kv_indices, + block_table_tensor, + block_table_tensor.stride(0), + paged_kv_indptr, + BLOCK_SIZE=1024, + ) - paged_kv_last_page_len_cpu = seq_lens_cpu % page_size # write self.paged_kv_last_page_len_cpu inplace - torch.where(paged_kv_last_page_len_cpu == 0, - torch.tensor(page_size), - paged_kv_last_page_len_cpu, - out=self.paged_kv_last_page_len_cpu[:num_reqs]) + paged_kv_last_page_len_np = seq_lens_np % page_size + self.paged_kv_last_page_len_np[:num_reqs] = np.where( + paged_kv_last_page_len_np == 0, + page_size, + paged_kv_last_page_len_np, + ) # Check if any layer uses sinks (requires TRTLLM attention) has_sinks = self.global_hyperparameters.has_sinks @@ -1002,3 +1008,25 @@ def fast_plan_decode( self._sm_scale = sm_scale self._rope_scale = rope_scale self._rope_theta = rope_theta + + +@triton.jit +def _copy_page_indices_kernel( + page_indices, + block_table, + block_table_stride, + cu_num_blocks, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + row_ptr = block_table + req_idx * block_table_stride + start_idx = tl.load(cu_num_blocks + req_idx) + end_idx = tl.load(cu_num_blocks + req_idx + 1) + num_blocks = end_idx - start_idx + + offset = tl.arange(0, BLOCK_SIZE) + for i in tl.range(0, num_blocks, BLOCK_SIZE): + block_ids = tl.load(row_ptr + i + offset, mask=i + offset < num_blocks) + tl.store(page_indices + start_idx + i + offset, + block_ids, + mask=i + offset < num_blocks) From 04ff1e43fb6e2e675170d0c90399290f8925abb7 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 27 Aug 2025 03:25:00 -0700 Subject: [PATCH 013/125] [Misc] Move CpuGpuBuffer to vllm/v1/utils.py (#23728) Signed-off-by: Woosuk Kwon --- vllm/v1/utils.py | 29 +++++++++++++++++++++++++++++ vllm/v1/worker/cpu_model_runner.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 6 +++--- vllm/v1/worker/utils.py | 29 ----------------------------- 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index b5750c82db02..8f9face6fbf2 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -96,6 +96,35 @@ def __repr__(self): return f"ConstantList({self._x})" +class CpuGpuBuffer: + + def __init__( + self, + *args, + dtype: torch.dtype, + device: 
torch.device, + pin_memory: bool, + ): + self.cpu = torch.zeros(*args, + dtype=dtype, + device="cpu", + pin_memory=pin_memory) + self.np = self.cpu.numpy() + self.gpu = self.cpu.to(device) + + def copy_to_gpu(self, n: Optional[int] = None) -> torch.Tensor: + if n is None: + return self.gpu.copy_(self.cpu, non_blocking=True) + return self.gpu[:n].copy_(self.cpu[:n], non_blocking=True) + + def copy_to_cpu(self, n: Optional[int] = None) -> torch.Tensor: + """NOTE: Because this method is non-blocking, explicit synchronization + is needed to ensure the data is copied to CPU.""" + if n is None: + return self.cpu.copy_(self.gpu, non_blocking=True) + return self.cpu[:n].copy_(self.gpu[:n], non_blocking=True) + + def get_engine_client_zmq_addr(local_only: bool, host: str, port: int = 0) -> str: diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 137578f0e608..742e553b77e0 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -10,8 +10,8 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.v1.attention.backends.cpu_attn import TorchSDPAMetadataBuilderV1 +from vllm.v1.utils import CpuGpuBuffer from vllm.v1.worker.gpu_model_runner import GPUModelRunner -from vllm.v1.worker.utils import CpuGpuBuffer if TYPE_CHECKING: from vllm.v1.core.sched.output import SchedulerOutput diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 053aaf4f968e..d93460d618e7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -78,14 +78,14 @@ from vllm.v1.spec_decode.medusa import MedusaProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer +from vllm.v1.utils import CpuGpuBuffer from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.kv_connector_model_runner_mixin import ( KVConnectorModelRunnerMixin, KVConnectorOutput) from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin -from .utils import (AttentionGroup, CpuGpuBuffer, MultiModalBudget, - bind_kv_cache, gather_mm_placeholders, - initialize_kv_cache_for_kv_sharing, +from .utils import (AttentionGroup, MultiModalBudget, bind_kv_cache, + gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) if TYPE_CHECKING: diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 82ede5ad8eb1..f40753468766 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -303,32 +303,3 @@ def bind_kv_cache( for layer_name, kv_cache in kv_caches.items(): # NOTE: Use list because of v0 PP virtual engine. 
forward_context[layer_name].kv_cache = [kv_cache] - - -class CpuGpuBuffer: - - def __init__( - self, - *args, - dtype: torch.dtype, - device: torch.device, - pin_memory: bool, - ): - self.cpu = torch.zeros(*args, - dtype=dtype, - device="cpu", - pin_memory=pin_memory) - self.np = self.cpu.numpy() - self.gpu = self.cpu.to(device) - - def copy_to_gpu(self, n: Optional[int] = None) -> None: - if n is None: - return self.gpu.copy_(self.cpu, non_blocking=True) - return self.gpu[:n].copy_(self.cpu[:n], non_blocking=True) - - def copy_to_cpu(self, n: Optional[int] = None) -> None: - """NOTE: Because this method is non-blocking, explicit synchronization - is needed to ensure the data is copied to CPU.""" - if n is None: - return self.cpu.copy_(self.gpu, non_blocking=True) - return self.cpu[:n].copy_(self.gpu[:n], non_blocking=True) From 11eddf02f0234f79435d747f2d3dce117ab39aa1 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 27 Aug 2025 03:45:04 -0700 Subject: [PATCH 014/125] [FlashInfer] Cache hyper params in metadata builder (#23732) Signed-off-by: Woosuk Kwon --- vllm/v1/attention/backends/flashinfer.py | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index f948157c2b57..1115fc606b05 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -214,6 +214,10 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str], # TODO: discard this for trtllm-gen backend self.global_hyperparameters = infer_global_hyperparameters( get_per_layer_parameters(vllm_config, layer_names, FlashInferImpl)) + self.sm_scale = self.global_hyperparameters.sm_scale + self.window_left = self.global_hyperparameters.window_left + self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap + self.has_sinks = self.global_hyperparameters.has_sinks # Preparing persistent buffers (device-side) self.paged_kv_indptr = torch.zeros(max_num_reqs + 1, @@ -381,8 +385,6 @@ def build(self, ) # Check if any layer uses sinks (requires TRTLLM attention) - has_sinks = self.global_hyperparameters.has_sinks - prefill_use_trtllm = use_trtllm_attention(self.num_qo_heads, self.num_kv_heads, num_prefill_tokens, @@ -390,7 +392,7 @@ def build(self, self.cache_dtype, self.q_data_type, is_prefill=True, - has_sinks=has_sinks) + has_sinks=self.has_sinks) decode_use_trtllm = use_trtllm_attention(self.num_qo_heads, self.num_kv_heads, num_decode_tokens, @@ -398,7 +400,7 @@ def build(self, self.cache_dtype, self.q_data_type, is_prefill=False, - has_sinks=has_sinks) + has_sinks=self.has_sinks) attn_metadata = FlashInferMetadata( num_actual_tokens=num_actual_tokens, @@ -433,9 +435,9 @@ def build(self, self.head_dim, self.page_size, causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters.logits_soft_cap, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, q_data_type=self.q_data_type, kv_data_type=self.kv_cache_dtype, ) @@ -472,10 +474,9 @@ def build(self, self.head_dim, self.page_size, causal=True, - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. 
- logits_soft_cap, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, q_data_type=self.q_data_type, kv_data_type=self.kv_cache_dtype, ) @@ -525,10 +526,9 @@ def build(self, self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", - sm_scale=self.global_hyperparameters.sm_scale, - window_left=self.global_hyperparameters.window_left, - logits_soft_cap=self.global_hyperparameters. - logits_soft_cap, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, q_data_type=self.q_data_type, kv_data_type=self.kv_cache_dtype, ) From e03940762b43812fccd3c214bda60201cff9d16a Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 27 Aug 2025 18:59:35 +0800 Subject: [PATCH 015/125] [CI/Build] Reduce LoRA layer test cases (#23721) Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 72 ++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 92db023babc2..6e2dda464d8e 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -243,7 +243,7 @@ def check_punica_wrapper(punica_wrapper) -> bool: @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) @@ -347,7 +347,7 @@ def create_random_embedding_layer(): @torch.inference_mode() # @pytest.mark.skip( # reason="Fails when loras are in any slot other than the first.") -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) @@ -486,7 +486,7 @@ def create_random_embedding_layer(): @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512]) @pytest.mark.parametrize("stage", STAGES) @@ -620,12 +620,15 @@ def _pretest(): @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) -@pytest.mark.parametrize("bias_enabled", [True, False]) -def test_linear_replicated(dist_init, num_loras, device, stage, - bias_enabled) -> None: +def test_linear_replicated( + dist_init, + num_loras, + device, + stage, +) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -634,10 +637,11 @@ def test_linear_replicated(dist_init, num_loras, device, stage, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.float16, + ) def create_random_linear_replicated_layer(): @@ -651,10 +655,6 @@ def create_random_linear_replicated_layer(): lora_linear.create_lora_weights(max_loras, lora_config) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( 
lora_linear.lora_b_stacked) == 1) - if bias_enabled: - assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices - else: - assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(NUM_RANDOM_SEEDS): @@ -734,14 +734,13 @@ def create_random_linear_replicated_layer(): @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("orientation", ["row", "column"]) @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) -@pytest.mark.parametrize("bias_enabled", [True, False]) def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, - device, stage, bias_enabled) -> None: + device, stage) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -750,11 +749,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - fully_sharded_loras=fully_shard, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.float16, + ) def create_random_linear_parallel_layer(): if orientation == "row": @@ -777,10 +777,7 @@ def create_random_linear_parallel_layer(): lora_linear.create_lora_weights(max_loras, lora_config) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked) == 1) - if bias_enabled: - assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices - else: - assert lora_linear.lora_bias_stacked is None + return linear, lora_linear for i in range(NUM_RANDOM_SEEDS): @@ -860,14 +857,13 @@ def create_random_linear_parallel_layer(): @torch.inference_mode() -@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("num_loras", [1, 2, 4]) @pytest.mark.parametrize("repeats", [1, 2, 3]) @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) -@pytest.mark.parametrize("bias_enabled", [True, False]) def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device, stage, bias_enabled) -> None: + device, stage) -> None: if current_platform.is_cuda_alike(): torch.cuda.set_device(device) @@ -876,11 +872,12 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, torch.set_default_device(device) punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras) assert check_punica_wrapper(punica_wrapper) - lora_config = LoRAConfig(max_loras=max_loras, - max_lora_rank=8, - fully_sharded_loras=fully_shard, - lora_dtype=torch.float16, - bias_enabled=bias_enabled) + lora_config = LoRAConfig( + max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.float16, + ) def create_column_parallel_packed_layer(): if repeats == 2: @@ -924,10 +921,7 @@ class FakeConfig: model_config=FakeConfig()) assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( lora_linear.lora_b_stacked) == n_slices) - if bias_enabled: - assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices - else: - assert lora_linear.lora_bias_stacked is None + return linear, lora_linear for i in 
range(NUM_RANDOM_SEEDS): From 8f0d7eaea87409a54ccaed76995b59c6b0a3d4cf Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Wed, 27 Aug 2025 19:57:38 +0800 Subject: [PATCH 016/125] [XPU] Fix OOM issue for data parallel with Ray backend (#22500) Signed-off-by: Fanli Lin Signed-off-by: Fanli Lin Co-authored-by: Cyrus Leung --- vllm/v1/engine/core.py | 27 ++++++++++++++++++--------- vllm/v1/engine/utils.py | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index b61482806184..a7038e2d2c26 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -39,7 +39,8 @@ EngineCoreRequestType, ReconfigureDistributedRequest, ReconfigureRankType, UtilityOutput, UtilityResult) -from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses +from vllm.v1.engine.utils import (EngineHandshakeMetadata, EngineZmqAddresses, + get_device_indices) from vllm.v1.executor.abstract import Executor from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.metrics.stats import SchedulerStats @@ -1169,22 +1170,30 @@ def __init__( # https://github.com/ray-project/ray/pull/40461/files#diff-31e8159767361e4bc259b6d9883d9c0d5e5db780fcea4a52ead4ee3ee4a59a78R1860 # noqa: E501 # and get_accelerator_ids_for_accelerator_resource() in worker.py # of ray. - self._set_cuda_visible_devices(vllm_config, local_dp_rank) + self._set_visible_devices(vllm_config, local_dp_rank) super().__init__(vllm_config, local_client, "", executor_class, log_stats) - def _set_cuda_visible_devices(self, vllm_config: VllmConfig, - local_dp_rank: int): + def _set_visible_devices(self, vllm_config: VllmConfig, + local_dp_rank: int): from vllm.platforms import current_platform - device_control_env_var = current_platform.device_control_env_var + if current_platform.is_xpu(): + pass + else: + device_control_env_var = current_platform.device_control_env_var + self._set_cuda_visible_devices(vllm_config, local_dp_rank, + device_control_env_var) + + def _set_cuda_visible_devices(self, vllm_config: VllmConfig, + local_dp_rank: int, + device_control_env_var: str): world_size = vllm_config.parallel_config.world_size # Set CUDA_VISIBLE_DEVICES or equivalent. try: - os.environ[device_control_env_var] = ",".join( - str(current_platform.device_id_to_physical_device_id(i)) - for i in range(local_dp_rank * - world_size, (local_dp_rank + 1) * world_size)) + value = get_device_indices(device_control_env_var, local_dp_rank, + world_size) + os.environ[device_control_env_var] = value except IndexError as e: raise Exception( f"Error setting {device_control_env_var}: " diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 62f229e28693..56ef8477d267 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -164,19 +164,33 @@ def set_device_control_env_var(vllm_config: VllmConfig, """ world_size = vllm_config.parallel_config.world_size evar = current_platform.device_control_env_var + + value = get_device_indices(evar, local_dp_rank, world_size) + with patch.dict(os.environ, values=((evar, value), )): + yield + + +def get_device_indices(device_control_env_var: str, local_dp_rank: int, + world_size: int): + """ + Returns a comma-separated string of device indices for the specified + data parallel rank. + + For example, if world_size=2 and local_dp_rank=1, and there are 4 devices, + this will select devices 2 and 3 for local_dp_rank=1. 
+ """ try: value = ",".join( str(current_platform.device_id_to_physical_device_id(i)) for i in range(local_dp_rank * world_size, (local_dp_rank + 1) * world_size)) except IndexError as e: - raise Exception(f"Error setting {evar}: " + raise Exception(f"Error setting {device_control_env_var}: " f"local range: [{local_dp_rank * world_size}, " f"{(local_dp_rank + 1) * world_size}) " "base value: " - f"\"{os.getenv(evar)}\"") from e - with patch.dict(os.environ, values=((evar, value), )): - yield + f"\"{os.getenv(device_control_env_var)}\"") from e + return value class CoreEngineActorManager: @@ -254,6 +268,19 @@ def __init__( dp_vllm_config = copy.deepcopy(vllm_config) dp_vllm_config.parallel_config.placement_group = pg local_client = index < local_engine_count + + # Ray XPU known issue: dpctl initializes the GPU runtime early, so + # setting device env vars in Ray actor's initialization method + # will not affect device selection. See: + # https://github.com/ray-project/ray/blob/master/python/ray/_private/accelerators/intel_gpu.py#L56 # noqa: E501 + if current_platform.is_xpu(): + device_evar = current_platform.device_control_env_var + device_indices = get_device_indices(device_evar, local_index, + world_size) + actor_env_vars = self.env_vars_dict.copy() + actor_env_vars[device_evar] = device_indices + runtime_env = RuntimeEnv(env_vars=actor_env_vars) + actor = ray.remote(DPEngineCoreActor).options( scheduling_strategy=PlacementGroupSchedulingStrategy( placement_group=pg, From 1f7a9c95e4b2a1e02b19e94fd7371443f08b2e4b Mon Sep 17 00:00:00 2001 From: Michael Yao Date: Wed, 27 Aug 2025 20:37:52 +0800 Subject: [PATCH 017/125] [Docs] Fix a 1-2-3 list and style issues in tpu.md (#23729) Signed-off-by: windsonsea --- docs/configuration/tpu.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/configuration/tpu.md b/docs/configuration/tpu.md index ac2b6baffd14..e456077e0495 100644 --- a/docs/configuration/tpu.md +++ b/docs/configuration/tpu.md @@ -45,30 +45,30 @@ This initial compilation time ranges significantly and is impacted by many of th ### Optimize based on your data -#### max model len vs. most model len +#### max-model-len vs. most-model-len ![most_model_len](../assets/design/tpu/most_model_len.png) -If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most model len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. +If most of your requests are shorter than the maximum model length but you still need to accommodate occasional longer requests, setting a high maximum model length can negatively impact performance. In these cases, you can try introducing most-model-len by specifying the `VLLM_TPU_MOST_MODEL_LEN` environment variable. For example, 1% requests are 32k length and 99% requests are 2k length. You can pass 32k into `--max-model-len 32768` and use `VLLM_TPU_MOST_MODEL_LEN=2048`. -The requests get subdivided into max-model-len and most-model-len categories, for the latter category, we can gain better performance since the server can process more requests at a time. +The requests get subdivided into max-model-len and most-model-len categories, for the latter category, you can gain better performance since the server can process more requests at a time. 
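As a concrete starting point, a minimal sketch of the 1%/99% example above might look like the following. The model name is only a placeholder, and `VLLM_TPU_MOST_MODEL_LEN` is assumed to be set before the engine is constructed so the TPU backend picks it up (exporting it in the launching shell, e.g. `VLLM_TPU_MOST_MODEL_LEN=2048 vllm serve ...`, works the same way).

```python
import os

# Set before importing/constructing the engine so the TPU backend sees it.
os.environ["VLLM_TPU_MOST_MODEL_LEN"] = "2048"  # covers the 99% of short requests

from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",  # placeholder model name
    max_model_len=32768,                 # still admits the occasional 32k request
)
outputs = llm.generate(["Hello from TPU!"], SamplingParams(max_tokens=32))
```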
#### Padding -For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128: 128, 256, etc. +For online serving with latency requirements, consider switching to bucket padding by setting the `VLLM_TPU_BUCKET_PADDING_GAP` environment variable. Because of the layout of the TPU, try using increments of 128 (e.g., 128, 256, etc.) -The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about tpu padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: +The server pads the requests into fixed lengths before sending them to the model to avoid recompilation. To read more about TPU padding, see [here](https://cloud.google.com/tpu/docs/performance-guide#xla-efficiencies). Currently, there are 2 ways to pad the requests: -1) the default exponential padding (pad to the nearest power of 2) -2) bucket padding (pad to the nearest linearly increasing bucket). +1. the default exponential padding (pad to the nearest power of 2) +2. bucket padding (pad to the nearest linearly increasing bucket). When using bucket padding, the buckets start from 16, end at max_model_len, and increment by `VLLM_TPU_BUCKET_PADDING_GAP`. For example, max_model_len=512, padding_gap=64, the buckets will be [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]. -The fewer tokens we pad, the less unnecessary computation TPU does, the better performance we can get. For example, if num_tokens=300, with exponential padding, we pad to 512, with the bucket_padding above, we pad to 320. +The fewer tokens you pad, the less unnecessary computation TPU does, the better performance you can get. For example, if num_tokens=300, with exponential padding, you pad to 512, with the bucket_padding above, you pad to 320. However, you need to be careful to choose the padding gap. If the gap is too small, it means the number of buckets is large, leading to increased warmup (precompile) time and higher memory to store the compiled graph. Too many compiled graphs may lead to HBM OOM. Conversely, an overly large gap yields no performance improvement compared to the default exponential padding. 
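To make the trade-off above easier to see, here is a rough, self-contained sketch of the two padding schemes. It is not the server's actual padding code, just the arithmetic from the example (buckets hard-coded to start at 16, 32, 64 as described above):

```python
def exponential_pad(num_tokens: int, min_pad: int = 16) -> int:
    """Pad to the nearest power-of-2 bucket, starting from `min_pad`."""
    padded = min_pad
    while padded < num_tokens:
        padded *= 2
    return padded


def bucket_pad(num_tokens: int, max_model_len: int, padding_gap: int) -> int:
    """Pad to the nearest bucket; buckets grow linearly by `padding_gap`."""
    buckets = [16, 32, 64]  # mirrors the example buckets above
    while buckets[-1] + padding_gap <= max_model_len:
        buckets.append(buckets[-1] + padding_gap)
    return next(b for b in buckets if b >= num_tokens)


# num_tokens=300, max_model_len=512, padding_gap=64
print(exponential_pad(300))      # -> 512
print(bucket_pad(300, 512, 64))  # -> 320
```

With these numbers, bucket padding wastes 20 padded positions for the request instead of 212.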
From 9d30de44698e1e337e4736ff62b83ebe1bbd4d40 Mon Sep 17 00:00:00 2001 From: tc-mb <157115220+tc-mb@users.noreply.github.com> Date: Wed, 27 Aug 2025 20:38:00 +0800 Subject: [PATCH 018/125] [model] Support MiniCPM-V 4.5 (#23586) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: tc-mb Signed-off-by: Xin Yang Signed-off-by: Abatom Signed-off-by: chzhang Signed-off-by: Pate Motter Signed-off-by: Terrencezzj Signed-off-by: Woosuk Kwon Signed-off-by: simon-mo Signed-off-by: mgoin Signed-off-by: Siyuan Fu Signed-off-by: siyuanf Signed-off-by: Weiliang Liu Signed-off-by: Michael Goin Signed-off-by: yewentao256 Signed-off-by: DarkLight1337 Signed-off-by: Luka Govedič Signed-off-by: Zijing Liu Signed-off-by: Zijing Liu Signed-off-by: jiabin.00 Signed-off-by: zjy0516 Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Signed-off-by: Jee Jee Li Signed-off-by: tc-mb <157115220+tc-mb@users.noreply.github.com> Signed-off-by: Roger Wang Signed-off-by: Roger Wang Signed-off-by: Huy Do Signed-off-by: Matúš Námešný Signed-off-by: Guillaume Calmettes Signed-off-by: Chen Zhang Signed-off-by: oye93 Signed-off-by: Julien Lin Signed-off-by: Didier Durand Signed-off-by: Tianyu Li Signed-off-by: Hongxia Yang Signed-off-by: Yuekai Zhang Signed-off-by: vllmellm Signed-off-by: jiang1.li Signed-off-by: Zerohertz Signed-off-by: Hyogeun Oh (오효근) Signed-off-by: Thomas Parnell Signed-off-by: Russell Bryant Signed-off-by: Isotr0py Signed-off-by: Huzaifa Sidhpurwala Signed-off-by: Federico <65908512+coval3nte@users.noreply.github.com> Signed-off-by: Zixuan Zhang Signed-off-by: wuhang Signed-off-by: czhu-cohere Signed-off-by: Wei Wei Signed-off-by: Yiheng Xu Signed-off-by: Chenheli Hua Signed-off-by: wangyafeng Co-authored-by: Xin Yang <105740670+xyang16@users.noreply.github.com> Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Co-authored-by: Zhonghua Deng Co-authored-by: Chaojun Zhang Co-authored-by: Pate Motter Co-authored-by: Terrence Zhao <32208165+Terrencezzj@users.noreply.github.com> Co-authored-by: Woosuk Kwon Co-authored-by: Simon Mo Co-authored-by: Michael Goin Co-authored-by: weiliang Co-authored-by: Siyuan Fu Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Co-authored-by: Cyrus Leung Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič Co-authored-by: Zijing Liu Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com> Co-authored-by: Jiangyun Zhu Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Raghavan Co-authored-by: Jee Jee Li Co-authored-by: Cyrus Leung Co-authored-by: Roger Wang Co-authored-by: Roger Wang Co-authored-by: knlnguyen1802 Co-authored-by: Huy Do Co-authored-by: Matúš Námešný Co-authored-by: Guillaume Calmettes Co-authored-by: Chen Zhang Co-authored-by: En Ouyang Co-authored-by: Li, Jiang Co-authored-by: nvjullin Co-authored-by: Didier Durand <2927957+didier-durand@users.noreply.github.com> Co-authored-by: TianyuLi0 <116711075+TianyuLi0@users.noreply.github.com> Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Co-authored-by: Yuekai Zhang Co-authored-by: vllmellm Co-authored-by: Hyogeun Oh (오효근) Co-authored-by: Thomas Parnell Co-authored-by: Russell Bryant Co-authored-by: Lukas Geiger Co-authored-by: Isotr0py Co-authored-by: Huzaifa Sidhpurwala 
Co-authored-by: Federico <65908512+coval3nte@users.noreply.github.com> Co-authored-by: zixuanzhang226 Co-authored-by: wuhang Co-authored-by: yzds <41983536+youzhedian@users.noreply.github.com> Co-authored-by: hongchao Co-authored-by: czhu-cohere Co-authored-by: Wei Co-authored-by: Yiheng Xu Co-authored-by: Aaron Pham Co-authored-by: Chenheli Hua Co-authored-by: CSWYF3634076 <58356743+CSWYF3634076@users.noreply.github.com> --- docs/models/supported_models.md | 2 +- tests/models/registry.py | 2 +- vllm/model_executor/models/minicpmv.py | 314 +++++++++++++++++- .../chat_templates/registry.py | 11 + .../chat_templates/template_minicpmv45.jinja | 93 ++++++ 5 files changed, 407 insertions(+), 15 deletions(-) create mode 100644 vllm/transformers_utils/chat_templates/template_minicpmv45.jinja diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 19ce8c06724f..35a5fa0c2e42 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -638,7 +638,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | ✅︎ | | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | ✅︎ | | `MiniCPMO` | MiniCPM-O | T + IE+ + VE+ + AE+ | `openbmb/MiniCPM-o-2_6`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, etc. | ✅︎ | | ✅︎ | +| `MiniCPMV` | MiniCPM-V | T + IE+ + VE+ | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, `openbmb/MiniCPM-V-4`, `openbmb/MiniCPM-V-4_5`, etc. | ✅︎ | | ✅︎ | | `MiniMaxVL01ForConditionalGeneration` | MiniMax-VL | T + IE+ | `MiniMaxAI/MiniMax-VL-01`, etc. | | ✅︎ | ✅︎ | | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I+ | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ | ✅︎ | | `MllamaForConditionalGeneration` | Llama 3.2 | T + I+ | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. 
| | | | diff --git a/tests/models/registry.py b/tests/models/registry.py index f2c09d3e8452..ee546e7af85c 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -451,7 +451,7 @@ def check_available_online( "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", trust_remote_code=True), "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", - extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4"}, # noqa: E501 + extras={"2.6": "openbmb/MiniCPM-V-2_6", "4.0": "openbmb/MiniCPM-V-4", "4.5": "openbmb/MiniCPM-V-4_5"}, # noqa: E501 trust_remote_code=True), "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501 trust_remote_code=True, diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index c22d871ab20d..2d785c30fd7d 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -27,12 +27,14 @@ from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence from functools import partial +from itertools import chain from typing import Annotated, Any, Callable, Literal, Optional, Union import numpy as np import torch import torch.types from torch import nn +from torch.nn.init import trunc_normal_ from transformers import BatchFeature, PretrainedConfig from typing_extensions import TypeVar @@ -47,10 +49,11 @@ from vllm.model_executor.models.minicpm import MiniCPMForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM +from vllm.model_executor.models.qwen3 import Qwen3ForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - NestedTensors) + MultiModalKwargsItems, NestedTensors) from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, ImageProcessorItems, ImageSize, ModalityData, ModalityDataItems, @@ -218,6 +221,187 @@ def forward(self, x: torch.Tensor, return x +class Resampler4_5(Resampler2_5): + + def __init__(self, + num_queries: int, + embed_dim: int, + num_heads: int, + kv_dim: Optional[int] = None, + norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, + max_size: tuple[int, int] = (70, 70), + max_temporal_size: int = 36000, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + super().__init__(num_queries, + embed_dim, + num_heads, + kv_dim, + norm_layer, + max_size, + quant_config=quant_config, + prefix=prefix) + + trunc_normal_(self.query, std=.02) + self.max_temporal_size = max_temporal_size + self._set_temporal_pos_cache(self.max_temporal_size) + self.apply(self._init_weights) + + def get_1d_sincos_pos_embed_from_temporal_size(self, embed_dim: int, + pos: np.ndarray): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + def _set_temporal_pos_cache(self, + max_temporal_size: int, + device: torch.types.Device = "cpu") -> None: + temporal_size = np.arange(max_temporal_size, dtype=np.float32) + pos_embed = torch.from_numpy( + self.get_1d_sincos_pos_embed_from_temporal_size( + self.embed_dim, temporal_size)).float().to(device) + self.register_buffer("temporal_pos_embed", pos_embed, persistent=False) + + def _adjust_temporal_pos_cache(self, + max_temporal_size: int, + device: torch.types.Device = "cpu"): + if max_temporal_size > self.max_temporal_size: + self.max_temporal_size = max_temporal_size + self._set_temporal_pos_cache(self.max_temporal_size, device) + + def _init_weights(self, m: Union[nn.Linear, nn.LayerNorm]): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward( + self, + x: torch.Tensor, + tgt_sizes: torch.Tensor, + # temporal_ids for high refresh rate videos + temporal_ids=None + ) -> torch.Tensor: + assert x.shape[0] == tgt_sizes.shape[0] + bs = x.shape[0] + + device = x.device + dtype = x.dtype + + patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] + + self._adjust_pos_cache(tgt_sizes, device=device) + + temporal_pos_emb = False + temporal_ids_flatten = None + if temporal_ids is not None: + # example: [[-1], [-1], [2, 6, 9]] + temporal_ids_flatten = list(chain.from_iterable(temporal_ids)) + max_temporal_size = max(temporal_ids_flatten, default=0) + if max_temporal_size > -1: + temporal_pos_emb = True + if max_temporal_size > self.max_temporal_size: + self._adjust_temporal_pos_cache(max_temporal_size, device) + + max_patch_len = patch_len.max().item() + assert isinstance(max_patch_len, int) + + key_padding_mask = torch.zeros((bs, max_patch_len), + dtype=torch.bool, + device=device) + + x, _ = self.kv_proj(x) # B * L * D + x = self.ln_kv(x).permute(1, 0, 2) # L * B * D + q = self.ln_q(self.query) # Q * D + + pos_embed_2d = [] + pos_embed_temporal = [] + for i in range(bs): + tgt_h, tgt_w = tgt_sizes[i] + if temporal_pos_emb: + if temporal_ids_flatten[i] == -1: + pos_embed_temporal.append( + torch.zeros(self.embed_dim, dtype=dtype, + device=device)) + else: + pos_embed_temporal.append(self.temporal_pos_embed[ + temporal_ids_flatten[i]].to(dtype)) # D + + pos_embed_2d.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape( + (tgt_h * tgt_w, -1)).to(dtype)) # patches * D + key_padding_mask[i, patch_len[i]:] = True + + pos_embed_2d = torch.nn.utils.rnn.pad_sequence( + pos_embed_2d, batch_first=True, + padding_value=0.0).permute(1, 0, 2) # BLD => L * B * D + + k = x + v = x + pos_embed_2d + if pos_embed_temporal: + k += torch.stack(pos_embed_temporal, dim=0) + bs = len(temporal_ids) + merge_k = [] + merge_v = [] + merge_key_padding_mask = [] + + start = 0 + for tp in temporal_ids: + end = start + len(tp) + # L * (end-start) * D -> (end-start) * L * D + # -> 1 * L*(end-start) * D + merge_k.append(k[:, start:end, :].permute(1, 0, 2).reshape( + -1, self.embed_dim)) + merge_v.append(v[:, start:end, :].permute(1, 0, 2).reshape( + -1, self.embed_dim)) + merge_key_padding_mask.append( + key_padding_mask[start:end, :].reshape(-1, 1)) + + 
start = end + + k = torch.nn.utils.rnn.pad_sequence(merge_k, + batch_first=True, + padding_value=0.0).permute( + 1, 0, 2) # L*(end-start) + v = torch.nn.utils.rnn.pad_sequence(merge_v, + batch_first=True, + padding_value=0.0).permute( + 1, 0, 2) # L*(end-start) + key_padding_mask = torch.nn.utils.rnn.pad_sequence( + merge_key_padding_mask, batch_first=True, + padding_value=True).squeeze(-1) + + out = self.attn( + self._repeat(q, bs), # Q * B * D + k, # L * B * D + L * B * D + v, + key_padding_mask=key_padding_mask, + )[0] + # out: Q * B * D + x = out.permute(1, 0, 2) # B * Q * D + + x = self.ln_post(x) + x = x @ self.proj + return x + + def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]: version_float = getattr(config, "version", None) @@ -354,9 +538,7 @@ def get_model_version(self): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: mm_limits = {"image": None} - if self.get_model_version() == (2, - 6) or self.get_model_version() == (4, - 0): + if self.get_model_version() in {(2, 6), (4, 0), (4, 5)}: mm_limits["video"] = None return mm_limits @@ -637,8 +819,7 @@ def _base_call_hf_processor( out_keys: set[str], ) -> dict[str, NestedTensors]: # This processor supports zipping prompt and mm_data together - if self.info.get_model_version() == ( - 2, 6) or self.info.get_model_version() == (4, 0): + if self.info.get_model_version() in {(2, 6), (4, 0), (4, 5)}: inputs = super()._call_hf_processor( prompt=prompts, # type: ignore mm_data=mm_data, @@ -816,7 +997,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # and config class self.config = config self.multimodal_config = multimodal_config - self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" self.version = get_version_by_config(self.config) self.llm = self.init_llm(vllm_config=vllm_config, @@ -1364,11 +1544,9 @@ def init_vision_module( prefix: str = "", ) -> nn.Module: quant_config = self._maybe_ignore_quant_config(quant_config) - model = Idefics2VisionTransformer( - config.vision_config, - quant_config=quant_config, - prefix=prefix, - use_data_parallel=self.use_data_parallel) + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config, + prefix=prefix) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model @@ -1436,11 +1614,121 @@ def load_weights(self, weights: Iterable[tuple[str, return loader.load_weights(weights) +class MiniCPMV4_5(MiniCPMVBaseModel, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + assert self.version == (4, 5) + + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + if isinstance(quant_config, (AWQConfig, AWQMarlinConfig)): + return None + return quant_config + + def init_llm( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> nn.Module: + return Qwen3ForCausalLM(vllm_config=vllm_config, prefix=prefix) + + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + quant_config = self._maybe_ignore_quant_config(quant_config) + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config, + prefix=prefix) + if self.config.drop_vision_last_layer: + model.encoder.layers = model.encoder.layers[:-1] + 
return model + + def init_resampler( + self, + embed_dim: int, + vision_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> nn.Module: + quant_config = self._maybe_ignore_quant_config(quant_config) + with set_default_torch_dtype(torch.float16): + # The resampler in 4.0 remains consistent with the one in 2.5/2.6. + resampler = Resampler4_5(num_queries=self.config.query_num, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + quant_config=quant_config, + prefix=prefix) + + return resampler.to(device=current_platform.device_type, + dtype=torch.get_default_dtype()) + + def get_vision_hidden_states( + self, data: MiniCPMVImagePixelInputs) -> torch.Tensor: + pixel_values = data["pixel_values"] + tgt_sizes = data["tgt_sizes"] + temporal_ids = data.get('temporal_ids', None) + + B = len(pixel_values) + P = pixel_values[0].shape[-2] + L = max(item.shape[-1] for item in pixel_values) + device = pixel_values[0].device + dtype = pixel_values[0].dtype + + all_pixel_values = torch.zeros((B, 3, P, L), + dtype=dtype, + device=device) + all_temporal_ids = None if temporal_ids is None else flatten_2d_lists( + temporal_ids) + for i, pixel_values_item in enumerate(pixel_values): + L_item = pixel_values_item.shape[-1] + all_pixel_values[i, ..., :L_item] = pixel_values_item + + num_patches = tgt_sizes.prod(-1) + max_patches = num_patches.max().item() + assert isinstance(max_patches, int) + + patch_attn_mask = torch.zeros((B, max_patches), + dtype=torch.bool, + device=device) + for i, num_patches_item in enumerate(num_patches): + patch_attn_mask[i, :num_patches_item] = True + + vision_embedding = self.vpm( + all_pixel_values, + patch_attention_mask=patch_attn_mask.unsqueeze(1), + tgt_sizes=tgt_sizes, + ) + + return self.resampler(vision_embedding, tgt_sizes, all_temporal_ids) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self, + skip_prefixes=["apm.", "audio", "tts"]) + return loader.load_weights(weights) + + _SUPPORT_VERSION = { (2, 0): MiniCPMV2_0, (2, 5): MiniCPMV2_5, (2, 6): MiniCPMV2_6, (4, 0): MiniCPMV4_0, + (4, 5): MiniCPMV4_5, } diff --git a/vllm/transformers_utils/chat_templates/registry.py b/vllm/transformers_utils/chat_templates/registry.py index e0ef7f0999d4..d09c5fa924fb 100644 --- a/vllm/transformers_utils/chat_templates/registry.py +++ b/vllm/transformers_utils/chat_templates/registry.py @@ -20,6 +20,16 @@ def _get_qwen_chat_template_fallback( return CHAT_TEMPLATES_DIR / "template_basic.jinja" +def _get_minicpmv_chat_template_fallback( + tokenizer_name_or_path: str) -> Optional[Path]: + # MiniCPM-V-4.5 version uses a dedicated template + if "4.5" in tokenizer_name_or_path or "4_5" in tokenizer_name_or_path: + return CHAT_TEMPLATES_DIR / "template_minicpmv45.jinja" + + # Other versions use chatml template + return CHAT_TEMPLATES_DIR / "template_chatml.jinja" + + # yapf: disable _MODEL_TYPE_TO_CHAT_TEMPLATE_FALLBACK: dict[str, ChatTemplatePath] = { "blip-2": CHAT_TEMPLATES_DIR / "template_blip2.jinja", @@ -27,6 +37,7 @@ def _get_qwen_chat_template_fallback( "deepseek_vl_v2": CHAT_TEMPLATES_DIR / "template_deepseek_vl2.jinja", "florence2": CHAT_TEMPLATES_DIR / "template_basic.jinja", "fuyu": CHAT_TEMPLATES_DIR / "template_fuyu.jinja", + "minicpmv": _get_minicpmv_chat_template_fallback, "paligemma": CHAT_TEMPLATES_DIR / "template_basic.jinja", "qwen": _get_qwen_chat_template_fallback, } diff --git a/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja 
b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja new file mode 100644 index 000000000000..661ebd1cf5c1 --- /dev/null +++ b/vllm/transformers_utils/chat_templates/template_minicpmv45.jinja @@ -0,0 +1,93 @@ +{%- set enable_thinking = enable_thinking | default(false) %} +{%- if tools %} + {{- '<|im_start|>system\n' }} + {%- if messages[0].role == 'system' %} + {{- messages[0].content + '\n\n' }} + {%- endif %} + {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} + {%- for tool in tools %} + {{- "\n" }} + {{- tool | tojson }} + {%- endfor %} + {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} +{%- else %} + {%- if messages[0].role == 'system' %} + {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} + {%- endif %} +{%- endif %} + +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages[::-1] %} + {%- set index = (messages|length - 1) - loop.index0 %} + {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} + {%- set ns.multi_step_tool = false %} + {%- set ns.last_query_index = index %} + {%- endif %} +{%- endfor %} + +{%- for message in messages %} + {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set content = message.content %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is defined and message.reasoning_content is not none %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in message.content %} + {%- set content = message.content.split('')[-1].lstrip('\n') %} + {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if loop.index0 > ns.last_query_index %} + {%- if loop.last or (not loop.last and reasoning_content) %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + + {%- if message.tool_calls %} + {%- for tool_call in message.tool_calls %} + {%- if (loop.first and content) or (not loop.first) %} + {{- '\n' }} + {%- endif %} + {%- if tool_call.function %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n{"name": "' }} + {{- tool_call.name }} + {{- '", "arguments": ' }} + {%- if tool_call.arguments is string %} + {{- tool_call.arguments }} + {%- else %} + {{- tool_call.arguments | tojson }} + {%- endif %} + {{- '}\n' }} + {%- endfor %} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} + {{- '<|im_start|>user' }} + {%- endif %} + {{- '\n\n' }} + {{- message.content }} + {{- '\n' }} + {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- endif %} +{%- endfor %} + +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- 
endif %} + {%- if enable_thinking is defined and enable_thinking is true %} + {{- '\n' }} + {%- endif %} +{%- endif %} \ No newline at end of file From 8c13820f0b203976eab8e821c102234a73f338cd Mon Sep 17 00:00:00 2001 From: cndoit18 Date: Wed, 27 Aug 2025 20:42:20 +0800 Subject: [PATCH 019/125] [Bugfix] Fix task field initialization when PYTHONOPTIMIZE is enabled (#23718) Signed-off-by: cndoit18 --- vllm/worker/pooling_model_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index 8d8d9b4d0503..3e1950798dbf 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -199,8 +199,9 @@ def _prepare_pooling( pooling_params = seq_group_metadata.pooling_params assert pooling_params is not None - assert (task := pooling_params.task) is not None, ( - "You did not set `task` in the API") + + task = pooling_params.task + assert task is not None, "You did not set `task` in the API" model = cast(VllmModelForPooling, self.model) to_update = model.pooler.get_pooling_updates(task) From a403d0fa41cc68e3b6da4e1097dc896fde2f1a6a Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Wed, 27 Aug 2025 05:50:47 -0700 Subject: [PATCH 020/125] [Misc] Remove unnecessary `_send_reconfig_message()` in `core_client.py` (#23127) Signed-off-by: Nick Hill --- vllm/v1/engine/core_client.py | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 079dd9a7d38d..65f7abc97110 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1190,21 +1190,6 @@ async def _abort_requests(self, request_ids: list[str], await self._send_input(EngineCoreRequestType.ABORT, request_ids, engine) - async def _send_reconfig_message( - self, reconfig_request: ReconfigureDistributedRequest, - engine: EngineIdentity) -> asyncio.Future: - """Send reconfiguration message and return the result future without - waiting for completion.""" - call_id = uuid.uuid1().int >> 64 - future = asyncio.get_running_loop().create_future() - self.utility_results[call_id] = future - message = (EngineCoreRequestType.UTILITY.value, *self.encoder.encode( - (self.client_index, call_id, "reinitialize_distributed", - (reconfig_request, )))) - await self._send_input_message(message, engine, reconfig_request) - self._ensure_output_queue_task() - return future - async def scale_elastic_ep(self, new_data_parallel_size: int) -> None: """Scale elastic EP data parallel size""" cur_data_parallel_size = len(self.core_engines) @@ -1214,7 +1199,7 @@ async def scale_elastic_ep(self, new_data_parallel_size: int) -> None: f"different from cur_data_parallel_size {cur_data_parallel_size}") assert self.vllm_config.parallel_config.data_parallel_backend == \ - "ray", ("Only ray DP backend supports scaling elastic EP") + "ray", "Only ray DP backend supports scaling elastic EP" scale_up = new_data_parallel_size > cur_data_parallel_size @@ -1246,9 +1231,10 @@ async def _scale_up_elastic_ep(self, cur_data_parallel_size: int, data_parallel_master_ip, new_data_parallel_master_port=self.vllm_config.parallel_config. 
data_parallel_master_port) - future = await self._send_reconfig_message(reconfig_request, - engine) - reconfig_futures.append(future) + coro = self._call_utility_async("reinitialize_distributed", + reconfig_request, + engine=engine) + reconfig_futures.append(asyncio.create_task(coro)) logger.info("All reconfigure messages sent, starting engine creation") @@ -1318,9 +1304,10 @@ async def _scale_down_elastic_ep(self, cur_data_parallel_size: int, if cur_dp_rank >= new_data_parallel_size: reconfig_request.new_data_parallel_rank = \ ReconfigureRankType.SHUTDOWN_CURRENT_RANK - future = await self._send_reconfig_message(reconfig_request, - engine) - reconfig_futures.append(future) + coro = self._call_utility_async("reinitialize_distributed", + reconfig_request, + engine=engine) + reconfig_futures.append(asyncio.create_task(coro)) for _ in range(new_data_parallel_size, cur_data_parallel_size): self.core_engines.pop() From 704432af3c129b7a57fca9b059eefe214159f836 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Wed, 27 Aug 2025 14:51:54 +0200 Subject: [PATCH 021/125] [V1] [Hybrid] Disable prefix caching by default for hybrid or mamba-based models (#23716) Signed-off-by: Thomas Parnell --- docs/usage/v1_guide.md | 10 ++++++---- vllm/model_executor/models/config.py | 9 +++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 64bd0d9bf507..20234e761133 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -107,14 +107,16 @@ to enable simultaneous generation and embedding using the same engine instance i #### Mamba Models Models using selective state-space mechanisms instead of standard transformer attention are supported. -Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. Please note that these models currently require disabling prefix caching in V1. +Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`) are supported. +Please note that prefix caching is not yet supported for these models. Models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, -`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). Please note that -these models currently require disabling prefix caching in V1. +`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`). +Please note that prefix caching is not yet supported for these models. Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`). -Please note that these models currently require disabling prefix caching and enforcing eager mode in V1. +Please note that prefix caching is not yet supported for these models. +It is also necessary to enforce eager mode for these models in V1. 
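As a rough illustration of the note above, eager mode can be requested explicitly when constructing the engine; with this change, prefix caching is disabled automatically for mamba-based and hybrid models, so no extra flag is needed for that. The checkpoint name below is used only as an example of a non-Mamba hybrid architecture:

```python
from vllm import LLM

# Minimal sketch: force eager mode for a hybrid (non-Mamba mechanism) model in V1.
llm = LLM(
    model="MiniMaxAI/MiniMax-Text-01",  # example hybrid model
    enforce_eager=True,                 # required for these models in V1
    trust_remote_code=True,
)
```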
#### Encoder-Decoder Models diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index f62209326b98..88b3154de2cb 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -292,12 +292,13 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: return model_config = vllm_config.model_config + cache_config = vllm_config.cache_config compilation_config = vllm_config.compilation_config - model_cls, _ = ModelRegistry.resolve_model_cls( - model_config.architecture, - model_config=model_config, - ) + # TODO(tdoublep): remove once prefix caching is enabled + cache_config.enable_prefix_caching = False + logger.info("Hybrid or mamba-based model detected: disabling prefix " + "caching since it is not yet supported.") # TODO(tdoublep): remove as full cuda graph support is added FCG_NOT_SUPPORTED_MODELS = [ From 5eeef1b90852917b300ed67b98e341eb846ba2e9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 27 Aug 2025 21:24:09 +0800 Subject: [PATCH 022/125] [Model] Explicit `default_pooling_type` interface (#23736) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/bert.py | 4 +-- vllm/model_executor/models/bert_with_rope.py | 5 ++-- vllm/model_executor/models/gritlm.py | 2 +- vllm/model_executor/models/interfaces.py | 19 +------------ vllm/model_executor/models/interfaces_base.py | 28 +++++++++++++++++++ vllm/model_executor/models/internlm2.py | 3 +- vllm/model_executor/models/modernbert.py | 3 +- .../models/prithvi_geospatial_mae.py | 7 +++-- vllm/model_executor/models/qwen2_rm.py | 3 +- vllm/model_executor/models/registry.py | 7 +++-- vllm/model_executor/models/roberta.py | 3 +- 11 files changed, 51 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 22b6c4401213..b34ca5cbe963 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -28,8 +28,8 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import (SupportsCrossEncoding, SupportsQuant, - default_pooling_type) +from .interfaces import SupportsCrossEncoding, SupportsQuant +from .interfaces_base import default_pooling_type from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 129450927e56..dcb7e75456cd 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -27,13 +27,14 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import (SupportsQuant, - default_pooling_type) from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors +from .interfaces import SupportsQuant +from .interfaces_base import default_pooling_type + class BertWithRopeEmbedding(nn.Module): diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 3f6790269ae6..1b3d541c65cf 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -20,7 +20,7 @@ from vllm.tasks import PoolingTask from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config -from .interfaces import 
default_pooling_type +from .interfaces_base import default_pooling_type logger = init_logger(__name__) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 9415e67924e7..22f005849e86 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -3,7 +3,7 @@ from collections.abc import Iterable, Mapping, MutableSequence from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, - TypeVar, Union, overload, runtime_checkable) + Union, overload, runtime_checkable) import numpy as np import torch @@ -641,23 +641,6 @@ def supports_cross_encoding( return is_pooling_model(model) and _supports_cross_encoding(model) -_T = TypeVar("_T", bound=type[torch.nn.Module]) - - -def default_pooling_type(pooling_type: str): - """Set default_pooling_type decorator. """ - - def func(model: _T) -> _T: - model.default_pooling_type = pooling_type # type: ignore - return model - - return func - - -def get_default_pooling_type(model: Union[type[object], object]) -> str: - return getattr(model, "default_pooling_type", "LAST") - - class SupportsQuant: """The interface required for all models that support quantization.""" diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 697fa020deb4..19a3ef1a3b80 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -144,6 +144,17 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): MRO of your model class. """ + default_pooling_type: ClassVar[str] = "LAST" + """ + Indicates the + [vllm.model_executor.layers.pooler.PoolerConfig.pooling_type][] + to use by default. + + You can use the + [vllm.model_executor.models.interfaces_base.default_pooling_type][] + decorator to conveniently set this field. 
+ """ + pooler: Pooler """The pooler is only called on TP rank 0.""" @@ -165,3 +176,20 @@ def is_pooling_model( return False return getattr(model, "is_pooling_model", False) + + +_T = TypeVar("_T", bound=type[nn.Module]) + + +def default_pooling_type(pooling_type: str): + """Decorator to set `VllmModelForPooling.default_pooling_type`.""" + + def func(model: _T) -> _T: + model.default_pooling_type = pooling_type # type: ignore + return model + + return func + + +def get_default_pooling_type(model: Union[type[object], object]) -> str: + return getattr(model, "default_pooling_type", "LAST") diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index d0c4bf5450d6..26bc48ffbd9b 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -31,7 +31,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type +from .interfaces import SupportsLoRA, SupportsPP +from .interfaces_base import default_pooling_type from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 72290bf2ee29..477855586128 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -26,7 +26,8 @@ from vllm.sequence import IntermediateTensors from vllm.tasks import PoolingTask -from .interfaces import SupportsCrossEncoding, default_pooling_type +from .interfaces import SupportsCrossEncoding +from .interfaces_base import default_pooling_type from .utils import WeightsMapper, maybe_prefix diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index 59e9f3e8a47b..f46d6375e1f6 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -27,9 +27,6 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import ( - IsAttentionFree, MultiModalEmbeddings, SupportsMultiModalWithRawInput, - default_pooling_type) from vllm.model_executor.models.utils import AutoWeightsLoader from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, @@ -43,6 +40,10 @@ from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from .interfaces import (IsAttentionFree, MultiModalEmbeddings, + SupportsMultiModalWithRawInput) +from .interfaces_base import default_pooling_type + def _prithvi_field_config(hf_inputs: Mapping[str, torch.Tensor]): # This model receives in input a multi-dimensional tensor representing diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index e0a30e04c602..421b43563bad 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -18,7 +18,8 @@ from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.sequence import IntermediateTensors -from .interfaces import SupportsLoRA, SupportsPP, default_pooling_type +from .interfaces import SupportsLoRA, SupportsPP +from .interfaces_base import default_pooling_type from .qwen2 import 
Qwen2Model from .utils import AutoWeightsLoader, maybe_prefix diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c65c58d4a047..196b5f35e1e4 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -25,11 +25,12 @@ from vllm.transformers_utils.dynamic_module import ( try_get_class_from_dynamic_module) -from .interfaces import (get_default_pooling_type, has_inner_state, has_noops, - is_attention_free, is_hybrid, supports_cross_encoding, +from .interfaces import (has_inner_state, has_noops, is_attention_free, + is_hybrid, supports_cross_encoding, supports_multimodal, supports_multimodal_raw_input, supports_pp, supports_transcription, supports_v0_only) -from .interfaces_base import is_pooling_model, is_text_generation_model +from .interfaces_base import (get_default_pooling_type, is_pooling_model, + is_text_generation_model) logger = init_logger(__name__) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 49a37342c67f..2bfa51162910 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -22,7 +22,8 @@ from vllm.sequence import IntermediateTensors from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding, default_pooling_type +from .interfaces import SupportsCrossEncoding +from .interfaces_base import default_pooling_type class RobertaEmbedding(nn.Module): From 8dd2baa5978f123974177023d6efab731153a2f4 Mon Sep 17 00:00:00 2001 From: rebel-hongseok Date: Wed, 27 Aug 2025 22:25:49 +0900 Subject: [PATCH 023/125] Add vLLM Korea Meetup in the README.md and meetups.md (#23746) Signed-off-by: rebel-hongseok Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- README.md | 1 + docs/community/meetups.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index ef5b43588953..8812aac4ea26 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ Easy, fast, and cheap LLM serving for everyone *Latest News* 🔥 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). +- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152). - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/). - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). 
diff --git a/docs/community/meetups.md b/docs/community/meetups.md index 61ea44220ad2..d76238cb3179 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -3,6 +3,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH) +- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152). - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing) - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). From 16dc4052b004261b547fc50fe7b20e2d2fbf915d Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 14:39:48 +0100 Subject: [PATCH 024/125] Fix pre-commit on main (#23747) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/community/meetups.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/community/meetups.md b/docs/community/meetups.md index d76238cb3179..221a7bd96213 100644 --- a/docs/community/meetups.md +++ b/docs/community/meetups.md @@ -3,7 +3,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: - [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg), August 23rd 2025. [[Slides]](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH) -- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). +- [vLLM Korea Meetup](https://luma.com/cgcgprmh), August 19th 2025. [[Slides]](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view). - [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA), August 2nd 2025. [[Slides]](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) [[Recording]](https://www.chaspark.com/#/live/1166916873711665152). - [NYC vLLM Meetup](https://lu.ma/c1rqyf1f), May 7th, 2025. [[Slides]](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing) - [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing). 
From fe8d7b6f03e7d8a36ffb6931397fc81ee594dd64 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 27 Aug 2025 21:41:22 +0800 Subject: [PATCH 025/125] [Model] Interface to enable batch-level DP support (#23733) Signed-off-by: DarkLight1337 Signed-off-by: Cyrus Leung Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- docs/configuration/optimization.md | 7 +++++-- vllm/config/__init__.py | 7 +++++++ vllm/model_executor/models/interfaces.py | 11 +++++++++++ vllm/model_executor/models/minicpmv.py | 2 ++ vllm/model_executor/models/mllama4.py | 2 ++ vllm/model_executor/models/qwen2_5_vl.py | 2 ++ vllm/model_executor/models/registry.py | 9 +++++++-- vllm/model_executor/models/step3_vl.py | 2 ++ 8 files changed, 38 insertions(+), 4 deletions(-) diff --git a/docs/configuration/optimization.md b/docs/configuration/optimization.md index a8eab9985c8b..b11ccb5c0027 100644 --- a/docs/configuration/optimization.md +++ b/docs/configuration/optimization.md @@ -168,8 +168,11 @@ llm = LLM( Batch-level DP is not to be confused with API request-level DP (which is instead controlled by `data_parallel_size`). -The availability of batch-level DP is based on model implementation. -Currently, the following models support `mm_encoder_tp_mode="data"`: +Batch-level DP needs to be implemented on a per-model basis, +and enabled by setting `supports_encoder_tp_data = True` in the model class. +Regardless, you need to set `mm_encoder_tp_mode="data"` in engine arguments to use this feature. + +Known supported models: - Llama4 () - MiniCPM-V-4 () diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index ac6f51df9549..e3fb6d796def 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -872,6 +872,13 @@ def maybe_pull_model_tokenizer_for_s3(self, model: str, def _init_multimodal_config(self) -> Optional["MultiModalConfig"]: if self._model_info.supports_multimodal: + if (self.mm_encoder_tp_mode == "data" and + not self._model_info.supports_multimodal_encoder_tp_data): + logger.warning_once( + "This model does not support `--mm-encoder-tp-mode data`. " + "Falling back to `--mm-encoder-tp-mode weights`.") + self.mm_encoder_tp_mode = "weights" + return MultiModalConfig( limit_per_prompt=self.limit_mm_per_prompt, media_io_kwargs=self.media_io_kwargs, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 22f005849e86..506732fed361 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -52,6 +52,12 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ + supports_encoder_tp_data: ClassVar[bool] = False + """ + A flag that indicates whether this model supports + `multimodal_config.mm_encoder_tp_mode="data"`. 
+ """ + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: """ @@ -137,6 +143,11 @@ def supports_multimodal( return getattr(model, "supports_multimodal", False) +def supports_multimodal_encoder_tp_data( + model: Union[type[object], object]) -> bool: + return getattr(model, "supports_encoder_tp_data", False) + + @runtime_checkable class SupportsMultiModalWithRawInput(SupportsMultiModal, Protocol): """The interface required for all multi-modal models.""" diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 2d785c30fd7d..0181bfeebda0 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1521,6 +1521,8 @@ class MiniCPMV4_0(MiniCPMVBaseModel, SupportsLoRA): ], } + supports_encoder_tp_data = True + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (4, 0) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 595bdd17cf2c..ac9b968f7a0c 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -716,6 +716,8 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, "gate_up_proj": ["gate_proj", "up_proj"], } + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 648ba81eb387..b528083b7c9c 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -868,6 +868,8 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, "model.": "language_model.model.", }) + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 196b5f35e1e4..80eac78cdfad 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -27,8 +27,10 @@ from .interfaces import (has_inner_state, has_noops, is_attention_free, is_hybrid, supports_cross_encoding, - supports_multimodal, supports_multimodal_raw_input, - supports_pp, supports_transcription, supports_v0_only) + supports_multimodal, + supports_multimodal_encoder_tp_data, + supports_multimodal_raw_input, supports_pp, + supports_transcription, supports_v0_only) from .interfaces_base import (get_default_pooling_type, is_pooling_model, is_text_generation_model) @@ -324,6 +326,7 @@ class _ModelInfo: supports_cross_encoding: bool supports_multimodal: bool supports_multimodal_raw_input: bool + supports_multimodal_encoder_tp_data: bool supports_pp: bool has_inner_state: bool is_attention_free: bool @@ -343,6 +346,8 @@ def from_model_cls(model: type[nn.Module]) -> "_ModelInfo": supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_multimodal_raw_input=supports_multimodal_raw_input(model), + supports_multimodal_encoder_tp_data= + supports_multimodal_encoder_tp_data(model), supports_pp=supports_pp(model), has_inner_state=has_inner_state(model), is_attention_free=is_attention_free(model), diff --git a/vllm/model_executor/models/step3_vl.py b/vllm/model_executor/models/step3_vl.py index f8877b584b19..f379d2c15fb6 100644 --- 
a/vllm/model_executor/models/step3_vl.py +++ b/vllm/model_executor/models/step3_vl.py @@ -867,6 +867,8 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, "lm_head.": "language_model.lm_head.", }) + supports_encoder_tp_data = True + @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: if modality.startswith("image"): From 513c1fe255f7d4ec3e91f7f5c2dd2d97c0460765 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Wed, 27 Aug 2025 14:55:12 +0100 Subject: [PATCH 026/125] Only run `get_attr_docs` if generating help text (#23723) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- vllm/engine/arg_utils.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9e7c95ea5205..3399d505e363 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -152,9 +152,17 @@ def is_online_quantization(quantization: Any) -> bool: return quantization in ["inc"] +NEEDS_HELP = ( + "--help" in (argv := sys.argv) # vllm SUBCOMMAND --help + or (argv0 := argv[0]).endswith("mkdocs") # mkdocs SUBCOMMAND + or argv0.endswith("mkdocs/__main__.py") # python -m mkdocs SUBCOMMAND +) + + @functools.lru_cache(maxsize=30) def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: - cls_docs = get_attr_docs(cls) + # Save time only getting attr docs if we're generating help text + cls_docs = get_attr_docs(cls) if NEEDS_HELP else {} kwargs = {} for field in fields(cls): # Get the set of possible types for the field @@ -172,7 +180,7 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, Any]: # Get the help text for the field name = field.name - help = cls_docs[name].strip() + help = cls_docs.get(name, "").strip() # Escape % for argparse help = help.replace("%", "%%") @@ -254,6 +262,9 @@ def parse_dataclass(val: str, cls=dataclass_cls) -> Any: def get_kwargs(cls: ConfigType) -> dict[str, Any]: """Return argparse kwargs for the given Config dataclass. + If `--help` or `mkdocs` are not present in the command line command, the + attribute documentation will not be included in the help output. + The heavy computation is cached via functools.lru_cache, and a deep copy is returned so callers can mutate the dictionary without affecting the cached version. 
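
The cache-then-copy behaviour described in the `get_kwargs` docstring above can be illustrated with a standalone sketch. This is not vLLM's actual `arg_utils` code: `ExampleConfig`, the simplified `NEEDS_HELP` check, and the placeholder doc strings are assumptions for illustration only. The point is why the attribute-doc work is skipped unless help text will actually be rendered, and why the cached dictionary is deep-copied before being handed to callers.

```python
# Illustrative sketch only -- not vLLM's real arg_utils implementation.
# ExampleConfig, the simplified NEEDS_HELP check, and the fake doc strings
# are stand-ins; the point is lazy help text plus cache-then-deep-copy.
import copy
import functools
import sys
from dataclasses import dataclass, fields
from typing import Any

# Skip the expensive attribute-doc extraction unless help will be rendered.
NEEDS_HELP = "--help" in sys.argv or sys.argv[0].endswith("mkdocs")


@dataclass
class ExampleConfig:
    max_model_len: int = 4096
    enable_chunked_prefill: bool = False


@functools.lru_cache(maxsize=None)
def _compute_kwargs(cls: type) -> dict[str, dict[str, Any]]:
    # Pretend the doc lookup is the heavy part; only do it when needed.
    docs = ({f.name: f"{f.name} option" for f in fields(cls)}
            if NEEDS_HELP else {})
    return {
        f.name: {"default": f.default, "help": docs.get(f.name, "")}
        for f in fields(cls)
    }


def get_kwargs(cls: type) -> dict[str, dict[str, Any]]:
    # Deep copy so callers can mutate the result without touching the cache.
    return copy.deepcopy(_compute_kwargs(cls))


if __name__ == "__main__":
    kwargs = get_kwargs(ExampleConfig)
    kwargs["max_model_len"]["help"] = "mutated by caller"
    # The cached entry is unaffected because get_kwargs returned a deep copy.
    assert (_compute_kwargs(ExampleConfig)["max_model_len"]["help"]
            != "mutated by caller")
```

Under these assumptions, repeated `get_kwargs` calls pay the `fields()`/doc cost once per class, while mutations of the returned dictionary never leak back into the cached copy — the same trade-off the patch above makes for `_compute_kwargs`.
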
From 3af47c3cc693f432b59658019891393385aa0e2a Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Wed, 27 Aug 2025 10:09:08 -0400 Subject: [PATCH 027/125] [Feature] Add Hopper DeepGEMM E8M0 for DeepSeekV3.1 scale_fmt (#23666) Signed-off-by: yewentao256 Signed-off-by: youkaichao Co-authored-by: youkaichao --- tests/kernels/moe/test_block_fp8.py | 5 +- tests/kernels/moe/test_deepep_deepgemm_moe.py | 7 ++- vllm/envs.py | 8 ++- .../layers/fused_moe/batched_deep_gemm_moe.py | 4 +- .../layers/fused_moe/fused_moe.py | 7 ++- .../layers/fused_moe/triton_deep_gemm_moe.py | 6 +-- .../model_executor/layers/quantization/fp8.py | 9 ++-- .../layers/quantization/utils/fp8_utils.py | 4 +- vllm/transformers_utils/config.py | 18 +++++++ vllm/utils/deep_gemm.py | 53 +++++++++---------- 10 files changed, 68 insertions(+), 53 deletions(-) diff --git a/tests/kernels/moe/test_block_fp8.py b/tests/kernels/moe/test_block_fp8.py index 9e4eaf221f24..ecc57acc6796 100644 --- a/tests/kernels/moe/test_block_fp8.py +++ b/tests/kernels/moe/test_block_fp8.py @@ -16,7 +16,7 @@ fused_topk, modular_triton_fused_moe) from vllm.platforms import current_platform from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used dg_available = has_deep_gemm() @@ -226,8 +226,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, @pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") -@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), - reason="Not E8M0 scale MOE") +@pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Not E8M0 scale MOE") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch): diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index 1e922be47f2b..36a98522a658 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -20,8 +20,7 @@ FusedMoEModularKernel) from vllm.platforms import current_platform from vllm.utils import has_deep_ep, has_deep_gemm -from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, - is_deep_gemm_supported) +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -374,7 +373,7 @@ def _test_deepep_deepgemm_moe( @multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), +@pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM") def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int, topk: int, world_dp_size: tuple[int, int]): @@ -432,7 +431,7 @@ def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int, @multi_gpu_test(num_gpus=2) @requires_deep_ep @requires_deep_gemm -@pytest.mark.skipif(is_blackwell_deep_gemm_e8m0_used(), +@pytest.mark.skipif(is_deep_gemm_e8m0_used(), reason="Skipping test for Blackwell DeepGEMM") def test_ll_deepep_deepgemm_moe( mnk: tuple[int, int, int], diff --git a/vllm/envs.py b/vllm/envs.py index 66c7c2c7f2c4..35735b552575 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -131,6 +131,7 @@ VLLM_TPU_USING_PATHWAYS: bool = False VLLM_USE_DEEP_GEMM: bool = False 
VLLM_USE_DEEP_GEMM_E8M0: bool = True + VLLM_USE_DEEP_GEMM_E8M0_HOPPER: bool = False VLLM_SKIP_DEEP_GEMM_WARMUP: bool = False VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True VLLM_USE_FLASHINFER_MOE_FP8: bool = False @@ -954,9 +955,12 @@ def get_vllm_port() -> Optional[int]: lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), # Whether to use E8M0 scaling when DeepGEMM is used on Blackwell GPUs. - # E8M0 is faster on B200 but may reduce accuracy. "VLLM_USE_DEEP_GEMM_E8M0": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0", "1"))), + # TODO(wentao): unify the two E8M0 flags after verifying the correctness. + # Whether to use E8M0 scaling when DeepGEMM is used on Hopper GPUs. + "VLLM_USE_DEEP_GEMM_E8M0_HOPPER": + lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "0"))), # DeepGemm JITs the kernels on-demand. The warmup attempts to make DeepGemm # JIT all the required kernels before model execution so there is no # JIT'ing in the hot-path. However, this warmup increases the engine @@ -1244,6 +1248,8 @@ def compute_hash() -> str: "VLLM_USE_FLASHINFER_SAMPLER", "VLLM_DISABLED_KERNELS", "VLLM_USE_DEEP_GEMM", + "VLLM_USE_DEEP_GEMM_E8M0", + "VLLM_USE_DEEP_GEMM_E8M0_HOPPER", "VLLM_USE_TRTLLM_FP4_GEMM", "VLLM_USE_FUSED_MOE_GROUPED_TOPK", "VLLM_USE_FLASHINFER_MOE_FP8", diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index c4d680af932f..a5326dfe84f6 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import (fp8_m_grouped_gemm_nt_masked, - is_blackwell_deep_gemm_e8m0_used) + is_deep_gemm_e8m0_used) logger = init_logger(__name__) @@ -174,7 +174,7 @@ def silu_mul_fp8_quant_deep_gemm( eps, fp8_min, fp8_max, - is_blackwell_deep_gemm_e8m0_used(), + is_deep_gemm_e8m0_used(), BLOCK=group_size, NUM_STAGES=4, num_warps=1, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 84dafcf00d82..17a5c735a57f 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -40,7 +40,7 @@ from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled @@ -1431,9 +1431,8 @@ def fused_experts(hidden_states: torch.Tensor, # E8M0 scale, which means we requantize the weight and input to the specific # scale. Fallen back to cutlass or triton for some cases would cause # accuracy issue. 
- if (allow_deep_gemm and use_fp8_w8a8 - and (is_blackwell_deep_gemm_e8m0_used() - or _valid_deep_gemm(hidden_states, w1, w2))): + if (allow_deep_gemm and use_fp8_w8a8 and + (is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2))): assert apply_router_weight_on_input is False assert is_act_and_mul, ( "DeepGemm only supports is_act_and_mul=True for now.") diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py index 486ca881df48..6cd81d97f029 100644 --- a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -10,7 +10,7 @@ DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape, deep_gemm_block_shape) from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts -from vllm.utils.deep_gemm import is_blackwell_deep_gemm_e8m0_used +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): @@ -107,7 +107,7 @@ def workspace_shapes( # Note: the deep gemm workspaces are strictly larger than the triton # workspaces so we can be pessimistic here and allocate for DeepGemm # even if we fall back to triton later, e.g. if expert maps are set. - if self.allow_deep_gemm and (is_blackwell_deep_gemm_e8m0_used() + if self.allow_deep_gemm and (is_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K)): assert self.deep_gemm_expert is not None return self.deep_gemm_expert.workspace_shapes( @@ -143,7 +143,7 @@ def apply( ): use_deep_gemm = (self.allow_deep_gemm and (_valid_deep_gemm(hidden_states, w1, w2) - or is_blackwell_deep_gemm_e8m0_used())) + or is_deep_gemm_e8m0_used())) experts = self.deep_gemm_expert if use_deep_gemm else self.triton_expert assert experts is not None diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d45d368b582d..be358cfa949f 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -48,8 +48,7 @@ from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils import has_deep_gemm -from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, - is_deep_gemm_supported) +from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported from vllm.utils.flashinfer import has_flashinfer_moe if TYPE_CHECKING: @@ -427,7 +426,7 @@ def process_weights_after_loading(self, layer: Module) -> None: # On B200, if E8M0 for DeepGemm is used, we need to # requantize the weight and input to the specific scale # at the same time. - if is_blackwell_deep_gemm_e8m0_used(): + if is_deep_gemm_e8m0_used(): assert layer.weight_block_size is not None block_sz = tuple(layer.weight_block_size) requant_weight_ue8m0_inplace( @@ -734,7 +733,7 @@ def process_weights_after_loading(self, layer: Module) -> None: # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. - if self.allow_deep_gemm and not is_blackwell_deep_gemm_e8m0_used(): + if self.allow_deep_gemm and not is_deep_gemm_e8m0_used(): # Lazy import to avoid CUDA initialization problems. 
if _is_col_major(layer.w13_weight_scale_inv): layer.w13_weight_scale_inv = \ @@ -871,7 +870,7 @@ def process_weights_after_loading(self, layer: Module) -> None: del layer.w13_input_scale del layer.w2_input_scale - if is_blackwell_deep_gemm_e8m0_used(): + if is_deep_gemm_e8m0_used(): assert layer.weight_block_size is not None # Re-quantise the expert weights so their scales are UE8M0. block_sz = tuple(layer.weight_block_size) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index ab1d5383f465..7b324dce3c36 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -20,7 +20,7 @@ from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import cdiv, direct_register_custom_op -from vllm.utils.deep_gemm import (is_blackwell_deep_gemm_e8m0_used, +from vllm.utils.deep_gemm import (is_deep_gemm_e8m0_used, should_use_deepgemm_for_fp8_linear) logger = init_logger(__name__) @@ -385,7 +385,7 @@ def per_token_group_quant_fp8( scaling factor. """ if use_ue8m0 is None: - use_ue8m0 = is_blackwell_deep_gemm_e8m0_used() + use_ue8m0 = is_deep_gemm_e8m0_used() dtype = current_platform.fp8_dtype() if dtype is None else dtype assert (x.shape[-1] % group_size == 0), ( f"the last dimension of `x` {x.shape[-1]} must be divisible " diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 2cd799e5eb5a..bec792465bfb 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -501,6 +501,24 @@ def get_config( if quantization_config is not None: config.quantization_config = quantization_config + # auto-enable DeepGEMM UE8M0 on Hopper if model config requests it + scale_fmt = quantization_config.get("scale_fmt", None) + if scale_fmt in ("ue8m0", ): + if not envs.is_set("VLLM_USE_DEEP_GEMM_E8M0_HOPPER"): + os.environ["VLLM_USE_DEEP_GEMM_E8M0_HOPPER"] = "1" + logger.info_once( + ("Detected quantization_config.scale_fmt=%s; " + "enabling Hopper UE8M0."), + scale_fmt, + ) + elif not envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER: + logger.warning_once( + ("Model config requests UE8M0 " + "(quantization_config.scale_fmt=%s), but " + "VLLM_USE_DEEP_GEMM_E8M0_HOPPER=0 is set; " + "Hopper UE8M0 disabled."), + scale_fmt, + ) if hf_overrides_kw: logger.debug("Overriding HF config with %s", hf_overrides_kw) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index b0bc3a79eb0a..cd1dbfb813fe 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -31,34 +31,33 @@ def is_deep_gemm_supported() -> bool: @functools.cache -def is_blackwell_deep_gemm_e8m0_used() -> bool: +def is_deep_gemm_e8m0_used() -> bool: """Return ``True`` if vLLM is configured to use DeepGEMM " - "E8M0 scale on a Blackwell-class GPU. + "E8M0 scale on a Hopper or Blackwell-class GPU. 
""" if not is_deep_gemm_supported(): - logger.debug_once( + logger.info_once( "DeepGEMM E8M0 disabled: DeepGEMM not supported on this system.") return False - if not envs.VLLM_USE_DEEP_GEMM_E8M0: - logger.debug_once("DeepGEMM E8M0 disabled: VLLM_USE_DEEP_GEMM_E8M0=0.") - return False - _lazy_init() if _fp8_gemm_nt_impl is None: - logger.debug_once( - "DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found") + logger.info_once("DeepGEMM E8M0 disabled: _fp8_gemm_nt_impl not found") return False - enabled = (current_platform.is_cuda() - and current_platform.has_device_capability(100)) - if enabled: - logger.debug_once("DeepGEMM E8M0 enabled on Blackwell GPU.") - else: - logger.debug_once( - "DeepGEMM E8M0 disabled: not running on Blackwell GPU.") - return enabled + if current_platform.is_device_capability(100) and \ + envs.VLLM_USE_DEEP_GEMM_E8M0: + logger.info_once("DeepGEMM E8M0 enabled on Blackwell GPU.") + return True + + if current_platform.is_device_capability(90) and \ + envs.VLLM_USE_DEEP_GEMM_E8M0_HOPPER: + logger.info_once("DeepGEMM E8M0 enabled on Hopper GPU.") + return True + + logger.info_once("DeepGEMM E8M0 disabled on current configuration.") + return False def _missing(*_: Any, **__: Any) -> NoReturn: @@ -124,20 +123,18 @@ def fp8_gemm_nt(*args, **kwargs): _lazy_init() if _fp8_gemm_nt_impl is None: return _missing(*args, **kwargs) - return _fp8_gemm_nt_impl( - *args, - disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), - **kwargs) + return _fp8_gemm_nt_impl(*args, + disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), + **kwargs) def m_grouped_fp8_gemm_nt_contiguous(*args, **kwargs): _lazy_init() if _grouped_impl is None: return _missing(*args, **kwargs) - return _grouped_impl( - *args, - disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), - **kwargs) + return _grouped_impl(*args, + disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), + **kwargs) def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): @@ -145,9 +142,7 @@ def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): if _grouped_masked_impl is None: return _missing(*args, **kwargs) return _grouped_masked_impl( - *args, - disable_ue8m0_cast=not is_blackwell_deep_gemm_e8m0_used(), - **kwargs) + *args, disable_ue8m0_cast=not is_deep_gemm_e8m0_used(), **kwargs) def _ceil_to_ue8m0(x: torch.Tensor): @@ -211,7 +206,7 @@ def should_use_deepgemm_for_fp8_linear(output_dtype: torch.dtype, "m_grouped_fp8_gemm_nt_contiguous", "fp8_m_grouped_gemm_nt_masked", "per_block_cast_to_fp8", - "is_blackwell_deep_gemm_e8m0_used", + "is_deep_gemm_e8m0_used", "is_deep_gemm_supported", "should_use_deepgemm_for_fp8_linear", ] From 841490434aaee4b1c8d8427112af740b6662f384 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Wed, 27 Aug 2025 22:45:17 +0800 Subject: [PATCH 028/125] [Model] Enable native HF format InternVL support (#23742) Signed-off-by: Isotr0py --- docs/models/supported_models.md | 1 + .../multimodal/generation/test_common.py | 29 +++++++++---------- tests/models/registry.py | 3 +- vllm/model_executor/models/registry.py | 1 + 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 35a5fa0c2e42..20cf75873af7 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -629,6 +629,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. 
| ✅︎ | | ✅︎ | | `InternS1ForConditionalGeneration` | Intern-S1 | T + IE+ + VE+ | `internlm/Intern-S1`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + IE+ + (VE+) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + IE+ + VE+ | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ | ✅︎ | | `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + IE+ + VE+ | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ | | `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I+ | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | ✅︎ | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ | diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 96208f8eda62..2b60faae8ec0 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -222,21 +222,6 @@ }, marks=[large_gpu_mark(min_gb=32)], ), - # Check "auto" with fallback to transformers - "internvl-transformers": VLMTestInfo( - models=["OpenGVLab/InternVL3-1B-hf"], - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 - img_idx_to_prompt=lambda idx: "", - max_model_len=4096, - use_tokenizer_eos=True, - image_size_factors=[(0.25, 0.5, 1.0)], - vllm_runner_kwargs={ - "model_impl": "auto", - }, - auto_cls=AutoModelForImageTextToText, - marks=[pytest.mark.core_model], - ), #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], @@ -461,6 +446,20 @@ use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, ), + "intern_vl-hf": VLMTestInfo( + models=["OpenGVLab/InternVL3-1B-hf"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO, + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "", + video_idx_to_prompt=lambda idx: "