From 99fc34728ccb708a91c70f4af627f4eda74b8e18 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Fri, 5 Sep 2025 17:30:46 -0700 Subject: [PATCH 01/15] move weights loading related logic to ModelLoader Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- .../_torch/pyexecutor/model_engine.py | 305 +---------------- .../_torch/pyexecutor/model_loader.py | 311 ++++++++++++++++++ tensorrt_llm/bench/benchmark/utils/general.py | 2 +- tensorrt_llm/bench/dataclasses/reporting.py | 2 +- 4 files changed, 327 insertions(+), 293 deletions(-) create mode 100644 tensorrt_llm/_torch/pyexecutor/model_loader.py diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 1c313db240e..e35495a0535 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -1,12 +1,9 @@ import bisect import contextlib -import copy import functools import gc import inspect import math -import os -import traceback import weakref from abc import ABC, abstractmethod from contextlib import contextmanager @@ -17,16 +14,13 @@ import tensorrt_llm.bindings.internal.userbuffers as ub from tensorrt_llm._utils import (is_trace_enabled, nvtx_range, release_gc, - str_dtype_to_torch, torch_dtype_to_str, - trace_func) + torch_dtype_to_str, trace_func) from tensorrt_llm.inputs.multimodal import (MultimodalParams, MultimodalRuntimeData) from tensorrt_llm.logger import logger from tensorrt_llm.lora_helper import LoraConfig from tensorrt_llm.lora_manager import LoraModelConfig from tensorrt_llm.mapping import CpType, Mapping -from tensorrt_llm.models.modeling_utils import QuantAlgo -from tensorrt_llm.quantization.utils.fp4_utils import float4_e2m1x2 from ..attention_backend.interface import (AttentionMetadata, AttentionRuntimeFeatures) @@ -40,13 +34,10 @@ from ..distributed.communicator import init_pp_comm from ..expert_statistic import ExpertStatistic from ..metadata import KVCacheParams -from ..model_config import ModelConfig, MoeLoadBalancerConfig -from ..models import AutoModelForCausalLM from ..models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader -from ..models.modeling_utils import (DecoderModelForCausalLM, MetaInitMode, - timing) -from ..modules.fused_moe.moe_load_balancer import ( - MoeLoadBalancer, MoeLoadBalancerIterContext, maybe_create_moe_load_balancer) +from ..models.modeling_utils import DecoderModelForCausalLM +from ..modules.fused_moe.moe_load_balancer import (MoeLoadBalancer, + MoeLoadBalancerIterContext) from ..speculative import (SpecMetadata, get_num_extra_kv_tokens, get_spec_metadata, update_spec_config_from_model_config) @@ -55,12 +46,13 @@ from ..utils import (get_model_extra_attrs, set_per_request_piecewise_cuda_graph_flag, set_torch_compiling, with_model_extra_attrs) -from .config import LoadFormat, PyTorchConfig +from .config import PyTorchConfig from .config_utils import is_mla from .cuda_graph_runner import CUDAGraphRunner from .guided_decoder import CapturableGuidedDecoder from .layerwise_nvtx_marker import LayerwiseNvtxMarker from .llm_request import get_draft_token_length +from .model_loader import ModelLoader from .resource_manager import (BaseResourceManager, KVCacheManager, ResourceManager, ResourceManagerType) from .sampler import SampleStateTensors @@ -95,137 +87,6 @@ def warmup(self, resource_manager: ResourceManager) -> None: return -_KV_CACHE_MAP = { - "fp8": QuantAlgo.FP8.value, - "nvfp4": QuantAlgo.NVFP4.value, - "auto": "auto" -} 
-_VALID_KV_CACHE_DTYPES = ("fp8", "nvfp4", "auto") - - -def validate_and_set_mamba_ssm_cache_dtype(config: ModelConfig, - mamba_ssm_cache_dtype: str) -> None: - if mamba_ssm_cache_dtype == "auto": - mamba_ssm_cache_dtype = config.pretrained_config.torch_dtype - else: - mamba_ssm_cache_dtype = str_dtype_to_torch(mamba_ssm_cache_dtype) - - config.quant_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype - - -def validate_and_set_kv_cache_quant(model_config: ModelConfig, - pyt_kv_cache_dtype: str) -> QuantAlgo: - logger.info( - f'Validating KV Cache config against kv_cache_dtype="{pyt_kv_cache_dtype}"' - ) - # Quantization from hf_quant_config.json - kv_cache_quant = model_config.quant_config.kv_cache_quant_algo - # PyTorch configuration quantization - valid_pyt_quant = bool(pyt_kv_cache_dtype in _VALID_KV_CACHE_DTYPES) - mapped_pyt_quant = _KV_CACHE_MAP.get(pyt_kv_cache_dtype, None) - - # If we're letting the checkpoint dictate the quant with auto, simply - # return and do not modify the checkpoint. - if pyt_kv_cache_dtype == "auto": - logger.info( - f'KV cache quantization set to "{pyt_kv_cache_dtype}". Using ' - "checkpoint KV quantization.") - return - - # If we have an invalid quantization, simply raise an exception. - if not valid_pyt_quant: - raise ValueError( - "Overriding KV cache quantization with an invalid type " - f'"PyTorchConfig.kv_cache_dtype="{pyt_kv_cache_dtype}" ' - f'Accepted types are "{_VALID_KV_CACHE_DTYPES}".') - - # If we get to this point we have a valid quantization setting, but if - # we have an existing setting and it doesn't match we shouldn't proceed. - if kv_cache_quant is not None and mapped_pyt_quant != kv_cache_quant: - raise RuntimeError( - "Attempting to override KV cache quantization " - f'"{kv_cache_quant}" with PyTorchConfig.kv_cache_dtype=' - f'"{pyt_kv_cache_dtype}". You cannot override a checkpoint with a ' - "pre-quantized KV cache that doesn't match.") - - # We have an open ended KV cache in the checkpoint - # and we have a specified override. - model_config.quant_config.kv_cache_quant_algo = mapped_pyt_quant - - -def initialize_dummy_weights( - model: torch.nn.Module, - low: float = -1e-3, - high: float = 1e-3, - seed: int = 0, -) -> None: - """ - This is similar to this function in SGLang with a few changes: - https://github.com/sgl-project/sglang/blob/e074e76b31d4fff13e87a455dbc3acdaa92c537a/python/sglang/srt/model_loader/weight_utils.py#L577 - - This method is used to initialize weights with dummy values for testing - models without checkpoints. Unquantized (FP16/BF16/etc) values are generated - from a uniform distribution over the interval (low, high). - - For some quantized types (FP8/NVFP4), torch has no built-in way to generate random values. - We simply generate values uniformly across an interval that has been empirically verified - to not generate NaNs/inf for these. - """ - - def _get_random_min_max(dtype: torch.dtype) -> Tuple[int, int]: - # These values are not necessarily the largest possible min/max, - # they need to be small enough to avoid NaNs. - if dtype in (torch.float8_e4m3fn, torch.int8): - return (-3.0, 3.0) - - elif dtype == float4_e2m1x2: - # These correspond to bits of 2 packed FP4 values. - # Because we only go up to 64, the high 4 bits will - # always be 0. But this is fine - we just need values - # that won't generate NaNs. 
- return (0, 64) - - else: - raise NotImplementedError(f"Unknown quantized type: {dtype}.") - - for param in model.state_dict().values(): - generator = torch.Generator(device=param.data.device) - generator.manual_seed(seed) - dtype = param.data.dtype - - if param.data.element_size() < 2: - # We need to do a cast/round since torch doesn't have uniform_ - # support for these dtypes. - tmp_param = torch.empty(param.data.shape, - dtype=torch.float16, - device=param.data.device) - - quant_min, quant_max = _get_random_min_max(dtype) - tmp_param = tmp_param.uniform_(quant_min, - quant_max, - generator=generator) - - param.data.copy_(tmp_param.to(dtype)) - - # Note: no need to to mess with int32 params, these are probably - # constants and not weights. - elif torch.is_floating_point(param): - param.uniform_(low, high, generator=generator) - - -def get_rank_model_storage(model): - total_bytes = 0 - for _, param in model.named_parameters(): - if param.device.type == 'cuda' and param.device.index == torch.cuda.current_device( - ): - total_bytes += param.element_size() * param.nelement() - for _, buf in model.named_buffers(): - if buf.device.type == 'cuda' and buf.device.index == torch.cuda.current_device( - ): - total_bytes += buf.element_size() * buf.nelement() - return total_bytes - - def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int], max_batch_size: int, max_num_tokens: int, max_draft_len: int, @@ -302,20 +163,17 @@ def __init__( ) attn_backend = pytorch_backend_config.attn_backend - self.model = self._load_model( - model_path, + loader = ModelLoader( + pytorch_backend_config=pytorch_backend_config, mapping=self.mapping, - checkpoint_loader=checkpoint_loader, - attn_backend=attn_backend, - moe_backend=pytorch_backend_config.moe_backend, - moe_disable_finalize_fusion=pytorch_backend_config. - moe_disable_finalize_fusion, - load_format=pytorch_backend_config.load_format, + spec_config=self.spec_config, max_num_tokens=max_num_tokens, - moe_max_num_tokens=pytorch_backend_config.moe_max_num_tokens, - moe_load_balancer=pytorch_backend_config.moe_load_balancer, + max_seq_len=max_seq_len, lora_config=lora_config, - drafting_loop_wrapper=drafting_loop_wrapper) + ) + self.model = loader.load(checkpoint_dir=model_path, + checkpoint_loader=checkpoint_loader, + drafting_loop_wrapper=drafting_loop_wrapper) # In case that some tests use stub models and override `_load_model`. if not hasattr(self.model, 'extra_attrs'): self.model.extra_attrs = {} @@ -944,141 +802,6 @@ def __del__(self) -> None: # Release model weights. release_gc() - def _load_model(self, - checkpoint_dir: str, - checkpoint_loader: BaseCheckpointLoader, - load_format: LoadFormat, - max_num_tokens: int, - moe_max_num_tokens: Optional[int] = None, - moe_load_balancer: Optional[MoeLoadBalancerConfig] = None, - lora_config: Optional[LoraConfig] = None, - drafting_loop_wrapper: Optional[Callable[ - [torch.nn.Module], torch.nn.Module]] = None, - **kwargs) -> DecoderModelForCausalLM: - config = checkpoint_loader.load_config( - checkpoint_dir, - trust_remote_code=True, - enable_min_latency=self.pytorch_backend_config.enable_min_latency, - use_cuda_graph=self.pytorch_backend_config.use_cuda_graph, - force_dynamic_quantization=self.pytorch_backend_config. 
- force_dynamic_quantization, - spec_config=self.spec_config, - max_num_tokens=max_num_tokens, - max_seq_len=self.max_seq_len, - moe_max_num_tokens=moe_max_num_tokens, - moe_load_balancer=moe_load_balancer, - lora_config=lora_config, - allreduce_strategy=self.pytorch_backend_config.allreduce_strategy, - mm_encoder_only=self.pytorch_backend_config.mm_encoder_only, - **kwargs) - - validate_and_set_kv_cache_quant( - config, self.pytorch_backend_config.kv_cache_dtype) - validate_and_set_mamba_ssm_cache_dtype( - config, self.pytorch_backend_config.mamba_ssm_cache_dtype) - - num_layers = int(os.environ.get("TLLM_OVERRIDE_LAYER_NUM", "0")) - if num_layers > 0: - config.pretrained_config.num_hidden_layers = num_layers - for sub_config in ["text_config", "vision_config"]: - if hasattr(config.pretrained_config, sub_config): - getattr(config.pretrained_config, - sub_config).num_hidden_layers = num_layers - - with timing("Model init total"), maybe_create_moe_load_balancer( - config, self.mapping) as moe_load_balancer: - - try: - # config will be modified in-place for some models, like Qwen2 - config_copy = copy.deepcopy(config) - with MetaInitMode(): - model = AutoModelForCausalLM.from_config(config_copy) - - memo = dict() - - def init_meta_tensor(t: torch.Tensor): - if t.device != torch.device('meta'): - return t - if t not in memo: - memo[t] = torch.empty_like(t, device='cuda') - return memo[t] - - model._apply(init_meta_tensor) - config = config_copy - - except Exception: - logger.info( - f"Fallback to regular model init: {traceback.format_exc(limit=1)}\n" - ) - model = AutoModelForCausalLM.from_config(config) - - model.to("cuda") - rank_model_storage = get_rank_model_storage(model) - logger.info( - f"Use {rank_model_storage / (1024**3):.2f} GB for model weights." - ) - if load_format == LoadFormat.AUTO: - if hasattr(model, 'llm_checkpoint_dir'): - weights = checkpoint_loader.load_weights( - model.llm_checkpoint_dir) - else: - weights = checkpoint_loader.load_weights(checkpoint_dir) - - weight_mapper = checkpoint_loader.get_initialized_weight_mapper( - model, config) - self._call_load_weights(model.load_weights, weights, - weight_mapper) - - if self.spec_config is not None and self.spec_config.spec_dec_mode.need_load_draft_weights( - ): - weights = checkpoint_loader.load_weights( - self.spec_config.speculative_model_dir) - self._call_load_weights(model.load_draft_weights, weights, - weight_mapper) - - elif load_format == LoadFormat.DUMMY: - initialize_dummy_weights(model) - if self.spec_config is not None and self.spec_config.spec_dec_mode.need_load_draft_weights( - ): - model.draft_model.load_weights_from_target_model(model) - - elif load_format == LoadFormat.VISION_ONLY: - # Vision weights are already loaded within the model. - logger.info( - "LoadFormat.VISION_ONLY: skipping weight loading; using preloaded vision weights." 
- ) - - else: - raise NotImplementedError( - f"No load support for load format: {load_format}") - - if isinstance(moe_load_balancer, MoeLoadBalancer): - setattr(self, "moe_load_balancer", moe_load_balancer) - moe_load_balancer.register_weight_slots_after_to_cuda() - logger.info("moe_load_balancer finalizing model...") - moe_load_balancer.finalize_model() - logger.info("moe_load_balancer finalize model done") - - torch.cuda.current_stream().synchronize() - - if drafting_loop_wrapper is not None: - model = drafting_loop_wrapper(model) - self.model_is_wrapped = True - else: - self.model_is_wrapped = False - - return model - - def _call_load_weights(self, load_method, weights, weight_mapper): - # TODO smor- this is a temporary solution to load weights. - # Once checkpoint format is unified, this method will be removed. - from inspect import getfullargspec - args = getfullargspec(load_method).args - if "weight_mapper" in args: - load_method(weights, weight_mapper=weight_mapper) - else: - load_method(weights) - def _init_max_seq_len(self): # For mm_encoder_only mode, infer_max_seq_len() is for LLM decoder models if hasattr(self.model, 'infer_max_seq_len'): diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py new file mode 100644 index 00000000000..389e24096f5 --- /dev/null +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -0,0 +1,311 @@ +import copy +import inspect +import os +import traceback +from typing import Callable, Optional, Tuple + +import torch + +from tensorrt_llm._utils import str_dtype_to_torch +from tensorrt_llm.logger import logger +from tensorrt_llm.lora_helper import LoraConfig +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models.modeling_utils import QuantAlgo +from tensorrt_llm.quantization.utils.fp4_utils import float4_e2m1x2 + +from ..model_config import ModelConfig +from ..models import AutoModelForCausalLM +from ..models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader +from ..models.modeling_utils import (DecoderModelForCausalLM, MetaInitMode, + timing) +from ..modules.fused_moe.moe_load_balancer import ( + MoeLoadBalancer, maybe_create_moe_load_balancer) +from .config import LoadFormat, PyTorchConfig + +# Constants from the original file for KV cache validation +_KV_CACHE_MAP = { + "fp8": QuantAlgo.FP8.value, + "nvfp4": QuantAlgo.NVFP4.value, + "auto": "auto" +} +_VALID_KV_CACHE_DTYPES = ("fp8", "nvfp4", "auto") + + +def validate_and_set_mamba_ssm_cache_dtype(config: ModelConfig, + mamba_ssm_cache_dtype: str) -> None: + if mamba_ssm_cache_dtype == "auto": + mamba_ssm_cache_dtype = config.pretrained_config.torch_dtype + else: + mamba_ssm_cache_dtype = str_dtype_to_torch(mamba_ssm_cache_dtype) + + config.quant_config.mamba_ssm_cache_dtype = mamba_ssm_cache_dtype + + +def validate_and_set_kv_cache_quant(model_config: ModelConfig, + pyt_kv_cache_dtype: str) -> QuantAlgo: + logger.info( + f'Validating KV Cache config against kv_cache_dtype="{pyt_kv_cache_dtype}"' + ) + # Quantization from hf_quant_config.json + kv_cache_quant = model_config.quant_config.kv_cache_quant_algo + # PyTorch configuration quantization + valid_pyt_quant = bool(pyt_kv_cache_dtype in _VALID_KV_CACHE_DTYPES) + mapped_pyt_quant = _KV_CACHE_MAP.get(pyt_kv_cache_dtype, None) + + # If we're letting the checkpoint dictate the quant with auto, simply + # return and do not modify the checkpoint. + if pyt_kv_cache_dtype == "auto": + logger.info( + f'KV cache quantization set to "{pyt_kv_cache_dtype}". 
Using ' + "checkpoint KV quantization.") + return + + # If we have an invalid quantization, simply raise an exception. + if not valid_pyt_quant: + raise ValueError( + "Overriding KV cache quantization with an invalid type " + f'"PyTorchConfig.kv_cache_dtype="{pyt_kv_cache_dtype}" ' + f'Accepted types are "{_VALID_KV_CACHE_DTYPES}".') + + # If we get to this point we have a valid quantization setting, but if + # we have an existing setting and it doesn't match we shouldn't proceed. + if kv_cache_quant is not None and mapped_pyt_quant != kv_cache_quant: + raise RuntimeError( + "Attempting to override KV cache quantization " + f'"{kv_cache_quant}" with PyTorchConfig.kv_cache_dtype=' + f'"{pyt_kv_cache_dtype}". You cannot override a checkpoint with a ' + "pre-quantized KV cache that doesn't match.") + + # We have an open ended KV cache in the checkpoint + # and we have a specified override. + model_config.quant_config.kv_cache_quant_algo = mapped_pyt_quant + + +class ModelLoader: + """ + Handles the loading, configuration, and weight initialization of a PyTorch model. + This class isolates model loading logic from the main execution engine. + """ + + def __init__(self, + pytorch_backend_config: PyTorchConfig, + mapping: Mapping, + spec_config: Optional["DecodingBaseConfig"], + max_num_tokens: int, + max_seq_len: Optional[int], + lora_config: Optional[LoraConfig] = None): + """ + Initializes the ModelLoader. + + Args: + pytorch_backend_config: Configuration for the PyTorch backend. + mapping: The distributed mapping configuration. + spec_config: Configuration for speculative decoding. + max_num_tokens: The maximum number of tokens the engine will handle. + max_seq_len: The maximum sequence length. + lora_config: Configuration for LoRA. + """ + self.pytorch_backend_config = pytorch_backend_config + self.mapping = mapping + self.spec_config = spec_config + self.max_num_tokens = max_num_tokens + self.max_seq_len = max_seq_len + self.lora_config = lora_config + self.moe_load_balancer = None + + def load( + self, + checkpoint_dir: str, + checkpoint_loader: BaseCheckpointLoader, + drafting_loop_wrapper: Optional[Callable[[torch.nn.Module], + torch.nn.Module]] = None + ) -> DecoderModelForCausalLM: + """ + Loads the model, its weights, and applies necessary configurations. + + Args: + checkpoint_dir: The directory of the model checkpoint. + checkpoint_loader: The loader object for model checkpoints. + drafting_loop_wrapper: An optional wrapper for speculative decoding models. + + Returns: + The loaded and initialized PyTorch model. 
+ """ + config = self._load_and_validate_config(checkpoint_dir, + checkpoint_loader) + + with timing("Model init total"), maybe_create_moe_load_balancer( + config, self.mapping) as moe_load_balancer: + + # Attempt to initialize the model on the meta device for speed + try: + config_copy = copy.deepcopy(config) + with MetaInitMode(): + model = AutoModelForCausalLM.from_config(config_copy) + self._materialize_meta_model(model) + config = config_copy + except Exception: + logger.info("Fallback to regular model init: " + f"{traceback.format_exc(limit=1)}\n") + model = AutoModelForCausalLM.from_config(config) + + model.to("cuda") + + logger.info("Use %.2f GB for model weights.", + self._get_rank_model_storage(model) / (1024**3)) + + self._load_weights(model, config, checkpoint_dir, checkpoint_loader) + + if isinstance(moe_load_balancer, MoeLoadBalancer): + self.moe_load_balancer = moe_load_balancer + moe_load_balancer.register_weight_slots_after_to_cuda() + logger.info("moe_load_balancer finalizing model...") + moe_load_balancer.finalize_model() + logger.info("moe_load_balancer finalize model done") + + torch.cuda.current_stream().synchronize() + + if drafting_loop_wrapper is not None: + model = drafting_loop_wrapper(model) + + return model + + def _load_weights(self, model: DecoderModelForCausalLM, config: ModelConfig, + checkpoint_dir: str, + checkpoint_loader: BaseCheckpointLoader): + """Handles the logic for loading weights based on the specified format.""" + load_format = self.pytorch_backend_config.load_format + + if load_format == LoadFormat.AUTO: + checkpoint_path = (getattr(model, 'llm_checkpoint_dir', None) + or checkpoint_dir) + weights = checkpoint_loader.load_weights(checkpoint_path) + weight_mapper = checkpoint_loader.get_initialized_weight_mapper( + model, config) + self._call_load_weights(model.load_weights, weights, weight_mapper) + + # Load draft model weights if needed for speculative decoding + if self.spec_config and self.spec_config.spec_dec_mode.need_load_draft_weights( + ): + draft_weights = checkpoint_loader.load_weights( + self.spec_config.speculative_model_dir) + self._call_load_weights(model.load_draft_weights, draft_weights, + weight_mapper) + + elif load_format == LoadFormat.DUMMY: + self._initialize_dummy_weights(model) + if self.spec_config and self.spec_config.spec_dec_mode.need_load_draft_weights( + ): + model.draft_model.load_weights_from_target_model(model) + + elif load_format == LoadFormat.VISION_ONLY: + logger.info( + "LoadFormat.VISION_ONLY: skipping weight loading; using preloaded vision weights." + ) + + else: + raise NotImplementedError( + f"No load support for load format: {load_format}") + + def _load_and_validate_config( + self, checkpoint_dir: str, + checkpoint_loader: BaseCheckpointLoader) -> ModelConfig: + """Loads and validates the model configuration.""" + config = checkpoint_loader.load_config( + checkpoint_dir, + trust_remote_code=True, + enable_min_latency=self.pytorch_backend_config.enable_min_latency, + use_cuda_graph=self.pytorch_backend_config.use_cuda_graph, + force_dynamic_quantization=self.pytorch_backend_config. 
+ force_dynamic_quantization, + spec_config=self.spec_config, + max_num_tokens=self.max_num_tokens, + max_seq_len=self.max_seq_len, + moe_max_num_tokens=self.pytorch_backend_config.moe_max_num_tokens, + moe_load_balancer=self.pytorch_backend_config.moe_load_balancer, + lora_config=self.lora_config, + allreduce_strategy=self.pytorch_backend_config.allreduce_strategy, + mm_encoder_only=self.pytorch_backend_config.mm_encoder_only) + + validate_and_set_kv_cache_quant( + config, self.pytorch_backend_config.kv_cache_dtype) + validate_and_set_mamba_ssm_cache_dtype( + config, self.pytorch_backend_config.mamba_ssm_cache_dtype) + + # Allow overriding the number of layers via environment variable + num_layers_override = int(os.environ.get("TLLM_OVERRIDE_LAYER_NUM", + "0")) + if num_layers_override > 0: + config.pretrained_config.num_hidden_layers = num_layers_override + for sub_config in ["text_config", "vision_config"]: + if hasattr(config.pretrained_config, sub_config): + getattr(config.pretrained_config, + sub_config).num_hidden_layers = num_layers_override + return config + + @staticmethod + def _materialize_meta_model(model: torch.nn.Module): + """Converts a model on the 'meta' device to a materialized model on CUDA.""" + memo = {} + + def init_meta_tensor(t: torch.Tensor): + if t.device != torch.device('meta'): + return t + if t not in memo: + memo[t] = torch.empty_like(t, device='cuda') + return memo[t] + + model._apply(init_meta_tensor) + + @staticmethod + def _call_load_weights(load_method: Callable, weights, weight_mapper): + """Calls the model's weight loading method with the correct arguments.""" + args = inspect.getfullargspec(load_method).args + if "weight_mapper" in args: + load_method(weights, weight_mapper=weight_mapper) + else: + load_method(weights) + + @staticmethod + def _get_rank_model_storage(model: torch.nn.Module) -> int: + """Calculates the total memory in bytes used by the model's weights and buffers on the current device.""" + total_bytes = 0 + current_device_idx = torch.cuda.current_device() + for param in model.parameters(): + if param.device.type == 'cuda' and param.device.index == current_device_idx: + total_bytes += param.element_size() * param.nelement() + for buf in model.buffers(): + if buf.device.type == 'cuda' and buf.device.index == current_device_idx: + total_bytes += buf.element_size() * buf.nelement() + return total_bytes + + @staticmethod + def _initialize_dummy_weights(model: torch.nn.Module, + low: float = -1e-3, + high: float = 1e-3, + seed: int = 0) -> None: + """Initializes model weights with random dummy values for testing purposes.""" + + # This function's logic is copied directly from the original file + def _get_random_min_max(dtype: torch.dtype) -> Tuple[int, int]: + if dtype in (torch.float8_e4m3fn, torch.int8): + return (-3.0, 3.0) + elif dtype == float4_e2m1x2: + return (0, 64) + else: + raise NotImplementedError(f"Unknown quantized type: {dtype}.") + + for param in model.state_dict().values(): + generator = torch.Generator(device=param.data.device) + generator.manual_seed(seed) + dtype = param.data.dtype + + if param.data.element_size() < 2: + tmp_param = torch.empty_like(param.data, + dtype=torch.float16, + device=param.data.device) + quant_min, quant_max = _get_random_min_max(dtype) + tmp_param.uniform_(quant_min, quant_max, generator=generator) + param.data.copy_(tmp_param.to(dtype)) + elif torch.is_floating_point(param): + param.uniform_(low, high, generator=generator) diff --git a/tensorrt_llm/bench/benchmark/utils/general.py 
b/tensorrt_llm/bench/benchmark/utils/general.py index ff3cd933ce1..a21511f38cd 100755 --- a/tensorrt_llm/bench/benchmark/utils/general.py +++ b/tensorrt_llm/bench/benchmark/utils/general.py @@ -8,7 +8,7 @@ import yaml -from tensorrt_llm._torch.pyexecutor.model_engine import \ +from tensorrt_llm._torch.pyexecutor.model_loader import \ validate_and_set_kv_cache_quant from tensorrt_llm.bench.build.build import (get_benchmark_engine_settings, get_model_config) diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py index b12873b5637..70e4cae646b 100755 --- a/tensorrt_llm/bench/dataclasses/reporting.py +++ b/tensorrt_llm/bench/dataclasses/reporting.py @@ -4,7 +4,7 @@ from collections import defaultdict from typing import Any, Dict, List, NamedTuple -from tensorrt_llm._torch.pyexecutor.model_engine import \ +from tensorrt_llm._torch.pyexecutor.model_loader import \ validate_and_set_kv_cache_quant from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig from tensorrt_llm.bench.dataclasses.general import DatasetMetadata From 018b022b9dadde651bbe2666ed04b7ba148f53f1 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Fri, 5 Sep 2025 17:39:19 -0700 Subject: [PATCH 02/15] fix Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/model_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index e35495a0535..92445327e25 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -174,6 +174,7 @@ def __init__( self.model = loader.load(checkpoint_dir=model_path, checkpoint_loader=checkpoint_loader, drafting_loop_wrapper=drafting_loop_wrapper) + self.model_is_wrapped = drafting_loop_wrapper is not None # In case that some tests use stub models and override `_load_model`. if not hasattr(self.model, 'extra_attrs'): self.model.extra_attrs = {} From cf84a58e1b115ecd82ce12c4d718e16583677a63 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Mon, 8 Sep 2025 14:46:42 -0700 Subject: [PATCH 03/15] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- .../_torch/pyexecutor/model_engine.py | 2 +- .../_torch/pyexecutor/model_loader.py | 234 ++++++++++-------- 2 files changed, 130 insertions(+), 106 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index bfc8e02b69d..da2a133cab0 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -175,7 +175,7 @@ def __init__( self.model = loader.load(checkpoint_dir=model_path, checkpoint_loader=checkpoint_loader, drafting_loop_wrapper=drafting_loop_wrapper) - self.model_is_wrapped = drafting_loop_wrapper is not None + self.model_is_wrapped = loader.model_is_wrapped # In case that some tests use stub models and override `_load_model`. 
if not hasattr(self.model, 'extra_attrs'): self.model.extra_attrs = {} diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 389e24096f5..35bbd2d6bc7 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -81,6 +81,77 @@ def validate_and_set_kv_cache_quant(model_config: ModelConfig, model_config.quant_config.kv_cache_quant_algo = mapped_pyt_quant +def initialize_dummy_weights( + model: torch.nn.Module, + low: float = -1e-3, + high: float = 1e-3, + seed: int = 0, +) -> None: + """ + This is similar to this function in SGLang with a few changes: + https://github.com/sgl-project/sglang/blob/e074e76b31d4fff13e87a455dbc3acdaa92c537a/python/sglang/srt/model_loader/weight_utils.py#L577 + This method is used to initialize weights with dummy values for testing + models without checkpoints. Unquantized (FP16/BF16/etc) values are generated + from a uniform distribution over the interval (low, high). + For some quantized types (FP8/NVFP4), torch has no built-in way to generate random values. + We simply generate values uniformly across an interval that has been empirically verified + to not generate NaNs/inf for these. + """ + + def _get_random_min_max(dtype: torch.dtype) -> Tuple[int, int]: + # These values are not necessarily the largest possible min/max, + # they need to be small enough to avoid NaNs. + if dtype in (torch.float8_e4m3fn, torch.int8): + return (-3.0, 3.0) + + elif dtype == float4_e2m1x2: + # These correspond to bits of 2 packed FP4 values. + # Because we only go up to 64, the high 4 bits will + # always be 0. But this is fine - we just need values + # that won't generate NaNs. + return (0, 64) + + else: + raise NotImplementedError(f"Unknown quantized type: {dtype}.") + + for param in model.state_dict().values(): + generator = torch.Generator(device=param.data.device) + generator.manual_seed(seed) + dtype = param.data.dtype + + if param.data.element_size() < 2: + # We need to do a cast/round since torch doesn't have uniform_ + # support for these dtypes. + tmp_param = torch.empty(param.data.shape, + dtype=torch.float16, + device=param.data.device) + + quant_min, quant_max = _get_random_min_max(dtype) + tmp_param = tmp_param.uniform_(quant_min, + quant_max, + generator=generator) + + param.data.copy_(tmp_param.to(dtype)) + + # Note: no need to to mess with int32 params, these are probably + # constants and not weights. + elif torch.is_floating_point(param): + param.uniform_(low, high, generator=generator) + + +def get_rank_model_storage(model): + total_bytes = 0 + for _, param in model.named_parameters(): + if param.device.type == 'cuda' and param.device.index == torch.cuda.current_device( + ): + total_bytes += param.element_size() * param.nelement() + for _, buf in model.named_buffers(): + if buf.device.type == 'cuda' and buf.device.index == torch.cuda.current_device( + ): + total_bytes += buf.element_size() * buf.nelement() + return total_bytes + + class ModelLoader: """ Handles the loading, configuration, and weight initialization of a PyTorch model. 
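A minimal sketch of how the two module-level helpers kept by this hunk (initialize_dummy_weights and get_rank_model_storage) can be exercised on a toy module, assuming a CUDA device is available and using the module path introduced by this series; the toy Linear layer is illustrative only:

    import torch
    from tensorrt_llm._torch.pyexecutor.model_loader import (
        get_rank_model_storage, initialize_dummy_weights)

    # Toy stand-in for a real model; bf16 weights take the uniform(low, high) path.
    toy = torch.nn.Linear(1024, 1024, bias=False,
                          dtype=torch.bfloat16, device="cuda")
    initialize_dummy_weights(toy, low=-1e-3, high=1e-3, seed=0)  # no checkpoint needed
    print(f"{get_rank_model_storage(toy) / 1024**2:.2f} MiB of weights on this rank")
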
@@ -133,31 +204,77 @@ def load( """ config = self._load_and_validate_config(checkpoint_dir, checkpoint_loader) + load_format = self.pytorch_backend_config.load_format with timing("Model init total"), maybe_create_moe_load_balancer( config, self.mapping) as moe_load_balancer: - # Attempt to initialize the model on the meta device for speed try: + # config will be modified in-place for some models, like Qwen2 config_copy = copy.deepcopy(config) with MetaInitMode(): model = AutoModelForCausalLM.from_config(config_copy) - self._materialize_meta_model(model) + + memo = dict() + + def init_meta_tensor(t: torch.Tensor): + if t.device != torch.device('meta'): + return t + if t not in memo: + memo[t] = torch.empty_like(t, device='cuda') + return memo[t] + + model._apply(init_meta_tensor) config = config_copy + except Exception: - logger.info("Fallback to regular model init: " - f"{traceback.format_exc(limit=1)}\n") + logger.info( + f"Fallback to regular model init: {traceback.format_exc(limit=1)}\n" + ) model = AutoModelForCausalLM.from_config(config) model.to("cuda") + rank_model_storage = get_rank_model_storage(model) + logger.info( + f"Use {rank_model_storage / (1024**3):.2f} GB for model weights." + ) + if load_format == LoadFormat.AUTO: + if hasattr(model, 'llm_checkpoint_dir'): + weights = checkpoint_loader.load_weights( + model.llm_checkpoint_dir) + else: + weights = checkpoint_loader.load_weights(checkpoint_dir) + + weight_mapper = checkpoint_loader.get_initialized_weight_mapper( + model, config) + self._call_load_weights(model.load_weights, weights, + weight_mapper) - logger.info("Use %.2f GB for model weights.", - self._get_rank_model_storage(model) / (1024**3)) + if self.spec_config is not None and self.spec_config.spec_dec_mode.need_load_draft_weights( + ): + weights = checkpoint_loader.load_weights( + self.spec_config.speculative_model_dir) + self._call_load_weights(model.load_draft_weights, weights, + weight_mapper) + + elif load_format == LoadFormat.DUMMY: + initialize_dummy_weights(model) + if self.spec_config is not None and self.spec_config.spec_dec_mode.need_load_draft_weights( + ): + model.draft_model.load_weights_from_target_model(model) + + elif load_format == LoadFormat.VISION_ONLY: + # Vision weights are already loaded within the model. + logger.info( + "LoadFormat.VISION_ONLY: skipping weight loading; using preloaded vision weights." 
+ ) - self._load_weights(model, config, checkpoint_dir, checkpoint_loader) + else: + raise NotImplementedError( + f"No load support for load format: {load_format}") if isinstance(moe_load_balancer, MoeLoadBalancer): - self.moe_load_balancer = moe_load_balancer + setattr(self, "moe_load_balancer", moe_load_balancer) moe_load_balancer.register_weight_slots_after_to_cuda() logger.info("moe_load_balancer finalizing model...") moe_load_balancer.finalize_model() @@ -167,46 +284,12 @@ def load( if drafting_loop_wrapper is not None: model = drafting_loop_wrapper(model) + self.model_is_wrapped = True + else: + self.model_is_wrapped = False return model - def _load_weights(self, model: DecoderModelForCausalLM, config: ModelConfig, - checkpoint_dir: str, - checkpoint_loader: BaseCheckpointLoader): - """Handles the logic for loading weights based on the specified format.""" - load_format = self.pytorch_backend_config.load_format - - if load_format == LoadFormat.AUTO: - checkpoint_path = (getattr(model, 'llm_checkpoint_dir', None) - or checkpoint_dir) - weights = checkpoint_loader.load_weights(checkpoint_path) - weight_mapper = checkpoint_loader.get_initialized_weight_mapper( - model, config) - self._call_load_weights(model.load_weights, weights, weight_mapper) - - # Load draft model weights if needed for speculative decoding - if self.spec_config and self.spec_config.spec_dec_mode.need_load_draft_weights( - ): - draft_weights = checkpoint_loader.load_weights( - self.spec_config.speculative_model_dir) - self._call_load_weights(model.load_draft_weights, draft_weights, - weight_mapper) - - elif load_format == LoadFormat.DUMMY: - self._initialize_dummy_weights(model) - if self.spec_config and self.spec_config.spec_dec_mode.need_load_draft_weights( - ): - model.draft_model.load_weights_from_target_model(model) - - elif load_format == LoadFormat.VISION_ONLY: - logger.info( - "LoadFormat.VISION_ONLY: skipping weight loading; using preloaded vision weights." 
- ) - - else: - raise NotImplementedError( - f"No load support for load format: {load_format}") - def _load_and_validate_config( self, checkpoint_dir: str, checkpoint_loader: BaseCheckpointLoader) -> ModelConfig: @@ -243,69 +326,10 @@ def _load_and_validate_config( sub_config).num_hidden_layers = num_layers_override return config - @staticmethod - def _materialize_meta_model(model: torch.nn.Module): - """Converts a model on the 'meta' device to a materialized model on CUDA.""" - memo = {} - - def init_meta_tensor(t: torch.Tensor): - if t.device != torch.device('meta'): - return t - if t not in memo: - memo[t] = torch.empty_like(t, device='cuda') - return memo[t] - - model._apply(init_meta_tensor) - - @staticmethod - def _call_load_weights(load_method: Callable, weights, weight_mapper): + def _call_load_weights(self, load_method: Callable, weights, weight_mapper): """Calls the model's weight loading method with the correct arguments.""" args = inspect.getfullargspec(load_method).args if "weight_mapper" in args: load_method(weights, weight_mapper=weight_mapper) else: load_method(weights) - - @staticmethod - def _get_rank_model_storage(model: torch.nn.Module) -> int: - """Calculates the total memory in bytes used by the model's weights and buffers on the current device.""" - total_bytes = 0 - current_device_idx = torch.cuda.current_device() - for param in model.parameters(): - if param.device.type == 'cuda' and param.device.index == current_device_idx: - total_bytes += param.element_size() * param.nelement() - for buf in model.buffers(): - if buf.device.type == 'cuda' and buf.device.index == current_device_idx: - total_bytes += buf.element_size() * buf.nelement() - return total_bytes - - @staticmethod - def _initialize_dummy_weights(model: torch.nn.Module, - low: float = -1e-3, - high: float = 1e-3, - seed: int = 0) -> None: - """Initializes model weights with random dummy values for testing purposes.""" - - # This function's logic is copied directly from the original file - def _get_random_min_max(dtype: torch.dtype) -> Tuple[int, int]: - if dtype in (torch.float8_e4m3fn, torch.int8): - return (-3.0, 3.0) - elif dtype == float4_e2m1x2: - return (0, 64) - else: - raise NotImplementedError(f"Unknown quantized type: {dtype}.") - - for param in model.state_dict().values(): - generator = torch.Generator(device=param.data.device) - generator.manual_seed(seed) - dtype = param.data.dtype - - if param.data.element_size() < 2: - tmp_param = torch.empty_like(param.data, - dtype=torch.float16, - device=param.data.device) - quant_min, quant_max = _get_random_min_max(dtype) - tmp_param.uniform_(quant_min, quant_max, generator=generator) - param.data.copy_(tmp_param.to(dtype)) - elif torch.is_floating_point(param): - param.uniform_(low, high, generator=generator) From 746e486f5e1fec42e205f3208c73c4dc8a3e3731 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Mon, 8 Sep 2025 15:13:15 -0700 Subject: [PATCH 04/15] fix Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/model_loader.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 35bbd2d6bc7..f4f86ad277a 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -22,7 +22,6 @@ MoeLoadBalancer, maybe_create_moe_load_balancer) from .config import LoadFormat, PyTorchConfig -# Constants from the 
original file for KV cache validation _KV_CACHE_MAP = { "fp8": QuantAlgo.FP8.value, "nvfp4": QuantAlgo.NVFP4.value, @@ -189,7 +188,7 @@ def load( checkpoint_dir: str, checkpoint_loader: BaseCheckpointLoader, drafting_loop_wrapper: Optional[Callable[[torch.nn.Module], - torch.nn.Module]] = None + torch.nn.Module]] = None, ) -> DecoderModelForCausalLM: """ Loads the model, its weights, and applies necessary configurations. @@ -308,7 +307,11 @@ def _load_and_validate_config( moe_load_balancer=self.pytorch_backend_config.moe_load_balancer, lora_config=self.lora_config, allreduce_strategy=self.pytorch_backend_config.allreduce_strategy, - mm_encoder_only=self.pytorch_backend_config.mm_encoder_only) + mm_encoder_only=self.pytorch_backend_config.mm_encoder_only, + attn_backend=self.pytorch_backend_config.attn_backend, + moe_backend=self.pytorch_backend_config.moe_backend, + moe_disable_finalize_fusion=self.pytorch_backend_config. + moe_disable_finalize_fusion) validate_and_set_kv_cache_quant( config, self.pytorch_backend_config.kv_cache_dtype) From c64d32076619125128b13290a1343bc95a960fe6 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 9 Sep 2025 09:52:42 -0700 Subject: [PATCH 05/15] fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- .../_torch/pyexecutor/model_engine.py | 35 +++++++++++-------- .../_torch/pyexecutor/model_loader.py | 9 ----- .../executor/test_pytorch_model_engine.py | 8 ++--- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index da2a133cab0..e3ca240c891 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -141,6 +141,7 @@ def __init__( is_draft_model: bool = False, drafting_loop_wrapper: Optional[Callable[[torch.nn.Module], torch.nn.Module]] = None, + model: Optional[torch.nn.Module] = None, ): self.ub_buffers = None self.batch_size = batch_size @@ -163,19 +164,24 @@ def __init__( self.attn_runtime_features = attn_runtime_features or AttentionRuntimeFeatures( ) - attn_backend = pytorch_backend_config.attn_backend - loader = ModelLoader( - pytorch_backend_config=pytorch_backend_config, - mapping=self.mapping, - spec_config=self.spec_config, - max_num_tokens=max_num_tokens, - max_seq_len=max_seq_len, - lora_config=lora_config, - ) - self.model = loader.load(checkpoint_dir=model_path, - checkpoint_loader=checkpoint_loader, - drafting_loop_wrapper=drafting_loop_wrapper) - self.model_is_wrapped = loader.model_is_wrapped + if model is None: + loader = ModelLoader( + pytorch_backend_config=pytorch_backend_config, + mapping=self.mapping, + spec_config=self.spec_config, + max_num_tokens=max_num_tokens, + max_seq_len=max_seq_len, + lora_config=lora_config, + ) + self.model = loader.load(checkpoint_dir=model_path, + checkpoint_loader=checkpoint_loader) + else: + self.model = model + if drafting_loop_wrapper is not None: + self.model = drafting_loop_wrapper(self.model) + self.model_is_wrapped = True + else: + self.model_is_wrapped = False # In case that some tests use stub models and override `_load_model`. 
if not hasattr(self.model, 'extra_attrs'): self.model.extra_attrs = {} @@ -246,7 +252,8 @@ def __init__( self.is_warmup = False - self.attn_backend = get_attention_backend(attn_backend) + self.attn_backend = get_attention_backend( + pytorch_backend_config.attn_backend) if self.is_spec_decode: self.spec_metadata = None diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index f4f86ad277a..e061cc6ca1c 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -187,8 +187,6 @@ def load( self, checkpoint_dir: str, checkpoint_loader: BaseCheckpointLoader, - drafting_loop_wrapper: Optional[Callable[[torch.nn.Module], - torch.nn.Module]] = None, ) -> DecoderModelForCausalLM: """ Loads the model, its weights, and applies necessary configurations. @@ -196,7 +194,6 @@ def load( Args: checkpoint_dir: The directory of the model checkpoint. checkpoint_loader: The loader object for model checkpoints. - drafting_loop_wrapper: An optional wrapper for speculative decoding models. Returns: The loaded and initialized PyTorch model. @@ -281,12 +278,6 @@ def init_meta_tensor(t: torch.Tensor): torch.cuda.current_stream().synchronize() - if drafting_loop_wrapper is not None: - model = drafting_loop_wrapper(model) - self.model_is_wrapped = True - else: - self.model_is_wrapped = False - return model def _load_and_validate_config( diff --git a/tests/unittest/_torch/executor/test_pytorch_model_engine.py b/tests/unittest/_torch/executor/test_pytorch_model_engine.py index 8a06a3a9f0f..ec53a1ae832 100644 --- a/tests/unittest/_torch/executor/test_pytorch_model_engine.py +++ b/tests/unittest/_torch/executor/test_pytorch_model_engine.py @@ -67,16 +67,14 @@ def __init__(self, mapping = Mapping(world_size=tensorrt_llm.mpi_world_size(), tp_size=tensorrt_llm.mpi_world_size(), rank=tensorrt_llm.mpi_rank()) - self.model_is_wrapped = False + model = DummyModel(self.dtype) super().__init__(model_path="", pytorch_backend_config=pytorch_backend_config, checkpoint_loader=None, batch_size=batch_size, max_seq_len=max_seq_len, - mapping=mapping) - - def _load_model(self, mode_path: str, **kwargs) -> torch.nn.Module: - return DummyModel(self.dtype) + mapping=mapping, + model=model) def _create_request(num_tokens, req_id: int): From 199facecd92f797b15498c6772a7ce03584a4f48 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Mon, 22 Sep 2025 12:01:26 +0800 Subject: [PATCH 06/15] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/model_engine.py | 12 ++++++++++-- tensorrt_llm/_torch/pyexecutor/model_loader.py | 13 +------------ 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 433c870a2a3..8623c7a000c 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -37,8 +37,8 @@ from ..models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader from ..models.modeling_multimodal_utils import filter_mm_token_from_input_ids from ..models.modeling_utils import DecoderModelForCausalLM -from ..modules.fused_moe.moe_load_balancer import (MoeLoadBalancer, - MoeLoadBalancerIterContext) +from ..modules.fused_moe.moe_load_balancer import ( + MoeLoadBalancer, MoeLoadBalancerIterContext, maybe_create_moe_load_balancer) from ..speculative import (SpecMetadata, 
get_num_extra_kv_tokens, get_spec_metadata, update_spec_config_from_model_config) @@ -175,6 +175,14 @@ def __init__( ) self.model = loader.load(checkpoint_dir=model_path, checkpoint_loader=checkpoint_loader) + moe_load_balancer = maybe_create_moe_load_balancer( + self.model.config, self.mapping) + if isinstance(moe_load_balancer, MoeLoadBalancer): + setattr(self, "moe_load_balancer", moe_load_balancer) + moe_load_balancer.register_weight_slots_after_to_cuda() + logger.info("moe_load_balancer finalizing model...") + moe_load_balancer.finalize_model() + logger.info("moe_load_balancer finalize model done") else: self.model = model if drafting_loop_wrapper is not None: diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index fccce44f5f3..3d68b64acd4 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -18,8 +18,6 @@ from ..models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader from ..models.modeling_utils import (DecoderModelForCausalLM, MetaInitMode, timing) -from ..modules.fused_moe.moe_load_balancer import ( - MoeLoadBalancer, maybe_create_moe_load_balancer) from .config import LoadFormat, PyTorchConfig _KV_CACHE_MAP = { @@ -202,9 +200,7 @@ def load( checkpoint_loader) load_format = self.pytorch_backend_config.load_format - with timing("Model init total"), maybe_create_moe_load_balancer( - config, self.mapping) as moe_load_balancer: - + with timing("Model init total"): try: # config will be modified in-place for some models, like Qwen2 config_copy = copy.deepcopy(config) @@ -269,13 +265,6 @@ def init_meta_tensor(t: torch.Tensor): raise NotImplementedError( f"No load support for load format: {load_format}") - if isinstance(moe_load_balancer, MoeLoadBalancer): - setattr(self, "moe_load_balancer", moe_load_balancer) - moe_load_balancer.register_weight_slots_after_to_cuda() - logger.info("moe_load_balancer finalizing model...") - moe_load_balancer.finalize_model() - logger.info("moe_load_balancer finalize model done") - torch.cuda.current_stream().synchronize() return model From c3a01d604bc29da181b8f48cfc9347412a430698 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Mon, 22 Sep 2025 12:16:33 +0800 Subject: [PATCH 07/15] fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/model_loader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 3d68b64acd4..180b4c99319 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -276,6 +276,7 @@ def _load_and_validate_config( config = checkpoint_loader.load_config( checkpoint_dir, trust_remote_code=True, + mapping=self.mapping, enable_min_latency=self.pytorch_backend_config.enable_min_latency, use_cuda_graph=self.pytorch_backend_config.use_cuda_graph, force_dynamic_quantization=self.pytorch_backend_config. 
From 7169c51e0a7834f993e9725b60717465da7abc86 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Mon, 22 Sep 2025 12:24:05 +0800 Subject: [PATCH 08/15] fix Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/model_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 8623c7a000c..8576644af8e 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -176,7 +176,7 @@ def __init__( self.model = loader.load(checkpoint_dir=model_path, checkpoint_loader=checkpoint_loader) moe_load_balancer = maybe_create_moe_load_balancer( - self.model.config, self.mapping) + self.model.model_config, self.mapping) if isinstance(moe_load_balancer, MoeLoadBalancer): setattr(self, "moe_load_balancer", moe_load_balancer) moe_load_balancer.register_weight_slots_after_to_cuda() From 21ff6577069f6eea1ba56e67958f4299c42277b9 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 23 Sep 2025 15:06:51 +0800 Subject: [PATCH 09/15] fix Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/model_engine.py | 6 +++--- tensorrt_llm/_torch/pyexecutor/model_loader.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index b490d1e77b8..eab94ceb614 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -173,10 +173,10 @@ def __init__( max_seq_len=max_seq_len, lora_config=lora_config, ) - self.model = loader.load(checkpoint_dir=model_path, - checkpoint_loader=checkpoint_loader) + self.model, config = loader.load( + checkpoint_dir=model_path, checkpoint_loader=checkpoint_loader) moe_load_balancer = maybe_create_moe_load_balancer( - self.model.model_config, self.mapping) + config, self.mapping) if isinstance(moe_load_balancer, MoeLoadBalancer): setattr(self, "moe_load_balancer", moe_load_balancer) moe_load_balancer.register_weight_slots_after_to_cuda() diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 180b4c99319..c7a73a9ec53 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -185,7 +185,7 @@ def load( self, checkpoint_dir: str, checkpoint_loader: BaseCheckpointLoader, - ) -> DecoderModelForCausalLM: + ) -> Tuple[DecoderModelForCausalLM, ModelConfig]: """ Loads the model, its weights, and applies necessary configurations. 
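With load() now returning the resolved ModelConfig as well, the engine side of this patch creates and finalizes the MoE load balancer itself. A sketch of that call pattern, using only names that appear in this series (a later patch in the series moves the balancer handling back into the loader); the surrounding variables are assumed to come from the engine's constructor arguments, as in the model_engine.py hunk above:

    from tensorrt_llm._torch.modules.fused_moe.moe_load_balancer import (
        MoeLoadBalancer, maybe_create_moe_load_balancer)
    from tensorrt_llm._torch.pyexecutor.model_loader import ModelLoader

    # pytorch_backend_config, mapping, spec_config, checkpoint_loader, model_path,
    # max_num_tokens, max_seq_len and lora_config are the engine constructor inputs.
    loader = ModelLoader(pytorch_backend_config=pytorch_backend_config,
                         mapping=mapping,
                         spec_config=spec_config,
                         max_num_tokens=max_num_tokens,
                         max_seq_len=max_seq_len,
                         lora_config=lora_config)
    model, config = loader.load(checkpoint_dir=model_path,
                                checkpoint_loader=checkpoint_loader)
    moe_load_balancer = maybe_create_moe_load_balancer(config, mapping)
    if isinstance(moe_load_balancer, MoeLoadBalancer):
        moe_load_balancer.register_weight_slots_after_to_cuda()
        moe_load_balancer.finalize_model()
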
@@ -267,7 +267,7 @@ def init_meta_tensor(t: torch.Tensor): torch.cuda.current_stream().synchronize() - return model + return model, config def _load_and_validate_config( self, checkpoint_dir: str, From 8904e9c4c12e7038d48b5c4fd7c63766fa4c271c Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 24 Sep 2025 08:47:57 +0800 Subject: [PATCH 10/15] clean Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/model_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index c7a73a9ec53..4186099005f 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -179,7 +179,6 @@ def __init__(self, self.max_num_tokens = max_num_tokens self.max_seq_len = max_seq_len self.lora_config = lora_config - self.moe_load_balancer = None def load( self, From 797f5e1cebea79e997541cbda9eee36e36516a67 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Wed, 24 Sep 2025 09:38:10 +0800 Subject: [PATCH 11/15] fix Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/model_engine.py | 12 +++--------- tensorrt_llm/_torch/pyexecutor/model_loader.py | 18 +++++++++++++----- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index eab94ceb614..115bd2ce393 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -37,8 +37,8 @@ from ..models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader from ..models.modeling_multimodal_utils import filter_mm_token_from_input_ids from ..models.modeling_utils import DecoderModelForCausalLM -from ..modules.fused_moe.moe_load_balancer import ( - MoeLoadBalancer, MoeLoadBalancerIterContext, maybe_create_moe_load_balancer) +from ..modules.fused_moe.moe_load_balancer import (MoeLoadBalancer, + MoeLoadBalancerIterContext) from ..speculative import (SpecMetadata, get_num_extra_kv_tokens, get_spec_metadata, update_spec_config_from_model_config) @@ -173,16 +173,10 @@ def __init__( max_seq_len=max_seq_len, lora_config=lora_config, ) - self.model, config = loader.load( + self.model, moe_load_balancer = loader.load( checkpoint_dir=model_path, checkpoint_loader=checkpoint_loader) - moe_load_balancer = maybe_create_moe_load_balancer( - config, self.mapping) if isinstance(moe_load_balancer, MoeLoadBalancer): setattr(self, "moe_load_balancer", moe_load_balancer) - moe_load_balancer.register_weight_slots_after_to_cuda() - logger.info("moe_load_balancer finalizing model...") - moe_load_balancer.finalize_model() - logger.info("moe_load_balancer finalize model done") else: self.model = model if drafting_loop_wrapper is not None: diff --git a/tensorrt_llm/_torch/pyexecutor/model_loader.py b/tensorrt_llm/_torch/pyexecutor/model_loader.py index 4186099005f..68e877e20b4 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_loader.py +++ b/tensorrt_llm/_torch/pyexecutor/model_loader.py @@ -16,8 +16,9 @@ from ..model_config import ModelConfig from ..models import AutoModelForCausalLM from ..models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader -from ..models.modeling_utils import (DecoderModelForCausalLM, MetaInitMode, - timing) +from ..models.modeling_utils import MetaInitMode, timing +from 
..modules.fused_moe.moe_load_balancer import ( + MoeLoadBalancer, maybe_create_moe_load_balancer) from .config import LoadFormat, PyTorchConfig _KV_CACHE_MAP = { @@ -184,7 +185,7 @@ def load( self, checkpoint_dir: str, checkpoint_loader: BaseCheckpointLoader, - ) -> Tuple[DecoderModelForCausalLM, ModelConfig]: + ): """ Loads the model, its weights, and applies necessary configurations. @@ -199,7 +200,8 @@ def load( checkpoint_loader) load_format = self.pytorch_backend_config.load_format - with timing("Model init total"): + with timing("Model init total"), maybe_create_moe_load_balancer( + config, self.mapping) as moe_load_balancer: try: # config will be modified in-place for some models, like Qwen2 config_copy = copy.deepcopy(config) @@ -264,9 +266,15 @@ def init_meta_tensor(t: torch.Tensor): raise NotImplementedError( f"No load support for load format: {load_format}") + if isinstance(moe_load_balancer, MoeLoadBalancer): + moe_load_balancer.register_weight_slots_after_to_cuda() + logger.info("moe_load_balancer finalizing model...") + moe_load_balancer.finalize_model() + logger.info("moe_load_balancer finalize model done") + torch.cuda.current_stream().synchronize() - return model, config + return model, moe_load_balancer def _load_and_validate_config( self, checkpoint_dir: str, From 336c2ef5408d9d1cd92c7e5116eab21e217029fc Mon Sep 17 00:00:00 2001 From: Void <18275976+yilin-void@users.noreply.github.com> Date: Thu, 25 Sep 2025 09:20:24 +0800 Subject: [PATCH 12/15] [None][feat] DeepEP LL fp8 dispatch/combine (#7927) Signed-off-by: Yilin Zhang <18275976+yilin-void@users.noreply.github.com> --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 2 +- .../cutlass_kernels/include/moe_kernels.h | 1 + .../cutlass_kernels/moe_gemm/moe_kernels.cu | 5 +- cpp/tensorrt_llm/thop/moeOp.cpp | 15 ++ .../_torch/modules/fused_moe/deep_ep_utils.py | 18 +- .../modules/fused_moe/fused_moe_wide_ep.py | 154 ++++++++++-------- 6 files changed, 118 insertions(+), 77 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index cdae331b945..f690ab5a905 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -1,4 +1,4 @@ -set(DEEP_EP_COMMIT 515a311f290eb6d9592fcccfcc80c40f5123ca72) +set(DEEP_EP_COMMIT be2582ffe69b5e7d61c3bc9bf7a5316bc48261f9) set(NVSHMEM_URL_HASH SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h index 2e01d47eb5b..89f73af4379 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h @@ -553,6 +553,7 @@ class CutlassMoeFCRunner : public CutlassMoeFCRunnerInterface || std::is_same_v) &&!std::is_same_v; static constexpr bool use_w4afp8 = std::is_same_v && std::is_same_v; + static constexpr bool use_fp8_input = std::is_same_v; static_assert(!std::is_same_v, "Current logic requires backbone type to be >=16-bits"); static_assert(!std::is_same_v, "Current logic requires output type to be >=16-bits"); #else diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu index ca613155a9d..38efe497f7c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu @@ -1625,7 +1625,7 @@ void 
expandInputRowsKernelLauncher(InputActivationsType const* unpermuted_input, else if constexpr (std::is_same_v && std::is_same_v) { - TLLM_CHECK_WITH_INFO(!prequant_scales, "NVFP4 is not supported for AWQ"); + TLLM_CHECK_WITH_INFO(!prequant_scales, "FP8 is not supported for AWQ"); return quant_params.mxfp8_mxfp4.fc1.weight_block_scale ? &expandInputRowsKernel @@ -3689,7 +3689,7 @@ void CutlassMoeFCRunner; template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16>; template class CutlassMoeFCRunner<__nv_bfloat16, __nv_fp8_e4m3, __nv_bfloat16>; template class CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16>; +template class CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16, __nv_fp8_e4m3>; #endif #endif #ifdef ENABLE_FP4 diff --git a/cpp/tensorrt_llm/thop/moeOp.cpp b/cpp/tensorrt_llm/thop/moeOp.cpp index 9b9731a076e..db1ab0e3621 100644 --- a/cpp/tensorrt_llm/thop/moeOp.cpp +++ b/cpp/tensorrt_llm/thop/moeOp.cpp @@ -201,6 +201,21 @@ class FusedMoeRunner : public torch::CustomClassHolder } switch (mActivationDtype) { +#ifdef ENABLE_FP8 + case c10::ScalarType::Float8_e4m3fn: + { + if (isInt4Quant() and mUseW4GroupScaling) + { + mKernelRunner = std::make_unique< + kernels::CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16, __nv_fp8_e4m3>>(); + } + else + { + C10_THROW_ERROR_FORMATTED(Error, "FP8 activation type is not supported for non-W4A8 quantization"); + } + break; + } +#endif case c10::ScalarType::Half: mKernelRunner = create_weight_quant_runner(); break; case c10::ScalarType::BFloat16: mKernelRunner = create_weight_quant_runner<__nv_bfloat16>(); break; default: C10_THROW_ERROR_FORMATTED(Error, "Unsupported activation type for int-type weight"); diff --git a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py index c852bdb929c..84673163b82 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py +++ b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py @@ -2,7 +2,7 @@ # https://github.com/deepseek-ai/DeepEP/blob/aae9fa9a6dd0fec2a723fbb85ec4b22460fab670/README.md import os import weakref -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import torch @@ -179,12 +179,18 @@ def low_latency_dispatch_fp4(self, hidden_states: torch.Tensor, return recv_hidden_states, recv_scales, recv_expert_count, handle - def low_latency_combine_fp4(self, hidden_states: torch.Tensor, - global_scales: torch.Tensor, - topk_idx: torch.Tensor, - topk_weights: torch.Tensor, handle: Tuple): + def low_latency_combine_low_precision(self, precision: str, + hidden_states: torch.Tensor, + global_scales: Optional[torch.Tensor], + topk_idx: torch.Tensor, + topk_weights: torch.Tensor, + handle: Tuple): + """ + Arguments: + precision: the precision of the low-precision kernel, "fp8" for FP8, "nvfp4" for NVFP4. 
+ """ combined_hidden_states, event, hook = \ - self.buffer.low_latency_combine_fp4(hidden_states, global_scales, topk_idx, topk_weights, handle) + self.buffer.low_latency_combine_low_precision(precision, hidden_states, global_scales, topk_idx, topk_weights, handle) assert event.event is None assert hook is None diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py index 09edb4a2e11..6b1ca3cc45e 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py @@ -20,8 +20,8 @@ from .ops import MoEOp, MoEOpSelector from .quantization import (DeepSeekFP8BlockScalesFusedMoEMethod, DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm, - FP8QDQFusedMoEMethod, MoEWeightLoadingMode, - NVFP4CutlassFusedMoEMethod, + FP8QDQFusedMoEMethod, FusedMoEQuantScalesW4A8, + MoEWeightLoadingMode, NVFP4CutlassFusedMoEMethod, UnquantizedFusedMoEMethod, WInt4AFP8FusedMoEMethod) from .routing import BaseMoeRoutingMethod @@ -191,13 +191,10 @@ def __init__( self.use_postquant_alltoall = False self.use_low_precision_combine = False if self.enable_alltoall: - qm = self.quant_config.quant_mode self.use_postquant_alltoall = (os.environ.get( - "TRTLLM_MOE_POST_QUANT_ALLTOALLV", "1") - == "1") and qm.has_nvfp4() + "TRTLLM_MOE_POST_QUANT_ALLTOALLV", "1") == "1") self.use_low_precision_combine = (os.environ.get( - "TRTLLM_MOE_USE_LOW_PRECISION_COMBINE", "0") - == "1") and qm.has_nvfp4() + "TRTLLM_MOE_USE_LOW_PRECISION_COMBINE", "0") == "1") if self.alltoall_method_type == AlltoallMethodType.MNNVL: MnnvlMemory.initialize() @@ -319,6 +316,35 @@ def can_use_alltoall(self, all_rank_num_tokens, all_rank_max_num_tokens): return self.enable_alltoall + def deep_ep_low_latency_dispatch_modify_output_to_adapt_fused_moe( + self, x: torch.Tensor, x_sf: Optional[torch.Tensor], + recv_expert_count: torch.Tensor, final_scales_dtype: torch.dtype + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, + torch.Tensor]: + # x shape: [#local experts, EP size * all_rank_max_num_tokens, hidden_size] + # recv_expert_count shape: [#local experts] + + # Adapter between `torch.ops.trtllm.fused_moe` and DeepEP + # TODO: remove the adapter by changing `torch.ops.trtllm.fused_moe` API + mask = torch.arange(x.shape[1], + dtype=torch.int32, device=x.device).expand( + x.shape[0], + x.shape[1]) < recv_expert_count.unsqueeze(1) + token_selected_slots = torch.where( + mask, + torch.arange(x.shape[0] * self.mapping.moe_ep_rank, + x.shape[0] * (self.mapping.moe_ep_rank + 1), + dtype=torch.int32, + device=x.device).unsqueeze(1), self.num_slots) + x = x.reshape(x.shape[0] * x.shape[1], x.shape[2]) + if x_sf is not None: + x_sf = x_sf.reshape(x_sf.shape[0] * x_sf.shape[1], x_sf.shape[2]) + # Cheat the fused_moe API with fake top_k=1 + token_selected_slots = token_selected_slots.view(x.shape[0], 1) + token_final_scales = torch.ones_like(token_selected_slots, + dtype=final_scales_dtype) + return x, x_sf, token_selected_slots, token_final_scales + def _get_quant_method(self): if self.quant_config is not None and self.quant_config.layer_quant_mode.has_any_quant( exclude_kv_cache=True): @@ -468,7 +494,7 @@ def forward_chunk( use_allgather = not use_all_to_all # If alltoall is disabled, we need also disable use_postquant_alltoall - use_postquant_alltoall = self.use_postquant_alltoall and use_all_to_all + use_postquant_alltoall = self.use_postquant_alltoall and use_all_to_all and self.has_any_quant # Prepare additional 
information for profiling in case padding is applied when using alltoall. # Only the non-alltoall case is considered for profiling in the warmup phase. @@ -518,28 +544,8 @@ def forward_chunk( assert all_rank_max_num_tokens <= self.deep_ep_max_num_tokens x, recv_expert_count, deep_ep_handle = \ self.deep_ep_buffer.low_latency_dispatch(x, deep_ep_topk_idx, all_rank_max_num_tokens, self.num_slots) - # x shape: [#local experts, EP size * all_rank_max_num_tokens, hidden_size] - # recv_expert_count shape: [#local experts] - - # Adapter between `torch.ops.trtllm.fused_moe` and DeepEP - # TODO: remove the adapter by changing `torch.ops.trtllm.fused_moe` API - mask = torch.arange( - x.shape[1], dtype=torch.int32, device=x.device).expand( - x.shape[0], - x.shape[1]) < recv_expert_count.unsqueeze(1) - token_selected_slots = torch.where( - mask, - torch.arange( - x.shape[0] * self.mapping.moe_ep_rank, - x.shape[0] * (self.mapping.moe_ep_rank + 1), - dtype=torch.int32, - device=x.device).unsqueeze(1), self.num_slots) - x = x.reshape(x.shape[0] * x.shape[1], x.shape[2]) - # Cheat the fused_moe API with fake top_k=1 - token_selected_slots = token_selected_slots.view( - x.shape[0], 1) - token_final_scales = torch.ones_like( - token_selected_slots, dtype=token_final_scales.dtype) + x, _, token_selected_slots, token_final_scales = self.deep_ep_low_latency_dispatch_modify_output_to_adapt_fused_moe( + x, None, recv_expert_count, token_final_scales.dtype) x_sf = None x_row = x.shape[0] @@ -621,41 +627,48 @@ def forward_chunk( if x_sf is not None: x_sf = x_sf.view(x_sf_dtype) elif self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency: - token_num = x_row - hidden_size = x_col - assert x_sf is not None and self.has_nvfp4 - assert hidden_size % 32 == 0 - assert x.dtype == torch.uint8 and x_sf.dtype == torch.uint8 - assert x_sf.shape[0] == token_num and x_sf.shape[ - 1] == hidden_size // 16 - assert x.shape[0] == token_num and x.shape[1] == hidden_size // 2 - + assert self.has_any_quant, "DeepEPLowLatency postquant alltoall should have quantization" + assert all_rank_max_num_tokens <= self.deep_ep_max_num_tokens deep_ep_topk_idx = token_selected_slots deep_ep_topk_weights = token_final_scales - - assert all_rank_max_num_tokens <= self.deep_ep_max_num_tokens - x, x_sf, recv_expert_count, deep_ep_handle = \ - self.deep_ep_buffer.low_latency_dispatch_fp4(x, x_sf, deep_ep_topk_idx, all_rank_max_num_tokens, self.num_slots) - assert x.dtype == torch.uint8 and x_sf.dtype == torch.uint8 - assert x.dim() == 3 and x_sf.dim() == 3 - assert x.shape[2] == hidden_size // 2 and x_sf.shape[ - 2] == hidden_size // 16 - - mask = torch.arange( - x.shape[1], dtype=torch.int32, device=x.device).expand( - x.shape[0], x.shape[1]) < recv_expert_count.unsqueeze(1) - token_selected_slots = torch.where( - mask, - torch.arange(x.shape[0] * self.mapping.moe_ep_rank, - x.shape[0] * (self.mapping.moe_ep_rank + 1), - dtype=torch.int32, - device=x.device).unsqueeze(1), self.num_slots) - x = x.reshape(x.shape[0] * x.shape[1], x.shape[2]) - x_sf = x_sf.reshape(x_sf.shape[0] * x_sf.shape[1], - x_sf.shape[2]) - token_selected_slots = token_selected_slots.view(x.shape[0], 1) - token_final_scales = torch.ones_like( - token_selected_slots, dtype=token_final_scales.dtype) + if self.has_fp8_qdq: + assert x.dtype == torch.float8_e4m3fn and x_sf is None, "x should be torch.float8_e4m3fn and x_sf should be None in fp8 postquant alltoall" + x = x.view(torch.bfloat16) + x, recv_expert_count, deep_ep_handle = \ + 
self.deep_ep_buffer.low_latency_dispatch(x, deep_ep_topk_idx, all_rank_max_num_tokens, self.num_slots) + x = x.view(torch.float8_e4m3fn) + elif self.has_nvfp4: + token_num = x_row + hidden_size = x_col + assert x.dtype == torch.uint8 and x_sf is not None and x_sf.dtype == torch.uint8 + assert hidden_size % 32 == 0, "HiddenSize should be divisible by 32 in nvfp4 postquant alltoall" + assert x_sf.shape[0] == token_num and x_sf.shape[ + 1] == hidden_size // 16 + assert x.shape[0] == token_num and x.shape[ + 1] == hidden_size // 2 + + x, x_sf, recv_expert_count, deep_ep_handle = \ + self.deep_ep_buffer.low_latency_dispatch_fp4(x, x_sf, deep_ep_topk_idx, all_rank_max_num_tokens, self.num_slots) + assert x.dtype == torch.uint8 and x_sf.dtype == torch.uint8 + assert x.dim() == 3 and x_sf.dim() == 3 + assert x.shape[2] == hidden_size // 2 and x_sf.shape[ + 2] == hidden_size // 16 + elif self.has_w4afp8: + assert isinstance(quant_scales, FusedMoEQuantScalesW4A8) + pre_quant_scales = quant_scales.pre_quant_scale_1 + assert pre_quant_scales.shape == ( + 1, x.shape[1]) and pre_quant_scales.dtype == x.dtype + x = (x * pre_quant_scales).to(torch.float8_e4m3fn).view( + torch.bfloat16) + x, recv_expert_count, deep_ep_handle = \ + self.deep_ep_buffer.low_latency_dispatch(x, deep_ep_topk_idx, all_rank_max_num_tokens, self.num_slots) + x = x.view(torch.float8_e4m3fn) + else: + raise ValueError( + f"unsupported quantization mode in postquant alltoall: {self.quant_config.quant_mode}" + ) + x, x_sf, token_selected_slots, token_final_scales = self.deep_ep_low_latency_dispatch_modify_output_to_adapt_fused_moe( + x, x_sf, recv_expert_count, token_final_scales.dtype) else: raise NotImplementedError( f"Not available alltoall method type: {self.alltoall_method_type!r}" @@ -704,11 +717,16 @@ def forward_chunk( self.expert_size_per_partition, num_tokens_per_expert_for_fused_moe, self.hidden_size) if self.use_low_precision_combine: - global_scales = torch.ops.trtllm.calculate_nvfp4_global_scale( - final_hidden_states, recv_expert_count) - final_hidden_states = self.deep_ep_buffer.low_latency_combine_fp4( - final_hidden_states, global_scales, deep_ep_topk_idx, - deep_ep_topk_weights, deep_ep_handle) + assert self.has_nvfp4 or self.has_w4afp8 or self.has_fp8_qdq, "Low precision combine only supports nvfp4, w4afp8 and fp8 qdq" + precision = "fp8" + global_scales = None + if self.has_nvfp4: + precision = "nvfp4" + global_scales = torch.ops.trtllm.calculate_nvfp4_global_scale( + final_hidden_states, recv_expert_count) + final_hidden_states = self.deep_ep_buffer.low_latency_combine_low_precision( + precision, final_hidden_states, global_scales, + deep_ep_topk_idx, deep_ep_topk_weights, deep_ep_handle) else: final_hidden_states = self.deep_ep_buffer.low_latency_combine( final_hidden_states, deep_ep_topk_idx, From 98726a3bed62874c684f9a21f02db5dce1fdaee4 Mon Sep 17 00:00:00 2001 From: Aurelien Chartier <2567591+achartier@users.noreply.github.com> Date: Wed, 24 Sep 2025 18:28:53 -0700 Subject: [PATCH 13/15] [None][chore] Update trtllm-bench documentation on setting FP8 KV cache (#7885) Signed-off-by: Aurelien Chartier <2567591+achartier@users.noreply.github.com> --- docs/source/developer-guide/perf-benchmarking.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/source/developer-guide/perf-benchmarking.md b/docs/source/developer-guide/perf-benchmarking.md index 6c7dbc97c34..3881a9ab263 100644 --- a/docs/source/developer-guide/perf-benchmarking.md +++ b/docs/source/developer-guide/perf-benchmarking.md 
@@ -460,9 +460,10 @@ If you would like to force the KV cache quantization, you can specify the follow when the checkpoint precision is `null`: ```yaml -kv_cache_dtype: "fp8" +kv_cache_config: + dtype: fp8 ``` ```{tip} -The two valid values for `kv_cache_dtype` are `auto` and `fp8`. +The two valid values for `kv_cache_config.dtype` are `auto` and `fp8`. ``` From bb6067176fb692d30386fd8cff14e0a3592c55fd Mon Sep 17 00:00:00 2001 From: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Date: Thu, 25 Sep 2025 10:20:04 +0800 Subject: [PATCH 14/15] [None][chroe] Update the cuda and tensorrt version in homepage icons. (#7963) Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 747fe0278df..a71771e7adf 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,8 @@ TensorRT-LLM [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/) [![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/) [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/) -[![cuda](https://img.shields.io/badge/cuda-12.9.1-green)](https://developer.nvidia.com/cuda-downloads) -[![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt) +[![cuda](https://img.shields.io/badge/cuda-13.0.0-green)](https://developer.nvidia.com/cuda-downloads) +[![trt](https://img.shields.io/badge/TRT-10.13.2-green)](https://developer.nvidia.com/tensorrt) [![version](https://img.shields.io/badge/release-1.1.0rc6-green)](./tensorrt_llm/version.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE) From 0945403174ae794315e7af8caab71e71e2c9f8ef Mon Sep 17 00:00:00 2001 From: fredricz-20070104 <226039983+fredricz-20070104@users.noreply.github.com> Date: Thu, 25 Sep 2025 13:15:26 +0800 Subject: [PATCH 15/15] [TRTLLM-6541][test] Add NIM perf test cases (#7924) Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com> --- .../test_lists/qa/llm_perf_cluster_nim.yml | 141 ++++++++++++++++++ .../test_lists/qa/llm_perf_nim.yml | 4 +- 2 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 tests/integration/test_lists/qa/llm_perf_cluster_nim.yml diff --git a/tests/integration/test_lists/qa/llm_perf_cluster_nim.yml b/tests/integration/test_lists/qa/llm_perf_cluster_nim.yml new file mode 100644 index 00000000000..e56252fd7e3 --- /dev/null +++ b/tests/integration/test_lists/qa/llm_perf_cluster_nim.yml @@ -0,0 +1,141 @@ +version: 0.0.1 +llm_perf_cluster_nim: +- condition: + ranges: + system_gpu_count: + gte: 1 + tests: + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-quant:fp8] + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:512,32-quant:fp8] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500] + - 
perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500] + - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8] + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8] + - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-streaming-float4-maxbs:2048-maxnt:8192-input_output_len:256,256-reqs:200] + # for chunked prefill cases + - perf/test_perf.py::test_perf[deepseek_v3_lite_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200] + # Phi-4-multimodal-instruct + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-con:250] + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:1000,1000-con:250] + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:128,128] + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-input_output_len:512,32] + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_image-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250] + - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_audio-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250] + #Mistral-Small-3.1-24B-Instruct-2503 + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,2000-reqs:8-con:1] + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-input_output_len:1000,2000-reqs:500-con:200] + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1] TIMEOUT(120) + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:300-con:200] TIMEOUT(120) + + +- condition: + ranges: + system_gpu_count: + gte: 2 + tests: + - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:nvfp4-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:128,128-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-float8-maxbs:256-input_output_len:512,32-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:128,128-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:256-input_output_len:512,32-gpus:2] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:256-input_output_len:512,32-gpus:2] + - perf/test_perf.py::test_perf[llama_v2_13b-bench-float16-input_output_len:128,128-loras:8-gpus:2] + #Mistral-Small-3.1-24B-Instruct-2503 + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-input_output_len:1000,2000-reqs:8-con:1-gpus:2] + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-input_output_len:1000,2000-reqs:500-con:200-gpus:2] + - perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:1-maxnt:20000-input_output_len:20000,2000-reqs:8-con:1-gpus:2] TIMEOUT(120) + - 
perf/test_perf.py::test_perf[mistral_small_v3.1_24b-bench-pytorch-bfloat16-maxbs:4096-maxnt:20000-input_output_len:20000,2000-reqs:300-con:200-gpus:2] TIMEOUT(120) + +# Tests for systems with 4+ GPUs +- condition: + ranges: + system_gpu_count: + gte: 4 + tests: + - perf/test_perf.py::test_perf[starcoder_15b-bench-float16-input_output_len:512,200-gpus:4] + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4] + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120) + # for chunked prefill cases + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:256-maxnt:1024-kv_frac:0.85-input_output_len:2000,2000-reqs:200-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:2000-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:512-ep:4-gpus:4] + #llama_v3.1_405b_instruct_fp4 + - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1024,2048-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-gpus:4] TIMEOUT(120) + #llama_v3.3_70b_instruct_fp4 + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:1000-gpus:4] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:200-gpus:4] TIMEOUT(120) + #llama_v4_scout_17b_16e_instruct_fp4 + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:4] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:500-gpus:4] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:500-gpus:4] + - 
perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:4096-maxnt:20000-kv_frac:0.85-input_output_len:20000,2000-reqs:200-gpus:4] TIMEOUT(120) + + +# Tests for systems with 8+ GPUs +- condition: + ranges: + system_gpu_count: + gte: 8 + tests: + #llama_v3.3_nemotron_super_49b + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-bfloat16-input_output_len:500,2000-con:250-gpus:8] + #llama_v3.3_70b_instruct_fp4 + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-reqs:3000-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-reqs:3000-tp:8-gpus:8] + + #llama_v4_scout_17b_16e_instruct_fp4 + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:128,128-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:512,32-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:500,2000-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-kv_frac:0.85-input_output_len:1000,1000-tp:8-gpus:8] + - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8] + - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8] + #deepseek_r1_fp8 + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test + - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test + #deepseek_r1_nvfp4 + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] TIMEOUT (120) #max throughput test + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test + # for chunked prefill cases + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-maxnt:2048-kv_frac:0.85-input_output_len:5000,500-reqs:200-ep:8-tp:8-gpus:8] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:256-maxnt:1024-kv_frac:0.85-input_output_len:2000,2000-reqs:200-ep:8-tp:8-gpus:8] TIMEOUT(120) + #deepseek_r1_0528_fp4 + - 
perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,1000-reqs:20000-ep:8-tp:8-gpus:8] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-kv_frac:0.85-input_output_len:1000,2000-reqs:3000-ep:8-tp:8-gpus:8] TIMEOUT(120) + - perf/test_perf.py::test_perf[deepseek_r1_0528_fp4-bench-pytorch-float4-maxbs:1000-maxnt:5000-kv_frac:0.85-input_output_len:5000,500-reqs:20000-ep:4-tp:4-gpus:4] TIMEOUT(120) + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:128,128-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:500,2000-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40) + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] TIMEOUT (40) + - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:2000,500-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp4-bench-pytorch-float4-input_output_len:1000,2000-con:8-ep:8-tp:8-gpus:8] + #gpt_oss_120b + # max throughput test + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:1280-con:256-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:2560-con:512-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:5120-con:1024-ep:8-tp:8-gpus:8] TIMEOUT(120) + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:20480-con:4096-ep:8-tp:8-gpus:8] TIMEOUT(180) + # min latency test + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:8-con:1-ep:8-tp:8-gpus:8] + - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-float4-maxbs:720-maxnt:16384-input_output_len:1024,1024-reqs:100-con:32-ep:8-tp:8-gpus:8] diff --git a/tests/integration/test_lists/qa/llm_perf_nim.yml b/tests/integration/test_lists/qa/llm_perf_nim.yml index 9b436e58136..a0b8b40302b 100644 --- a/tests/integration/test_lists/qa/llm_perf_nim.yml +++ b/tests/integration/test_lists/qa/llm_perf_nim.yml @@ -150,7 +150,6 @@ llm_perf_nim: - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-gpus:2] #t5 - perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20-gpus:2] - - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20-gpus:2] - condition: ranges: @@ -168,7 +167,6 @@ llm_perf_nim: #llama_v3.1_70b #trt backend - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-input_output_len:1024,1024-tp:2-gpus:2] - - 
perf/test_perf.py::test_perf[llama_70b_sq_per_tensor-cppmanager-exe-plugin_ifb-float16-input_output_len:128,128+512,32-gpus:2] #mixtral_8x7b_v0.1 #trt backend - perf/test_perf.py::test_perf[mixtral_8x7b-cppmanager-exe-plugin_ifb-float16-mp-input_output_len:128,128+512,32-gpus:2] @@ -199,6 +197,8 @@ llm_perf_nim: #trt backend - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:1000,1000-quant:fp8-tp:2] - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-float16-input_output_len:500,2000-quant:fp8-tp:2] + # torch backend + - perf/test_perf.py::test_perf[mistral_7b_v0.1-bench-pytorch-float16-input_output_len:128,128] #phi_3_mini_128k_instruct #trt backend - perf/test_perf.py::test_perf[phi_3_mini_128k_instruct-bench-float16-maxbs:128-input_output_len:1000,1000-quant:fp8-tp:2]