From d7ca877631c350dd7e4a7b831dc99b4633959f74 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Sun, 30 Mar 2025 13:11:32 +0530 Subject: [PATCH 01/31] initial commit --- setup.py | 2 + src/diffusers/dependency_versions_table.py | 1 + src/diffusers/quantizers/auto.py | 4 + src/diffusers/quantizers/modelopt/__init__.py | 1 + .../quantizers/modelopt/modelopt_quantizer.py | 165 ++++++++++++++++++ src/diffusers/quantizers/modelopt/utils.py | 13 ++ .../quantizers/quantization_config.py | 24 +++ src/diffusers/utils/__init__.py | 2 + src/diffusers/utils/import_utils.py | 27 +++ 9 files changed, 239 insertions(+) create mode 100644 src/diffusers/quantizers/modelopt/__init__.py create mode 100644 src/diffusers/quantizers/modelopt/modelopt_quantizer.py create mode 100644 src/diffusers/quantizers/modelopt/utils.py diff --git a/setup.py b/setup.py index fdc166a81ecf..f3193bd8f176 100644 --- a/setup.py +++ b/setup.py @@ -132,6 +132,7 @@ "gguf>=0.10.0", "torchao>=0.7.0", "bitsandbytes>=0.43.3", + "nvidia_modelopt>=0.25.0", "regex!=2019.12.17", "requests", "tensorboard", @@ -243,6 +244,7 @@ def run(self): extras["gguf"] = deps_list("gguf", "accelerate") extras["optimum_quanto"] = deps_list("optimum_quanto", "accelerate") extras["torchao"] = deps_list("torchao", "accelerate") +extras["nvidia_modelopt"] = deps_list("nvidia_modelopt", "accelerate") if os.name == "nt": # windows extras["flax"] = [] # jax is not supported on windows diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 8ec95ed6fc8d..5d28e5d72995 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -39,6 +39,7 @@ "gguf": "gguf>=0.10.0", "torchao": "torchao>=0.7.0", "bitsandbytes": "bitsandbytes>=0.43.3", + "nvidia_modelopt": "nvidia_modelopt>=0.25.0", "regex": "regex!=2019.12.17", "requests": "requests", "tensorboard": "tensorboard", diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index ce214ae7bc17..19c2a796c2e1 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -21,9 +21,11 @@ from .bitsandbytes import BnB4BitDiffusersQuantizer, BnB8BitDiffusersQuantizer from .gguf import GGUFQuantizer +from .modelopt import ModelOptQuantizer from .quantization_config import ( BitsAndBytesConfig, GGUFQuantizationConfig, + ModelOptConfig, QuantizationConfigMixin, QuantizationMethod, QuantoConfig, @@ -39,6 +41,7 @@ "gguf": GGUFQuantizer, "quanto": QuantoQuantizer, "torchao": TorchAoHfQuantizer, + "modelopt": ModelOptQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { @@ -47,6 +50,7 @@ "gguf": GGUFQuantizationConfig, "quanto": QuantoConfig, "torchao": TorchAoConfig, + "modelopt": ModelOptConfig, } diff --git a/src/diffusers/quantizers/modelopt/__init__.py b/src/diffusers/quantizers/modelopt/__init__.py new file mode 100644 index 000000000000..343b817bdb23 --- /dev/null +++ b/src/diffusers/quantizers/modelopt/__init__.py @@ -0,0 +1 @@ +from .modelopt_quantizer import ModelOptQuantizer diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py new file mode 100644 index 000000000000..6e372e578b8f --- /dev/null +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -0,0 +1,165 @@ +from typing import TYPE_CHECKING, Any, Dict, List, Union + +from ...utils import ( + get_module_from_name, + is_accelerate_available, + is_nvidia_modelopt_available, + is_nvidia_modelopt_version, + is_torch_available, + logging, +) +from 
..base import DiffusersQuantizer + + +if TYPE_CHECKING: + from ...models.modeling_utils import ModelMixin + + +if is_torch_available(): + import torch + +if is_accelerate_available(): + from accelerate.utils import set_module_tensor_to_device + +if is_nvidia_modelopt_available(): + from .utils import _replace_with_modelopt_layers + +logger = logging.get_logger(__name__) + + +class ModelOptQuantizer(DiffusersQuantizer): + r""" + Diffusers Quantizer for TensorRT Model Optimizer + """ + + use_keep_in_fp32_modules = True + requires_calibration = False + required_packages = ["modelopt"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + + def validate_environment(self, *args, **kwargs): + if not is_nvidia_modelopt_available(): + raise ImportError( + "Loading an nvidia-modelopt quantized model requires nvidia-modelopt library (`pip install nvidia-modelopt`)" + ) + if not is_nvidia_modelopt_version(">=", "0.25.0"): + raise ImportError( + "Loading an nvidia-modelopt quantized model requires `nvidia-modelopt>=0.25.0`. " + "Please upgrade your installation with `pip install --upgrade nvidia-modelopt" + ) + + self.offload = False + + device_map = kwargs.get("device_map", None) + if isinstance(device_map, dict): + if "cpu" in device_map.values() or "disk" in device_map.values(): + if self.pre_quantized: + raise ValueError( + "You are attempting to perform cpu/disk offload with a pre-quantized modelopt model " + "This is not supported yet. Please remove the CPU or disk device from the `device_map` argument." + ) + else: + self.offload = True + + def check_if_quantized_param( + self, + model: "ModelMixin", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ): + # ModelOpt imports diffusers internally. This is here to prevent circular imports + from modelopt.torch.quantization.nn import QuantInputBase, SequentialQuantizer, TensorQuantizer + from modelopt.torch.quantization.qtensor import BaseQuantizedTensor + + def is_param_quantized(module): + for _module in module.modules(): + if isinstance(_module, TensorQuantizer) and not _module._dequantize: + return True + elif isinstance(_module, SequentialQuantizer): + for q in _module: + if isinstance(q, TensorQuantizer) and not q._dequantize: + return True + return False + + module, tensor_name = get_module_from_name(model, param_name) + if self.pre_quantized and any(isinstance(module, t) for t in [BaseQuantizedTensor]): + return True + elif isinstance(module, QuantInputBase) and "weight" in tensor_name: + return is_param_quantized(module) + return False + + def create_quantized_param( + self, + model: "ModelMixin", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + *args, + **kwargs, + ): + """ + Create the quantized parameter by calling .calibrate() after setting it to the module. + """ + # ModelOpt imports diffusers internally. 
This is here to prevent circular imports + import modelopt.torch.quantization as mtq + + dtype = kwargs.get("dtype", torch.float32) + module, tensor_name = get_module_from_name(model, param_name) + if self.pre_quantized: + setattr(module, tensor_name, param_value) + else: + config = self.quantization_config.get_config_from_quant_type() + set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) + module = mtq.calibrate(module, algorithm=config["algorithm"]) + module.weight.requires_grad = False + + def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: + max_memory = {key: val * 0.90 for key, val in max_memory.items()} + return max_memory + + def adjust_target_dtype(self, target_dtype: "torch.dtype") -> "torch.dtype": + if self.quantization_config.quant_type == "FP8": + target_dtype = torch.float8_e4m3fn + return target_dtype + + def update_torch_dtype(self, torch_dtype: "torch.dtype" = None) -> "torch.dtype": + if torch_dtype is None: + logger.info("You did not specify `torch_dtype` in `from_pretrained`. Setting it to `torch.float32`.") + torch_dtype = torch.float32 + return torch_dtype + + def _process_model_before_weight_loading( + self, + model: "ModelMixin", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + self.modules_to_not_convert = self.quantization_config.modules_to_not_convert + + if not isinstance(self.modules_to_not_convert, list): + self.modules_to_not_convert = [self.modules_to_not_convert] + + self.modules_to_not_convert.extend(keep_in_fp32_modules) + + config = self.quantization_config.get_config_from_quant_type() + model = _replace_with_modelopt_layers( + model, + quantization_config=config, + ) + model.config.quantization_config = self.quantization_config + + def _process_model_after_weight_loading(self, model, **kwargs): + return model + + @property + def is_trainable(self): + return True + + @property + def is_serializable(self): + return True diff --git a/src/diffusers/quantizers/modelopt/utils.py b/src/diffusers/quantizers/modelopt/utils.py new file mode 100644 index 000000000000..746ddbf98213 --- /dev/null +++ b/src/diffusers/quantizers/modelopt/utils.py @@ -0,0 +1,13 @@ +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +def _replace_with_modelopt_layers(model, quantization_config): + # ModelOpt imports diffusers internally. These are placed here to avoid circular imports + import modelopt.torch.opt as mto + import modelopt.torch.quantization as mtq + + model = mto.apply_mode(model, mode=[("quantize", quantization_config)], registry=mtq.mode.QuantizeModeRegistry) + return model diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 0bc433be0ff3..02dd553439d1 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -46,6 +46,7 @@ class QuantizationMethod(str, Enum): GGUF = "gguf" TORCHAO = "torchao" QUANTO = "quanto" + MODELOPT = "modelopt" if is_torchao_available(): @@ -722,3 +723,26 @@ def post_init(self): accepted_weights = ["float8", "int8", "int4", "int2"] if self.weights_dtype not in accepted_weights: raise ValueError(f"Only support weights in {accepted_weights} but found {self.weights_dtype}") + + +@dataclass +class ModelOptConfig(QuantizationConfigMixin): + """This is a config class to use nvidia modelopt for quantization. 
+ + Args: + QuantizationConfigMixin (_type_): _description_ + """ + + def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] = None, **kwargs) -> None: + self.quant_method = QuantizationMethod.MODELOPT + self.quant_type = "FP8" + self.modules_to_not_convert = modules_to_not_convert + + def get_config_from_quant_type(self) -> Dict[str, Any]: + """ + Get the config from the quantization type. + """ + # ModelOpt imports diffusers internally. This is here to prevent circular imports + from modelopt.torch.quantization.config import FP8_PER_TENSOR_REAL_QUANT_CFG + + return FP8_PER_TENSOR_REAL_QUANT_CFG diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 50a470772772..b42a88f1132f 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -78,6 +78,8 @@ is_librosa_available, is_matplotlib_available, is_note_seq_available, + is_nvidia_modelopt_available, + is_nvidia_modelopt_version, is_onnx_available, is_optimum_quanto_available, is_optimum_quanto_version, diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 98b9c75451c8..4a9b4ba52b87 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -196,6 +196,14 @@ def _is_package_available(pkg_name: str): except importlib_metadata.PackageNotFoundError: _optimum_quanto_available = False +_nvidia_modelopt_available = importlib.util.find_spec("modelopt") is not None +if _nvidia_modelopt_available: + try: + _nvidia_modelopt_version = importlib_metadata.version("nvidia_modelopt") + logger.debug(f"Successfully import nvidia_modelopt version {_nvidia_modelopt_version}") + except importlib_metadata.PackageNotFoundError: + _nvidia_modelopt_available = False + def is_torch_available(): return _torch_available @@ -329,6 +337,10 @@ def is_optimum_quanto_available(): return _optimum_quanto_available +def is_nvidia_modelopt_available(): + return _nvidia_modelopt_available + + def is_timm_available(): return _timm_available @@ -740,6 +752,21 @@ def is_optimum_quanto_version(operation: str, version: str): return compare_versions(parse(_optimum_quanto_version), operation, version) +def is_nvidia_modelopt_version(operation: str, version: str): + """ + Compares the current Nvidia ModelOpt version to a given reference with an operation. 
+ + Args: + operation (`str`): + A string representation of an operator, such as `">"` or `"<="` + version (`str`): + A version string + """ + if not _nvidia_modelopt_available: + return False + return compare_versions(parse(_nvidia_modelopt_version), operation, version) + + def get_objects_from_module(module): """ Returns a dict of object names and values in a module, while skipping private/internal objects From a016c561c42029a11fc90075e79ac1bdd3b2551a Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Mon, 31 Mar 2025 16:09:42 +0530 Subject: [PATCH 02/31] update --- .../quantizers/quantization_config.py | 77 ++++++++++++++++++- 1 file changed, 74 insertions(+), 3 deletions(-) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 02dd553439d1..c6bf5f056c23 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -735,14 +735,85 @@ class ModelOptConfig(QuantizationConfigMixin): def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] = None, **kwargs) -> None: self.quant_method = QuantizationMethod.MODELOPT - self.quant_type = "FP8" + self.quant_type = quant_type + QUANT_TYPES = [ + "FP8_WO", + "FP8_AINT8", + "INT8_WO", + "INT8_AFP8", + "INT8_AFP8_QKVFP8", + "INT4_WO", + "INT4_AFP8", + "INT4_AFP8_QKVFP8", + ] + if quant_type not in QUANT_TYPES: + logger.warning( + f"Quantization type {quant_type} not supported. Supported types are {QUANT_TYPES}, picking FP8_WO as default" + ) + self.quant_type = "FP8_WO" self.modules_to_not_convert = modules_to_not_convert + self.advanced_quant = kwargs def get_config_from_quant_type(self) -> Dict[str, Any]: """ Get the config from the quantization type. """ # ModelOpt imports diffusers internally. 
This is here to prevent circular imports - from modelopt.torch.quantization.config import FP8_PER_TENSOR_REAL_QUANT_CFG + external_conf = self.advanced_quant.pop("modelopt_config", None) + if external_conf: + return external_conf + + BASE_CONFIG = { + "quant_cfg": { + "*weight_quantizer": {"fake_quant": False}, + "*input_quantizer": {}, + "*output_quantizer": {"enable": False}, + "*q_bmm_quantizer": {}, + "*k_bmm_quantizer": {}, + "*v_bmm_quantizer": {}, + "*softmax_quantizer": {}, + "default": {"enable": False}, + }, + "algorithm": "max", + } + + quant_cfg = BASE_CONFIG["quant_cfg"] + if "FP8" in self.quant_type: + for k in quant_cfg: + if "enable" not in quant_cfg[k]: + quant_cfg[k]["num_bits"] = (4, 3) + elif "INT8" in self.quant_type: + for k in quant_cfg: + if "enable" not in quant_cfg[k]: + quant_cfg[k]["num_bits"] = 8 + elif "INT4" in self.quant_type: + for k in quant_cfg: + if "enable" not in quant_cfg[k]: + quant_cfg[k]["num_bits"] = 4 + else: + raise ValueError(f"Unknown quantization type: {self.quant_type}") + + if "WO" in self.quant_type: + for k in quant_cfg: + if "*weight_quantizer" not in k: + quant_cfg[k]["enable"] = False + + per_channel = self.advanced_quant.pop("per_channel", False) + if per_channel: + quant_cfg["*weight_quantizer"]["axis"] = self.advanced_quant.pop("axis", -1) + quant_cfg["*input_quantizer"]["axis"] = self.advanced_quant.pop("axis", -1) + + block_quantize = self.advanced_quant.pop("block_quantize", False) + if block_quantize: + quant_cfg["*weight_quantizer"]["block_sizes"] = { + self.advanced_quant.pop("axis", -1): self.advanced_quant.pop("block_size", 128) + } + quant_cfg["*input_quantizer"]["block_sizes"] = { + self.advanced_quant.pop("axis", -1): self.advanced_quant.pop("block_size", 128) + } + + if self.modules_to_not_convert is not None: + for module in self.modules_to_not_convert: + quant_cfg["*" + module + "*"] = {"enable": False} - return FP8_PER_TENSOR_REAL_QUANT_CFG + return BASE_CONFIG From eb73ab03562ef801c729fbad0ffdc45c5c7b7737 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 1 Apr 2025 16:57:23 +0530 Subject: [PATCH 03/31] updates --- src/diffusers/__init__.py | 21 +++++++++++++++++++ .../quantizers/quantization_config.py | 10 ++++----- .../utils/dummy_nvidia_modelopt_objects.py | 17 +++++++++++++++ 3 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 src/diffusers/utils/dummy_nvidia_modelopt_objects.py diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 656f9b27db90..7f2bf08a0a71 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -15,6 +15,7 @@ is_note_seq_available, is_onnx_available, is_optimum_quanto_available, + is_nvidia_modelopt_available, is_scipy_available, is_sentencepiece_available, is_torch_available, @@ -107,6 +108,18 @@ else: _import_structure["quantizers.quantization_config"].append("QuantoConfig") +try: + if not is_torch_available() and not is_accelerate_available() and not is_nvidia_modelopt_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_nvidia_modelopt_objects + + _import_structure["utils.dummy_nvidia_modelopt_objects"] = [ + name for name in dir(dummy_nvidia_modelopt_objects) if not name.startswith("_") + ] +else: + _import_structure["quantizers.quantization_config"].append("ModelOptConfig") + try: if not is_onnx_available(): raise OptionalDependencyNotAvailable() @@ -693,6 +706,14 @@ else: from .quantizers.quantization_config import QuantoConfig + try: + if not 
is_nvidia_modelopt_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_nvidia_modelopt_objects import * + else: + from .quantizers.quantization_config import ModelOptConfig + try: if not is_onnx_available(): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index c6bf5f056c23..fa97a0beb5a4 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -738,13 +738,13 @@ def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] self.quant_type = quant_type QUANT_TYPES = [ "FP8_WO", - "FP8_AINT8", + # "FP8_AINT8", "INT8_WO", - "INT8_AFP8", - "INT8_AFP8_QKVFP8", + # "INT8_AFP8", + # "INT8_AFP8_QKVFP8", "INT4_WO", - "INT4_AFP8", - "INT4_AFP8_QKVFP8", + # "INT4_AFP8", + # "INT4_AFP8_QKVFP8", ] if quant_type not in QUANT_TYPES: logger.warning( diff --git a/src/diffusers/utils/dummy_nvidia_modelopt_objects.py b/src/diffusers/utils/dummy_nvidia_modelopt_objects.py new file mode 100644 index 000000000000..58b8760cdf8c --- /dev/null +++ b/src/diffusers/utils/dummy_nvidia_modelopt_objects.py @@ -0,0 +1,17 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class ModelOptConfig(metaclass=DummyObject): + _backends = ["nvidia_modelopt"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["nvidia_modelopt"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["nvidia_modelopt"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["nvidia_modelopt"]) From 7fdb79ec2927f2107dc613cfd89787e26179036e Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 8 Apr 2025 19:17:26 +0530 Subject: [PATCH 04/31] update --- setup.py | 5 +++-- src/diffusers/__init__.py | 2 +- src/diffusers/dependency_versions_table.py | 3 ++- .../quantizers/modelopt/modelopt_quantizer.py | 13 +++++-------- src/diffusers/quantizers/modelopt/utils.py | 13 ------------- 5 files changed, 11 insertions(+), 25 deletions(-) delete mode 100644 src/diffusers/quantizers/modelopt/utils.py diff --git a/setup.py b/setup.py index f3193bd8f176..8be6b416ceb8 100644 --- a/setup.py +++ b/setup.py @@ -119,6 +119,7 @@ "peft>=0.6.0", "protobuf>=3.20.3,<4", "pytest", + "pulp", "pytest-timeout", "pytest-xdist", "python>=3.8.0", @@ -132,7 +133,7 @@ "gguf>=0.10.0", "torchao>=0.7.0", "bitsandbytes>=0.43.3", - "nvidia_modelopt>=0.25.0", + "nvidia_modelopt>=0.27.0", "regex!=2019.12.17", "requests", "tensorboard", @@ -244,7 +245,7 @@ def run(self): extras["gguf"] = deps_list("gguf", "accelerate") extras["optimum_quanto"] = deps_list("optimum_quanto", "accelerate") extras["torchao"] = deps_list("torchao", "accelerate") -extras["nvidia_modelopt"] = deps_list("nvidia_modelopt", "accelerate") +extras["nvidia_modelopt"] = deps_list("nvidia_modelopt", "onnx", "pulp", "accelerate") if os.name == "nt": # windows extras["flax"] = [] # jax is not supported on windows diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 7f2bf08a0a71..5df313996937 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -13,9 +13,9 @@ is_k_diffusion_available, is_librosa_available, is_note_seq_available, + is_nvidia_modelopt_available, is_onnx_available, is_optimum_quanto_available, - is_nvidia_modelopt_available, is_scipy_available, 
is_sentencepiece_available, is_torch_available, diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 5d28e5d72995..1b68b0f86d14 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -26,6 +26,7 @@ "peft": "peft>=0.6.0", "protobuf": "protobuf>=3.20.3,<4", "pytest": "pytest", + "pulp": "pulp", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", "python": "python>=3.8.0", @@ -39,7 +40,7 @@ "gguf": "gguf>=0.10.0", "torchao": "torchao>=0.7.0", "bitsandbytes": "bitsandbytes>=0.43.3", - "nvidia_modelopt": "nvidia_modelopt>=0.25.0", + "nvidia_modelopt": "nvidia_modelopt>=0.27.0", "regex": "regex!=2019.12.17", "requests": "requests", "tensorboard": "tensorboard", diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index 6e372e578b8f..a97d020fceb8 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -21,8 +21,6 @@ if is_accelerate_available(): from accelerate.utils import set_module_tensor_to_device -if is_nvidia_modelopt_available(): - from .utils import _replace_with_modelopt_layers logger = logging.get_logger(__name__) @@ -112,9 +110,8 @@ def create_quantized_param( if self.pre_quantized: setattr(module, tensor_name, param_value) else: - config = self.quantization_config.get_config_from_quant_type() set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) - module = mtq.calibrate(module, algorithm=config["algorithm"]) + mtq.compress(module) module.weight.requires_grad = False def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]: @@ -139,6 +136,9 @@ def _process_model_before_weight_loading( keep_in_fp32_modules: List[str] = [], **kwargs, ): + # ModelOpt imports diffusers internally. This is here to prevent circular imports + import modelopt.torch.quantization as mtq + self.modules_to_not_convert = self.quantization_config.modules_to_not_convert if not isinstance(self.modules_to_not_convert, list): @@ -147,10 +147,7 @@ def _process_model_before_weight_loading( self.modules_to_not_convert.extend(keep_in_fp32_modules) config = self.quantization_config.get_config_from_quant_type() - model = _replace_with_modelopt_layers( - model, - quantization_config=config, - ) + mtq.quantize(model, config) model.config.quantization_config = self.quantization_config def _process_model_after_weight_loading(self, model, **kwargs): diff --git a/src/diffusers/quantizers/modelopt/utils.py b/src/diffusers/quantizers/modelopt/utils.py deleted file mode 100644 index 746ddbf98213..000000000000 --- a/src/diffusers/quantizers/modelopt/utils.py +++ /dev/null @@ -1,13 +0,0 @@ -from ...utils import logging - - -logger = logging.get_logger(__name__) - - -def _replace_with_modelopt_layers(model, quantization_config): - # ModelOpt imports diffusers internally. 
These are placed here to avoid circular imports - import modelopt.torch.opt as mto - import modelopt.torch.quantization as mtq - - model = mto.apply_mode(model, mode=[("quantize", quantization_config)], registry=mtq.mode.QuantizeModeRegistry) - return model From a83bb98f1c9a2258743c21411ed49450d3df2c46 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 8 Apr 2025 19:31:09 +0530 Subject: [PATCH 05/31] update --- setup.py | 3 ++- src/diffusers/dependency_versions_table.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8be6b416ceb8..d797044630d1 100644 --- a/setup.py +++ b/setup.py @@ -129,6 +129,7 @@ "GitPython<3.1.19", "scipy", "onnx", + "torchprofile>=0.0.4", "optimum_quanto>=0.2.6", "gguf>=0.10.0", "torchao>=0.7.0", @@ -245,7 +246,7 @@ def run(self): extras["gguf"] = deps_list("gguf", "accelerate") extras["optimum_quanto"] = deps_list("optimum_quanto", "accelerate") extras["torchao"] = deps_list("torchao", "accelerate") -extras["nvidia_modelopt"] = deps_list("nvidia_modelopt", "onnx", "pulp", "accelerate") +extras["nvidia_modelopt"] = deps_list("nvidia_modelopt", "onnx", "pulp", "torchprofile", "torchvision", "accelerate") if os.name == "nt": # windows extras["flax"] = [] # jax is not supported on windows diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 1b68b0f86d14..20d1ba1caa11 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -36,6 +36,7 @@ "GitPython": "GitPython<3.1.19", "scipy": "scipy", "onnx": "onnx", + "torchprofile": "torchprofile>=0.0.4", "optimum_quanto": "optimum_quanto>=0.2.6", "gguf": "gguf>=0.10.0", "torchao": "torchao>=0.7.0", From 9d9f0b9e001d1712567b91b18129982556eb7c0b Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Thu, 10 Apr 2025 12:31:00 +0530 Subject: [PATCH 06/31] update --- setup.py | 2 +- src/diffusers/__init__.py | 4 ++-- src/diffusers/quantizers/auto.py | 8 ++++---- src/diffusers/quantizers/modelopt/__init__.py | 2 +- src/diffusers/quantizers/modelopt/modelopt_quantizer.py | 2 +- src/diffusers/quantizers/quantization_config.py | 2 +- src/diffusers/utils/dummy_nvidia_modelopt_objects.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index d797044630d1..25c650607146 100644 --- a/setup.py +++ b/setup.py @@ -246,7 +246,7 @@ def run(self): extras["gguf"] = deps_list("gguf", "accelerate") extras["optimum_quanto"] = deps_list("optimum_quanto", "accelerate") extras["torchao"] = deps_list("torchao", "accelerate") -extras["nvidia_modelopt"] = deps_list("nvidia_modelopt", "onnx", "pulp", "torchprofile", "torchvision", "accelerate") +extras["nvidia_modelopt"] = deps_list("nvidia_modelopt", "onnx", "pulp", "torchprofile", "accelerate") if os.name == "nt": # windows extras["flax"] = [] # jax is not supported on windows diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 5df313996937..e37ff35c78f1 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -118,7 +118,7 @@ name for name in dir(dummy_nvidia_modelopt_objects) if not name.startswith("_") ] else: - _import_structure["quantizers.quantization_config"].append("ModelOptConfig") + _import_structure["quantizers.quantization_config"].append("NVIDIAModelOptConfig") try: if not is_onnx_available(): @@ -712,7 +712,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_nvidia_modelopt_objects import * else: - from .quantizers.quantization_config import ModelOptConfig + from 
.quantizers.quantization_config import NVIDIAModelOptConfig try: if not is_onnx_available(): diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index 19c2a796c2e1..f405c8ec5ec0 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -21,11 +21,11 @@ from .bitsandbytes import BnB4BitDiffusersQuantizer, BnB8BitDiffusersQuantizer from .gguf import GGUFQuantizer -from .modelopt import ModelOptQuantizer +from .modelopt import NVIDIAModelOptQuantizer from .quantization_config import ( BitsAndBytesConfig, GGUFQuantizationConfig, - ModelOptConfig, + NVIDIAModelOptConfig, QuantizationConfigMixin, QuantizationMethod, QuantoConfig, @@ -41,7 +41,7 @@ "gguf": GGUFQuantizer, "quanto": QuantoQuantizer, "torchao": TorchAoHfQuantizer, - "modelopt": ModelOptQuantizer, + "modelopt": NVIDIAModelOptQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { @@ -50,7 +50,7 @@ "gguf": GGUFQuantizationConfig, "quanto": QuantoConfig, "torchao": TorchAoConfig, - "modelopt": ModelOptConfig, + "modelopt": NVIDIAModelOptConfig, } diff --git a/src/diffusers/quantizers/modelopt/__init__.py b/src/diffusers/quantizers/modelopt/__init__.py index 343b817bdb23..ae0951cb30d1 100644 --- a/src/diffusers/quantizers/modelopt/__init__.py +++ b/src/diffusers/quantizers/modelopt/__init__.py @@ -1 +1 @@ -from .modelopt_quantizer import ModelOptQuantizer +from .modelopt_quantizer import NVIDIAModelOptQuantizer diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index a97d020fceb8..abffc958efd6 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -25,7 +25,7 @@ logger = logging.get_logger(__name__) -class ModelOptQuantizer(DiffusersQuantizer): +class NVIDIAModelOptQuantizer(DiffusersQuantizer): r""" Diffusers Quantizer for TensorRT Model Optimizer """ diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index fa97a0beb5a4..0cb8ddcfc427 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -726,7 +726,7 @@ def post_init(self): @dataclass -class ModelOptConfig(QuantizationConfigMixin): +class NVIDIAModelOptConfig(QuantizationConfigMixin): """This is a config class to use nvidia modelopt for quantization. 
Args: diff --git a/src/diffusers/utils/dummy_nvidia_modelopt_objects.py b/src/diffusers/utils/dummy_nvidia_modelopt_objects.py index 58b8760cdf8c..046b28223b3d 100644 --- a/src/diffusers/utils/dummy_nvidia_modelopt_objects.py +++ b/src/diffusers/utils/dummy_nvidia_modelopt_objects.py @@ -2,7 +2,7 @@ from ..utils import DummyObject, requires_backends -class ModelOptConfig(metaclass=DummyObject): +class NVIDIAModelOptConfig(metaclass=DummyObject): _backends = ["nvidia_modelopt"] def __init__(self, *args, **kwargs): From 7b09750b75acc0a4467de7de9c1f36b00c05618d Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Mon, 21 Apr 2025 18:44:36 +0530 Subject: [PATCH 07/31] update --- .../quantizers/modelopt/modelopt_quantizer.py | 5 ++-- .../quantizers/quantization_config.py | 28 ++++++++----------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index abffc958efd6..ced4fdd869d1 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -111,6 +111,7 @@ def create_quantized_param( setattr(module, tensor_name, param_value) else: set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) + mtq.calibrate(module, self.quantization_config.modelopt_config["algorithm"]) mtq.compress(module) module.weight.requires_grad = False @@ -137,6 +138,7 @@ def _process_model_before_weight_loading( **kwargs, ): # ModelOpt imports diffusers internally. This is here to prevent circular imports + import modelopt.torch.opt as mto import modelopt.torch.quantization as mtq self.modules_to_not_convert = self.quantization_config.modules_to_not_convert @@ -146,8 +148,7 @@ def _process_model_before_weight_loading( self.modules_to_not_convert.extend(keep_in_fp32_modules) - config = self.quantization_config.get_config_from_quant_type() - mtq.quantize(model, config) + mto.apply_mode(model, mode=[("quantize", self.quantization_config.modelopt_config)], registry=mtq.mode.QuantizeModeRegistry) model.config.quantization_config = self.quantization_config def _process_model_after_weight_loading(self, model, **kwargs): diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 0cb8ddcfc427..8d51ce8f5970 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -699,7 +699,7 @@ class QuantoConfig(QuantizationConfigMixin): Args: weights_dtype (`str`, *optional*, defaults to `"int8"`): The target dtype for the weights after quantization. Supported values are ("float8","int8","int4","int2") - modules_to_not_convert (`list`, *optional*, default to `None`): + modules_to_not_convert (`list`, *optional*, default to `None`): The list of modules to not quantize, useful for quantizing models that explicitly require to have some modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers). 
""" @@ -733,7 +733,7 @@ class NVIDIAModelOptConfig(QuantizationConfigMixin): QuantizationConfigMixin (_type_): _description_ """ - def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] = None, **kwargs) -> None: + def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] = None, channel_quantize: Optional[int] = None, block_quantize: Optional[int] = None, modelopt_config: Optional[dict] = None, **kwargs) -> None: self.quant_method = QuantizationMethod.MODELOPT self.quant_type = quant_type QUANT_TYPES = [ @@ -752,17 +752,14 @@ def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] ) self.quant_type = "FP8_WO" self.modules_to_not_convert = modules_to_not_convert - self.advanced_quant = kwargs + self.channel_quantize = channel_quantize + self.block_quantize = block_quantize + self.modelopt_config = self.get_config_from_quant_type() if not modelopt_config else modelopt_config def get_config_from_quant_type(self) -> Dict[str, Any]: """ Get the config from the quantization type. """ - # ModelOpt imports diffusers internally. This is here to prevent circular imports - external_conf = self.advanced_quant.pop("modelopt_config", None) - if external_conf: - return external_conf - BASE_CONFIG = { "quant_cfg": { "*weight_quantizer": {"fake_quant": False}, @@ -798,19 +795,16 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: if "*weight_quantizer" not in k: quant_cfg[k]["enable"] = False - per_channel = self.advanced_quant.pop("per_channel", False) - if per_channel: - quant_cfg["*weight_quantizer"]["axis"] = self.advanced_quant.pop("axis", -1) - quant_cfg["*input_quantizer"]["axis"] = self.advanced_quant.pop("axis", -1) - - block_quantize = self.advanced_quant.pop("block_quantize", False) - if block_quantize: + if self.block_quantize and self.channel_quantize: quant_cfg["*weight_quantizer"]["block_sizes"] = { - self.advanced_quant.pop("axis", -1): self.advanced_quant.pop("block_size", 128) + self.channel_quantize: self.block_quantize } quant_cfg["*input_quantizer"]["block_sizes"] = { - self.advanced_quant.pop("axis", -1): self.advanced_quant.pop("block_size", 128) + self.channel_quantize: self.block_quantize } + elif self.channel_quantize: + quant_cfg["*weight_quantizer"]["axis"] = self.channel_quantize + quant_cfg["*input_quantizer"]["axis"] = self.channel_quantize if self.modules_to_not_convert is not None: for module in self.modules_to_not_convert: From 71d8a7ededd752d1c60c15d0f369456ed655330e Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Thu, 24 Apr 2025 04:43:42 +0530 Subject: [PATCH 08/31] update --- .../quantizers/quantization_config.py | 117 ++++++++++++------ 1 file changed, 82 insertions(+), 35 deletions(-) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 8d51ce8f5970..cba633ad6177 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -730,36 +730,75 @@ class NVIDIAModelOptConfig(QuantizationConfigMixin): """This is a config class to use nvidia modelopt for quantization. Args: - QuantizationConfigMixin (_type_): _description_ + quant_type (`str`): + The type of quantization we want to use, following is how to use: + **weightquant_activationquant --> FP8_FP8** + In the above example we have use FP8 for both weight and activation quantization. 
+ Following are the all the options: + - FP8 + - INT8 + - INT4 + - NF4 + - NVFP4 + modules_to_not_convert (`List[str]`, *optional*, default to `None`): + The list of modules to not quantize, useful for quantizing models that explicitly require to have some + weight_only (`bool`, *optional*, default to `False`): + If set to `True`, the quantization will be applied only to the weights of the model. + channel_quantize (`int`, *optional*, default to `None`): + The channel quantization axis, useful for quantizing models across different axes. + block_quantize (`int`, *optional*, default to `None`): + The block size, useful to further quantize each channel/axes into blocks. + algorithm (`str`, *optional*, default to `"max"`): + The algorithm to use for quantization, currently only supports `"max"`. + modelopt_config (`dict`, *optional*, default to `None`): + The modelopt config, useful for passing custom configs to modelopt. """ - def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] = None, channel_quantize: Optional[int] = None, block_quantize: Optional[int] = None, modelopt_config: Optional[dict] = None, **kwargs) -> None: + def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] = None, weight_only: bool=True, channel_quantize: Optional[int] = None, block_quantize: Optional[int] = None, algorithm: str = "max", modelopt_config: Optional[dict] = None, **kwargs) -> None: self.quant_method = QuantizationMethod.MODELOPT self.quant_type = quant_type - QUANT_TYPES = [ - "FP8_WO", - # "FP8_AINT8", - "INT8_WO", - # "INT8_AFP8", - # "INT8_AFP8_QKVFP8", - "INT4_WO", - # "INT4_AFP8", - # "INT4_AFP8_QKVFP8", - ] - if quant_type not in QUANT_TYPES: + self.type_bit_map = { + "FP8": (4, 3), + # "INT8": 8, # TODO: enable this upon modelopt release https://github.com/NVIDIA/TensorRT-Model-Optimizer/pull/166 + "INT4": 4, + "NF4": 4, + "NVFP4": (2,1), + } + + parts = self.quant_type.split("_") + w_type = parts[0] + act_type = parts[1] if len(parts) > 1 else None + if len(parts) > 2: logger.warning( - f"Quantization type {quant_type} not supported. Supported types are {QUANT_TYPES}, picking FP8_WO as default" + f"Quantization type {self.quant_type} is not supported. Picking FP8_INT8 as default" ) - self.quant_type = "FP8_WO" + w_type = "FP8" + act_type = None + else: + if w_type not in self.type_bit_map: + logger.warning( + f"Weight Quantization type {w_type} is not supported. Picking FP8 as default" + ) + w_type = "FP8" + if act_type is not None and act_type not in self.type_bit_map: + logger.warning( + f"Activation Quantization type {act_type} is not supported. Picking INT8 as default" + ) + act_type = None + self.quant_type = w_type + ("_" + act_type if act_type is not None else "") self.modules_to_not_convert = modules_to_not_convert + self.weight_only = weight_only self.channel_quantize = channel_quantize self.block_quantize = block_quantize + self.algorithm = algorithm self.modelopt_config = self.get_config_from_quant_type() if not modelopt_config else modelopt_config def get_config_from_quant_type(self) -> Dict[str, Any]: """ Get the config from the quantization type. 
""" + import modelopt.torch.quantization as mtq + BASE_CONFIG = { "quant_cfg": { "*weight_quantizer": {"fake_quant": False}, @@ -769,42 +808,50 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: "*k_bmm_quantizer": {}, "*v_bmm_quantizer": {}, "*softmax_quantizer": {}, - "default": {"enable": False}, + **mtq.config._default_disabled_quantizer_cfg, }, - "algorithm": "max", + "algorithm": self.algorithm, } quant_cfg = BASE_CONFIG["quant_cfg"] - if "FP8" in self.quant_type: - for k in quant_cfg: - if "enable" not in quant_cfg[k]: - quant_cfg[k]["num_bits"] = (4, 3) - elif "INT8" in self.quant_type: + if self.weight_only: for k in quant_cfg: - if "enable" not in quant_cfg[k]: - quant_cfg[k]["num_bits"] = 8 - elif "INT4" in self.quant_type: - for k in quant_cfg: - if "enable" not in quant_cfg[k]: - quant_cfg[k]["num_bits"] = 4 - else: - raise ValueError(f"Unknown quantization type: {self.quant_type}") - - if "WO" in self.quant_type: - for k in quant_cfg: - if "*weight_quantizer" not in k: + if "*weight_quantizer" not in k and not quant_cfg[k]: quant_cfg[k]["enable"] = False + parts = self.quant_type.split("_") + w_type = parts[0] + act_type = parts[1].replace("A", "") if len(parts) > 1 else None + for k in quant_cfg: + if k not in mtq.config._default_disabled_quantizer_cfg and "enable" not in quant_cfg[k]: + if k == "*input_quantizer": + if act_type is not None: + quant_cfg[k]["num_bits"] = self.type_bit_map[act_type] + continue + quant_cfg[k]["num_bits"] = self.type_bit_map[w_type] + if self.block_quantize and self.channel_quantize: quant_cfg["*weight_quantizer"]["block_sizes"] = { self.channel_quantize: self.block_quantize } quant_cfg["*input_quantizer"]["block_sizes"] = { - self.channel_quantize: self.block_quantize + self.channel_quantize: self.block_quantize, "type": "dynamic" } elif self.channel_quantize: quant_cfg["*weight_quantizer"]["axis"] = self.channel_quantize quant_cfg["*input_quantizer"]["axis"] = self.channel_quantize + quant_cfg["*input_quantizer"]["type"] = "dynamic" + + # Only fixed sizes are supported for now in modelopt + if "NF4" in w_type: + BASE_CONFIG["quant_cfg"]["*weight_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.channel_quantize: self.block_quantize}}) + elif "NVFP4" in w_type: + BASE_CONFIG["quant_cfg"]["*weight_quantizer"]["block_sizes"].update({"scale_bits":(4,3), "type": "dynamic"}) + if act_type: + if "NF4" in act_type: + BASE_CONFIG["quant_cfg"]["*input_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.channel_quantize: self.block_quantize}}) + elif "NVFP4" in act_type: + BASE_CONFIG["quant_cfg"]["*input_quantizer"]["block_sizes"].update({"scale_bits":(4,3), "type": "dynamic"}) if self.modules_to_not_convert is not None: for module in self.modules_to_not_convert: From 6c74c69f00bd6c0d46c697b334c63b93d0c0bf39 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Thu, 24 Apr 2025 14:33:47 +0530 Subject: [PATCH 09/31] update --- src/diffusers/quantizers/quantization_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index cba633ad6177..7172aa4362dd 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -761,7 +761,7 @@ def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] "FP8": (4, 3), # "INT8": 8, # TODO: enable this upon modelopt release 
https://github.com/NVIDIA/TensorRT-Model-Optimizer/pull/166 "INT4": 4, - "NF4": 4, + # "NF4": 4, # TODO: enable this upon modelopt release https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/183 "NVFP4": (2,1), } From 6c65138331d3d1727525dd6196ee3c5e5c45f9e6 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 29 Apr 2025 17:52:21 +0530 Subject: [PATCH 10/31] addressed PR comments --- setup.py | 6 ++---- src/diffusers/dependency_versions_table.py | 4 +--- src/diffusers/quantizers/quantization_config.py | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index 7a03d2c10408..a9f255b99465 100644 --- a/setup.py +++ b/setup.py @@ -119,7 +119,6 @@ "peft>=0.6.0", "protobuf>=3.20.3,<4", "pytest", - "pulp", "pytest-timeout", "pytest-xdist", "python>=3.8.0", @@ -129,12 +128,11 @@ "GitPython<3.1.19", "scipy", "onnx", - "torchprofile>=0.0.4", "optimum_quanto>=0.2.6", "gguf>=0.10.0", "torchao>=0.7.0", "bitsandbytes>=0.43.3", - "nvidia_modelopt>=0.27.0", + "nvidia_modelopt[torch, hf]>=0.27.0", "regex!=2019.12.17", "requests", "tensorboard", @@ -247,7 +245,7 @@ def run(self): extras["gguf"] = deps_list("gguf", "accelerate") extras["optimum_quanto"] = deps_list("optimum_quanto", "accelerate") extras["torchao"] = deps_list("torchao", "accelerate") -extras["nvidia_modelopt"] = deps_list("nvidia_modelopt", "onnx", "pulp", "torchprofile", "accelerate") +extras["nvidia_modelopt"] = deps_list("nvidia_modelopt[torch, hf]") if os.name == "nt": # windows extras["flax"] = [] # jax is not supported on windows diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index bec756ed69d0..be7daa65c15f 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -26,7 +26,6 @@ "peft": "peft>=0.6.0", "protobuf": "protobuf>=3.20.3,<4", "pytest": "pytest", - "pulp": "pulp", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", "python": "python>=3.8.0", @@ -36,12 +35,11 @@ "GitPython": "GitPython<3.1.19", "scipy": "scipy", "onnx": "onnx", - "torchprofile": "torchprofile>=0.0.4", "optimum_quanto": "optimum_quanto>=0.2.6", "gguf": "gguf>=0.10.0", "torchao": "torchao>=0.7.0", "bitsandbytes": "bitsandbytes>=0.43.3", - "nvidia_modelopt": "nvidia_modelopt>=0.27.0", + "nvidia_modelopt[torch, hf]": "nvidia_modelopt[torch, hf]>=0.27.0", "regex": "regex!=2019.12.17", "requests": "requests", "tensorboard": "tensorboard", diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 7172aa4362dd..788dc8337bc5 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -699,7 +699,7 @@ class QuantoConfig(QuantizationConfigMixin): Args: weights_dtype (`str`, *optional*, defaults to `"int8"`): The target dtype for the weights after quantization. Supported values are ("float8","int8","int4","int2") - modules_to_not_convert (`list`, *optional*, default to `None`): + modules_to_not_convert (`list`, *optional*, default to `None`): The list of modules to not quantize, useful for quantizing models that explicitly require to have some modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers). 
""" From 915dbf02ccc288427eb805187db9c8af02872973 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Wed, 30 Apr 2025 11:03:33 +0530 Subject: [PATCH 11/31] update --- src/diffusers/quantizers/modelopt/modelopt_quantizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index ced4fdd869d1..3ead4c2e4c94 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -139,7 +139,6 @@ def _process_model_before_weight_loading( ): # ModelOpt imports diffusers internally. This is here to prevent circular imports import modelopt.torch.opt as mto - import modelopt.torch.quantization as mtq self.modules_to_not_convert = self.quantization_config.modules_to_not_convert @@ -148,7 +147,7 @@ def _process_model_before_weight_loading( self.modules_to_not_convert.extend(keep_in_fp32_modules) - mto.apply_mode(model, mode=[("quantize", self.quantization_config.modelopt_config)], registry=mtq.mode.QuantizeModeRegistry) + mto.apply_mode(model, mode=[("quantize", self.quantization_config.modelopt_config)]) model.config.quantization_config = self.quantization_config def _process_model_after_weight_loading(self, model, **kwargs): From f823a2cb3960494adf92b277b0680d387edaf1bd Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 6 May 2025 09:45:28 +0530 Subject: [PATCH 12/31] addressed PR comments --- .../quantizers/modelopt/modelopt_quantizer.py | 31 +++++++------------ .../quantizers/quantization_config.py | 16 ++++------ 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index 3ead4c2e4c94..f2798b4d1b75 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -70,25 +70,13 @@ def check_if_quantized_param( **kwargs, ): # ModelOpt imports diffusers internally. This is here to prevent circular imports - from modelopt.torch.quantization.nn import QuantInputBase, SequentialQuantizer, TensorQuantizer from modelopt.torch.quantization.qtensor import BaseQuantizedTensor + from modelopt.torch.quantization.utils import is_quantized - def is_param_quantized(module): - for _module in module.modules(): - if isinstance(_module, TensorQuantizer) and not _module._dequantize: - return True - elif isinstance(_module, SequentialQuantizer): - for q in _module: - if isinstance(q, TensorQuantizer) and not q._dequantize: - return True - return False - - module, tensor_name = get_module_from_name(model, param_name) + module, _ = get_module_from_name(model, param_name) if self.pre_quantized and any(isinstance(module, t) for t in [BaseQuantizedTensor]): return True - elif isinstance(module, QuantInputBase) and "weight" in tensor_name: - return is_param_quantized(module) - return False + return is_quantized(module) def create_quantized_param( self, @@ -140,12 +128,17 @@ def _process_model_before_weight_loading( # ModelOpt imports diffusers internally. 
This is here to prevent circular imports import modelopt.torch.opt as mto - self.modules_to_not_convert = self.quantization_config.modules_to_not_convert + modules_to_not_convert = self.quantization_config.modules_to_not_convert - if not isinstance(self.modules_to_not_convert, list): - self.modules_to_not_convert = [self.modules_to_not_convert] + if modules_to_not_convert is None: + modules_to_not_convert = [] + if isinstance(modules_to_not_convert, str): + modules_to_not_convert = [modules_to_not_convert] + modules_to_not_convert.extend(keep_in_fp32_modules) - self.modules_to_not_convert.extend(keep_in_fp32_modules) + for module in modules_to_not_convert: + self.quantization_config.modelopt_config["quant_cfg"]["*" + module + "*"] = {"enable": False} + self.quantization_config.modules_to_not_convert = modules_to_not_convert mto.apply_mode(model, mode=[("quantize", self.quantization_config.modelopt_config)]) model.config.quantization_config = self.quantization_config diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 788dc8337bc5..f4507b5b2575 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -798,7 +798,7 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: Get the config from the quantization type. """ import modelopt.torch.quantization as mtq - + BASE_CONFIG = { "quant_cfg": { "*weight_quantizer": {"fake_quant": False}, @@ -829,7 +829,7 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: quant_cfg[k]["num_bits"] = self.type_bit_map[act_type] continue quant_cfg[k]["num_bits"] = self.type_bit_map[w_type] - + if self.block_quantize and self.channel_quantize: quant_cfg["*weight_quantizer"]["block_sizes"] = { self.channel_quantize: self.block_quantize @@ -844,17 +844,13 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: # Only fixed sizes are supported for now in modelopt if "NF4" in w_type: - BASE_CONFIG["quant_cfg"]["*weight_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.channel_quantize: self.block_quantize}}) + quant_cfg["*weight_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.channel_quantize: self.block_quantize}}) elif "NVFP4" in w_type: - BASE_CONFIG["quant_cfg"]["*weight_quantizer"]["block_sizes"].update({"scale_bits":(4,3), "type": "dynamic"}) + quant_cfg["*weight_quantizer"]["block_sizes"].update({"scale_bits":(4,3), "type": "dynamic"}) if act_type: if "NF4" in act_type: - BASE_CONFIG["quant_cfg"]["*input_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.channel_quantize: self.block_quantize}}) + quant_cfg["*input_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.channel_quantize: self.block_quantize}}) elif "NVFP4" in act_type: - BASE_CONFIG["quant_cfg"]["*input_quantizer"]["block_sizes"].update({"scale_bits":(4,3), "type": "dynamic"}) - - if self.modules_to_not_convert is not None: - for module in self.modules_to_not_convert: - quant_cfg["*" + module + "*"] = {"enable": False} + quant_cfg["*input_quantizer"]["block_sizes"].update({"scale_bits":(4,3), "type": "dynamic"}) return BASE_CONFIG From 8f88f295d35a62e6f59bdd03efdb890ba2ed23ab Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 6 May 2025 12:20:07 +0530 Subject: [PATCH 13/31] update --- .../quantizers/quantization_config.py | 90 +++++++++++++------ 1 file changed, 65 insertions(+), 25 deletions(-) diff --git 
a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index f4507b5b2575..27fcd462b9a3 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -28,7 +28,7 @@ from dataclasses import dataclass from enum import Enum from functools import partial -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union from packaging import version @@ -732,7 +732,7 @@ class NVIDIAModelOptConfig(QuantizationConfigMixin): Args: quant_type (`str`): The type of quantization we want to use, following is how to use: - **weightquant_activationquant --> FP8_FP8** + **weightquant_activationquant ==> FP8_FP8** In the above example we have use FP8 for both weight and activation quantization. Following are the all the options: - FP8 @@ -748,50 +748,90 @@ class NVIDIAModelOptConfig(QuantizationConfigMixin): The channel quantization axis, useful for quantizing models across different axes. block_quantize (`int`, *optional*, default to `None`): The block size, useful to further quantize each channel/axes into blocks. + scale_channel_quantize (`int`, *optional*, default to `None`): + The scale channel quantization axis, useful for quantizing calculated scale across different axes. + scale_block_quantize (`int`, *optional*, default to `None`): + The scale block size, useful for quantizing each scale channel/axes into blocks. algorithm (`str`, *optional*, default to `"max"`): The algorithm to use for quantization, currently only supports `"max"`. + forward_loop (`Callable`, *optional*, default to `None`): + The forward loop function to use for calibration during quantization. modelopt_config (`dict`, *optional*, default to `None`): The modelopt config, useful for passing custom configs to modelopt. + kwargs (`Dict[str, Any]`, *optional*): + Additional parameters which are to be used for calibration. 
""" + quanttype_to_numbits = { + "FP8": (4, 3), + # "INT8": 8, # TODO: enable this upon modelopt release https://github.com/NVIDIA/TensorRT-Model-Optimizer/pull/166 + "INT4": 4, + # "NF4": 4, # TODO: enable this upon modelopt release https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/183 + "NVFP4": (2,1), + } - def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] = None, weight_only: bool=True, channel_quantize: Optional[int] = None, block_quantize: Optional[int] = None, algorithm: str = "max", modelopt_config: Optional[dict] = None, **kwargs) -> None: + def __init__( + self, + quant_type: str, + modules_to_not_convert: Optional[List[str]] = None, + weight_only: bool=True, + channel_quantize: Optional[int] = None, + block_quantize: Optional[int] = None, + scale_channel_quantize: Optional[int] = None, + scale_block_quantize: Optional[int] = None, + algorithm: str = "max", + forward_loop: Optional[Callable] = None, + modelopt_config: Optional[dict] = None, + **kwargs + ) -> None: self.quant_method = QuantizationMethod.MODELOPT - self.quant_type = quant_type - self.type_bit_map = { - "FP8": (4, 3), - # "INT8": 8, # TODO: enable this upon modelopt release https://github.com/NVIDIA/TensorRT-Model-Optimizer/pull/166 - "INT4": 4, - # "NF4": 4, # TODO: enable this upon modelopt release https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/183 - "NVFP4": (2,1), + self._normalize_quant_type(quant_type) + self.modules_to_not_convert = modules_to_not_convert + self.weight_only = weight_only + self.channel_quantize = channel_quantize + self.block_quantize = block_quantize + self.calib_cfg = { + "method": algorithm, + "forward_loop": forward_loop, + **kwargs, } + self.scale_channel_quantize = scale_channel_quantize + self.scale_block_quantize = scale_block_quantize + self.modelopt_config = self.get_config_from_quant_type() if not modelopt_config else modelopt_config - parts = self.quant_type.split("_") + def _normalize_quant_type(self, quant_type: str) -> str: + """ + Validates and normalizes the quantization type string. + + Splits the quant_type into weight and activation components, verifies them + against supported types, and replaces unsupported values with safe defaults. + + Args: + quant_type (str): The input quantization type string (e.g., 'FP8_INT8'). + + Returns: + str: A valid quantization type string (e.g., 'FP8_INT8' or 'FP8'). + """ + parts = quant_type.split("_") w_type = parts[0] act_type = parts[1] if len(parts) > 1 else None if len(parts) > 2: logger.warning( - f"Quantization type {self.quant_type} is not supported. Picking FP8_INT8 as default" + f"Quantization type {quant_type} is not supported. Picking FP8_INT8 as default" ) w_type = "FP8" act_type = None else: - if w_type not in self.type_bit_map: + if w_type not in NVIDIAModelOptConfig.quanttype_to_numbits: logger.warning( f"Weight Quantization type {w_type} is not supported. Picking FP8 as default" ) w_type = "FP8" - if act_type is not None and act_type not in self.type_bit_map: + if act_type is not None and act_type not in NVIDIAModelOptConfig.quanttype_to_numbits: logger.warning( f"Activation Quantization type {act_type} is not supported. 
Picking INT8 as default" ) act_type = None self.quant_type = w_type + ("_" + act_type if act_type is not None else "") - self.modules_to_not_convert = modules_to_not_convert - self.weight_only = weight_only - self.channel_quantize = channel_quantize - self.block_quantize = block_quantize - self.algorithm = algorithm - self.modelopt_config = self.get_config_from_quant_type() if not modelopt_config else modelopt_config def get_config_from_quant_type(self) -> Dict[str, Any]: """ @@ -810,7 +850,7 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: "*softmax_quantizer": {}, **mtq.config._default_disabled_quantizer_cfg, }, - "algorithm": self.algorithm, + "algorithm": self.calib_cfg, } quant_cfg = BASE_CONFIG["quant_cfg"] @@ -826,9 +866,9 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: if k not in mtq.config._default_disabled_quantizer_cfg and "enable" not in quant_cfg[k]: if k == "*input_quantizer": if act_type is not None: - quant_cfg[k]["num_bits"] = self.type_bit_map[act_type] + quant_cfg[k]["num_bits"] = NVIDIAModelOptConfig.quanttype_to_numbits[act_type] continue - quant_cfg[k]["num_bits"] = self.type_bit_map[w_type] + quant_cfg[k]["num_bits"] = NVIDIAModelOptConfig.quanttype_to_numbits[w_type] if self.block_quantize and self.channel_quantize: quant_cfg["*weight_quantizer"]["block_sizes"] = { @@ -844,12 +884,12 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: # Only fixed sizes are supported for now in modelopt if "NF4" in w_type: - quant_cfg["*weight_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.channel_quantize: self.block_quantize}}) + quant_cfg["*weight_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.scale_channel_quantize: self.scale_block_quantize}}) elif "NVFP4" in w_type: quant_cfg["*weight_quantizer"]["block_sizes"].update({"scale_bits":(4,3), "type": "dynamic"}) if act_type: if "NF4" in act_type: - quant_cfg["*input_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.channel_quantize: self.block_quantize}}) + quant_cfg["*input_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.scale_channel_quantize: self.scale_block_quantize}}) elif "NVFP4" in act_type: quant_cfg["*input_quantizer"]["block_sizes"].update({"scale_bits":(4,3), "type": "dynamic"}) From 212603fb0d58e7f4412a4c19316ab8bd7337075c Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 6 May 2025 13:45:41 +0530 Subject: [PATCH 14/31] update --- src/diffusers/quantizers/quantization_config.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 27fcd462b9a3..1f84f59ca3d7 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -791,8 +791,7 @@ def __init__( self.block_quantize = block_quantize self.calib_cfg = { "method": algorithm, - "forward_loop": forward_loop, - **kwargs, + "forward_loop": forward_loop } self.scale_channel_quantize = scale_channel_quantize self.scale_block_quantize = scale_block_quantize From 24f1bcb6901245866579bdce54a1c8de78887af4 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 6 May 2025 13:50:08 +0530 Subject: [PATCH 15/31] update --- src/diffusers/quantizers/modelopt/modelopt_quantizer.py | 2 +- src/diffusers/quantizers/quantization_config.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py 
b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index f2798b4d1b75..e1307589bd2f 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -99,7 +99,7 @@ def create_quantized_param( setattr(module, tensor_name, param_value) else: set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) - mtq.calibrate(module, self.quantization_config.modelopt_config["algorithm"]) + mtq.calibrate(module, self.quantization_config.modelopt_config["algorithm"], self.quantization_config.modelopt_config.forward_loop) mtq.compress(module) module.weight.requires_grad = False diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 1f84f59ca3d7..142788b0d6de 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -791,8 +791,9 @@ def __init__( self.block_quantize = block_quantize self.calib_cfg = { "method": algorithm, - "forward_loop": forward_loop + # add more options here if needed } + self.forward_loop = forward_loop self.scale_channel_quantize = scale_channel_quantize self.scale_block_quantize = scale_block_quantize self.modelopt_config = self.get_config_from_quant_type() if not modelopt_config else modelopt_config From 65097f179da30d1a4cd53a1d602479230f33d0a0 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 6 May 2025 13:52:19 +0530 Subject: [PATCH 16/31] update --- src/diffusers/quantizers/modelopt/modelopt_quantizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index e1307589bd2f..e4b4de8358f1 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -99,7 +99,7 @@ def create_quantized_param( setattr(module, tensor_name, param_value) else: set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) - mtq.calibrate(module, self.quantization_config.modelopt_config["algorithm"], self.quantization_config.modelopt_config.forward_loop) + mtq.calibrate(module, self.quantization_config.modelopt_config["algorithm"], self.quantization_config.forward_loop) mtq.compress(module) module.weight.requires_grad = False From 97f94ae3b3abb2f0eb9a0b58c5eb6281b32762c6 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 6 May 2025 17:12:41 +0530 Subject: [PATCH 17/31] update --- src/diffusers/quantizers/modelopt/modelopt_quantizer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index e4b4de8358f1..8d457484efcf 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -73,10 +73,12 @@ def check_if_quantized_param( from modelopt.torch.quantization.qtensor import BaseQuantizedTensor from modelopt.torch.quantization.utils import is_quantized - module, _ = get_module_from_name(model, param_name) + module, tensor_name = get_module_from_name(model, param_name) if self.pre_quantized and any(isinstance(module, t) for t in [BaseQuantizedTensor]): return True - return is_quantized(module) + elif is_quantized(module) and "weight" in tensor_name: + return True + return False def create_quantized_param( self, From 752544f6af86ba63af68fa31cdab8799a0bc6c39 Mon Sep 17 
00:00:00 2001 From: ishan-modi Date: Fri, 9 May 2025 16:15:56 +0530 Subject: [PATCH 18/31] update --- src/diffusers/quantizers/quantization_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 142788b0d6de..316bd1e01357 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -763,7 +763,7 @@ class NVIDIAModelOptConfig(QuantizationConfigMixin): """ quanttype_to_numbits = { "FP8": (4, 3), - # "INT8": 8, # TODO: enable this upon modelopt release https://github.com/NVIDIA/TensorRT-Model-Optimizer/pull/166 + "INT8": 8, "INT4": 4, # "NF4": 4, # TODO: enable this upon modelopt release https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/183 "NVFP4": (2,1), From 482fe78cf0cdcb939074c79ed6ae75a8965d9df1 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Mon, 21 Jul 2025 23:34:31 +0530 Subject: [PATCH 19/31] updates --- .../quantizers/modelopt/modelopt_quantizer.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index 8d457484efcf..d5b42ddb10a0 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -70,11 +70,10 @@ def check_if_quantized_param( **kwargs, ): # ModelOpt imports diffusers internally. This is here to prevent circular imports - from modelopt.torch.quantization.qtensor import BaseQuantizedTensor from modelopt.torch.quantization.utils import is_quantized module, tensor_name = get_module_from_name(model, param_name) - if self.pre_quantized and any(isinstance(module, t) for t in [BaseQuantizedTensor]): + if self.pre_quantized: return True elif is_quantized(module) and "weight" in tensor_name: return True @@ -98,9 +97,9 @@ def create_quantized_param( dtype = kwargs.get("dtype", torch.float32) module, tensor_name = get_module_from_name(model, param_name) if self.pre_quantized: - setattr(module, tensor_name, param_value) + module._parameters[tensor_name] = torch.nn.Parameter(param_value.to(device=target_device)) else: - set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) + set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) mtq.calibrate(module, self.quantization_config.modelopt_config["algorithm"], self.quantization_config.forward_loop) mtq.compress(module) module.weight.requires_grad = False @@ -129,6 +128,9 @@ def _process_model_before_weight_loading( ): # ModelOpt imports diffusers internally. This is here to prevent circular imports import modelopt.torch.opt as mto + + if self.pre_quantized: + return modules_to_not_convert = self.quantization_config.modules_to_not_convert @@ -141,11 +143,20 @@ def _process_model_before_weight_loading( for module in modules_to_not_convert: self.quantization_config.modelopt_config["quant_cfg"]["*" + module + "*"] = {"enable": False} self.quantization_config.modules_to_not_convert = modules_to_not_convert - mto.apply_mode(model, mode=[("quantize", self.quantization_config.modelopt_config)]) model.config.quantization_config = self.quantization_config def _process_model_after_weight_loading(self, model, **kwargs): + # ModelOpt imports diffusers internally. 
This is here to prevent circular imports + from modelopt.torch.opt import ModeloptStateManager + + if self.pre_quantized: + return model + + for _, m in model.named_modules(): + if hasattr(m, ModeloptStateManager._state_key) and m is not model: + ModeloptStateManager.remove_state(m) + return model @property From d48835d4803d104e98384b15795d7418935ba491 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Sat, 16 Aug 2025 15:42:23 +0530 Subject: [PATCH 20/31] update --- docs/source/en/quantization/modelopt.md | 134 ++++++++++++++++++ setup.py | 4 +- .../quantizers/modelopt/modelopt_quantizer.py | 5 - .../quantizers/quantization_config.py | 4 +- 4 files changed, 138 insertions(+), 9 deletions(-) create mode 100644 docs/source/en/quantization/modelopt.md diff --git a/docs/source/en/quantization/modelopt.md b/docs/source/en/quantization/modelopt.md new file mode 100644 index 000000000000..c3c293b17bfa --- /dev/null +++ b/docs/source/en/quantization/modelopt.md @@ -0,0 +1,134 @@ + + +# Nvidia ModelOpt + +[nvidia_modelopt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed. + +Before you begin, make sure you have nvidia_modelopt installed. + +```bash +pip install -U "nvidia_modelopt[hf]" +``` + + +Quantize a model by passing [`NVIDIAModelOptConfig`] to [`~ModelMixin.from_pretrained`] (you can also load pre-quantized models). This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers. + +The example below only quantizes the weights to FP8. + +```python +import torch +from diffusers import AutoModel, SanaPipeline, NVIDIAModelOptConfig + +model_id = "Efficient-Large-Model/Sana_600M_1024px_diffusers" +dtype = torch.bfloat16 + +quantization_config = NVIDIAModelOptConfig(quant_type="FP8", quant_method="modelopt") +transformer = AutoModel.from_pretrained( + model_id, + subfolder="transformer", + quantization_config=quantization_config, + torch_dtype=dtype, +) +pipe = SanaPipeline.from_pretrained( + model_id, + transformer=transformer, + torch_dtype=dtype, +) +pipe.to("cuda") + +print(f"Pipeline memory usage: {torch.cuda.max_memory_reserved() / 1024**3:.3f} GB") + +prompt = "A cat holding a sign that says hello world" +image = pipe( + prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512 +).images[0] +image.save("output.png") +``` + +## NVIDIAModelOptConfig + +The `NVIDIAModelOptConfig` class accepts three parameters: +- `quant_type`: A string value mentioning one of the quantization types below. +- `modules_to_not_convert`: A list of module full/partial module names for which quantization should not be performed. For example, to not perform any quantization of the [`SanaTransformer2DModel`]'s conv blocks, one would specify: `modules_to_not_convert=["conv"]`. +- `kwargs`: A dict of keyword arguments to pass to the underlying quantization method which will be invoked based on `quant_type`. + +## Supported quantization types + +ModelOpt supports weight-only, channel and block quantization int8, fp8, int4, nf4, and nvfp4. The quantization methods are designed to reduce the memory footprint of the model weights while maintaining the performance of the model during inference. 
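As a quick illustration of how the channel/block kwargs in the scheme matrix below compose, here is a minimal sketch based on the INT4 test configuration added later in this series; the block size, skipped modules, and model id are taken from that test and are illustrative rather than required.

```python
# Sketch only: INT4 weights quantized in 128-element blocks along axis -1,
# with convolution and patch-embed modules left unquantized (block/channel
# schemes tend not to suit conv layers). Values mirror the INT4 test case.
import torch

from diffusers import AutoModel, NVIDIAModelOptConfig

quantization_config = NVIDIAModelOptConfig(
    quant_type="INT4",
    channel_quantize=-1,   # only -1 is supported for now
    block_quantize=128,    # block size within each channel
    modules_to_not_convert=["conv", "patch_embed"],
)
transformer = AutoModel.from_pretrained(
    "Efficient-Large-Model/Sana_600M_1024px_diffusers",
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)
```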
+ +Weight-only quantization stores the model weights in a specific low-bit data type but performs computation with a higher-precision data type, like `bfloat16`. This lowers the memory requirements from model weights but retains the memory peaks for activation computation. + +The quantization methods supported are as follows: + +| **Quantization Type** | **Supported Schemes** | **Required Kwargs** | **Additional Notes** | +|-----------------------|-----------------------|---------------------|----------------------| +| **INT8** | `int8 weight only`, `int8 channel quantization`, `int8 block quantization` | `quant_type`, `quant_type + channel_quantize`, `quant_type + channel_quantize + block_quantize` | +| **FP8** | `fp8 weight only`, `fp8 channel quantization`, `fp8 block quantization` | `quant_type`, `quant_type + channel_quantize`, `quant_type + channel_quantize + block_quantize` | +| **INT4** | `int4 weight only`, `int4 block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize` | `channel_quantize = -1 is only supported for now`| +| **NF4** | `nf4 weight only`, `nf4 double block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize + scale_channel_quantize` + `scale_block_quantize` | `channel_quantize = -1 and scale_channel_quantize = -1 are only supported for now` | +| **NVFP4** | `nvfp4 weight only`, `nvfp4 block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize` | `channel_quantize = -1 is only supported for now`| + +Note - Channel and Block quantization generally don't work well with convolutional layers. Please use the `modules_to_not_convert` argument to skip quantization for convolutional layers. + +Refer to the [official modelopt documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/) for a better understanding of the available quantization methods and the exhaustive list of configuration options available. + +## Serializing and Deserializing quantized models + +To serialize a quantized model in a given dtype, first load the model with the desired quantization dtype and then save it using the [`~ModelMixin.save_pretrained`] method. + +```python +import torch +from diffusers import AutoModel, NVIDIAModelOptConfig +from modelopt.torch.opt import enable_huggingface_checkpointing + +enable_huggingface_checkpointing() + +model_id = "Efficient-Large-Model/Sana_600M_1024px_diffusers" +quant_config_fp8 = {"quant_type": "FP8", "quant_method": "modelopt"} +quant_config_fp8 = NVIDIAModelOptConfig(**quant_config_fp8) +model = AutoModel.from_pretrained( + model_id, + subfolder="transformer", + quantization_config=quant_config_fp8, + torch_dtype=torch.bfloat16, +) +model.save_pretrained('path/to/sana_fp8', safe_serialization=False) +``` + +To load a serialized quantized model, use the [`~ModelMixin.from_pretrained`] method. 
+ +```python +import torch +from diffusers import AutoModel, NVIDIAModelOptConfig, SanaPipeline +from modelopt.torch.opt import enable_huggingface_checkpointing + +enable_huggingface_checkpointing() + +quantization_config = NVIDIAModelOptConfig(quant_type="FP8", quant_method="modelopt") +transformer = AutoModel.from_pretrained( + "path/to/sana_fp8", + subfolder="transformer", + quantization_config=quantization_config, + torch_dtype=torch.bfloat16, +) +pipe = SanaPipeline.from_pretrained( + "Efficient-Large-Model/Sana_600M_1024px_diffusers", + transformer=transformer, + torch_dtype=torch.bfloat16, +) +pipe.to("cuda") +prompt = "A cat holding a sign that says hello world" +image = pipe( + prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512 +).images[0] +image.save("output.png") +``` diff --git a/setup.py b/setup.py index ae256da6d67c..7d134982d208 100644 --- a/setup.py +++ b/setup.py @@ -132,7 +132,7 @@ "gguf>=0.10.0", "torchao>=0.7.0", "bitsandbytes>=0.43.3", - "nvidia_modelopt[torch, hf]>=0.27.0", + "nvidia_modelopt[hf]>=0.27.0", "regex!=2019.12.17", "requests", "tensorboard", @@ -245,7 +245,7 @@ def run(self): extras["gguf"] = deps_list("gguf", "accelerate") extras["optimum_quanto"] = deps_list("optimum_quanto", "accelerate") extras["torchao"] = deps_list("torchao", "accelerate") -extras["nvidia_modelopt"] = deps_list("nvidia_modelopt[torch, hf]") +extras["nvidia_modelopt"] = deps_list("nvidia_modelopt[hf]") if os.name == "nt": # windows extras["flax"] = [] # jax is not supported on windows diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index d5b42ddb10a0..b8058e6f0ad3 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -42,11 +42,6 @@ def validate_environment(self, *args, **kwargs): raise ImportError( "Loading an nvidia-modelopt quantized model requires nvidia-modelopt library (`pip install nvidia-modelopt`)" ) - if not is_nvidia_modelopt_version(">=", "0.25.0"): - raise ImportError( - "Loading an nvidia-modelopt quantized model requires `nvidia-modelopt>=0.25.0`. 
" - "Please upgrade your installation with `pip install --upgrade nvidia-modelopt" - ) self.offload = False diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index c590577d2941..c82a91575776 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -872,14 +872,14 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: continue quant_cfg[k]["num_bits"] = NVIDIAModelOptConfig.quanttype_to_numbits[w_type] - if self.block_quantize and self.channel_quantize: + if self.block_quantize is not None and self.channel_quantize is not None: quant_cfg["*weight_quantizer"]["block_sizes"] = { self.channel_quantize: self.block_quantize } quant_cfg["*input_quantizer"]["block_sizes"] = { self.channel_quantize: self.block_quantize, "type": "dynamic" } - elif self.channel_quantize: + elif self.channel_quantize is not None: quant_cfg["*weight_quantizer"]["axis"] = self.channel_quantize quant_cfg["*input_quantizer"]["axis"] = self.channel_quantize quant_cfg["*input_quantizer"]["type"] = "dynamic" From 670202dbd3816acc43b8acd7816850f4eafa2574 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Sat, 16 Aug 2025 16:01:16 +0530 Subject: [PATCH 21/31] update --- tests/quantization/modelopt/__init__.py | 0 tests/quantization/modelopt/test_modelopt.py | 323 +++++++++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 tests/quantization/modelopt/__init__.py create mode 100644 tests/quantization/modelopt/test_modelopt.py diff --git a/tests/quantization/modelopt/__init__.py b/tests/quantization/modelopt/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/quantization/modelopt/test_modelopt.py b/tests/quantization/modelopt/test_modelopt.py new file mode 100644 index 000000000000..c37b89d29085 --- /dev/null +++ b/tests/quantization/modelopt/test_modelopt.py @@ -0,0 +1,323 @@ +import gc +import tempfile +import unittest + +from diffusers import SanaPipeline, SanaTransformer2DModel, NVIDIAModelOptConfig +from diffusers.utils import is_nvidia_modelopt_available, is_torch_available +from diffusers.utils.testing_utils import ( + backend_empty_cache, + backend_reset_peak_memory_stats, + enable_full_determinism, + nightly, + numpy_cosine_similarity_distance, + require_accelerate, + require_big_accelerator, + require_torch_cuda_compatibility, + torch_device, +) + +if is_nvidia_modelopt_available(): + import modelopt.torch.quantization as mtq + +if is_torch_available(): + import torch + from ..utils import LoRALayer, get_memory_consumption_stat + +enable_full_determinism() + + +@nightly +@require_big_accelerator +@require_accelerate +class ModelOptBaseTesterMixin: + model_id = "Efficient-Large-Model/Sana_600M_1024px_diffusers" + model_cls = SanaTransformer2DModel + pipeline_cls = SanaPipeline + torch_dtype = torch.bfloat16 + expected_memory_reduction = 0.0 + keep_in_fp32_module = "" + modules_to_not_convert = "" + _test_torch_compile = False + + def setUp(self): + backend_reset_peak_memory_stats(torch_device) + backend_empty_cache(torch_device) + gc.collect() + + def tearDown(self): + backend_reset_peak_memory_stats(torch_device) + backend_empty_cache(torch_device) + gc.collect() + + def get_dummy_init_kwargs(self): + return {"quant_type": "FP8"} + + def get_dummy_model_init_kwargs(self): + return { + "pretrained_model_name_or_path": self.model_id, + "torch_dtype": self.torch_dtype, + "quantization_config": NVIDIAModelOptConfig(**self.get_dummy_init_kwargs()), + 
"subfolder": "transformer", + } + + def test_modelopt_layers(self): + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + assert mtq.utils.is_quantized(module) + + def test_modelopt_memory_usage(self): + inputs = self.get_dummy_inputs() + inputs = { + k: v.to(device=torch_device, dtype=torch.bfloat16) + for k, v in inputs.items() + if not isinstance(v, bool) + } + + unquantized_model = self.model_cls.from_pretrained( + self.model_id, torch_dtype=self.torch_dtype, subfolder="transformer" + ) + unquantized_model.to(torch_device) + unquantized_model_memory = get_memory_consumption_stat( + unquantized_model, inputs + ) + + quantized_model = self.model_cls.from_pretrained( + **self.get_dummy_model_init_kwargs() + ) + quantized_model.to(torch_device) + quantized_model_memory = get_memory_consumption_stat(quantized_model, inputs) + + assert ( + unquantized_model_memory / quantized_model_memory + >= self.expected_memory_reduction + ) + + def test_keep_modules_in_fp32(self): + _keep_in_fp32_modules = self.model_cls._keep_in_fp32_modules + self.model_cls._keep_in_fp32_modules = self.keep_in_fp32_module + + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + model.to(torch_device) + + for name, module in model.named_modules(): + if isinstance(module, torch.nn.Linear): + if name in model._keep_in_fp32_modules: + assert module.weight.dtype == torch.float32 + self.model_cls._keep_in_fp32_modules = _keep_in_fp32_modules + + def test_modules_to_not_convert(self): + init_kwargs = self.get_dummy_model_init_kwargs() + quantization_config_kwargs = self.get_dummy_init_kwargs() + quantization_config_kwargs.update( + {"modules_to_not_convert": self.modules_to_not_convert} + ) + quantization_config = NVIDIAModelOptConfig(**quantization_config_kwargs) + init_kwargs.update({"quantization_config": quantization_config}) + + model = self.model_cls.from_pretrained(**init_kwargs) + model.to(torch_device) + + for name, module in model.named_modules(): + if name in self.modules_to_not_convert: + assert not mtq.utils.is_quantized(module) + + def test_dtype_assignment(self): + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + + with self.assertRaises(ValueError): + model.to(torch.float16) + + with self.assertRaises(ValueError): + device_0 = f"{torch_device}:0" + model.to(device=device_0, dtype=torch.float16) + + with self.assertRaises(ValueError): + model.float() + + with self.assertRaises(ValueError): + model.half() + + model.to(torch_device) + + def test_serialization(self): + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + inputs = self.get_dummy_inputs() + + model.to(torch_device) + with torch.no_grad(): + model_output = model(**inputs) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + saved_model = self.model_cls.from_pretrained( + tmp_dir, + torch_dtype=torch.bfloat16, + ) + + saved_model.to(torch_device) + with torch.no_grad(): + saved_model_output = saved_model(**inputs) + + assert torch.allclose( + model_output.sample, saved_model_output.sample, rtol=1e-5, atol=1e-5 + ) + + def test_torch_compile(self): + if not self._test_torch_compile: + return + + model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) + compiled_model = torch.compile( + model, mode="max-autotune", fullgraph=True, dynamic=False + ) + + model.to(torch_device) + with torch.no_grad(): + 
model_output = model(**self.get_dummy_inputs()).sample + + compiled_model.to(torch_device) + with torch.no_grad(): + compiled_model_output = compiled_model(**self.get_dummy_inputs()).sample + + model_output = model_output.detach().float().cpu().numpy() + compiled_model_output = compiled_model_output.detach().float().cpu().numpy() + + max_diff = numpy_cosine_similarity_distance( + model_output.flatten(), compiled_model_output.flatten() + ) + assert max_diff < 1e-3 + + def test_device_map_error(self): + with self.assertRaises(ValueError): + _ = self.model_cls.from_pretrained( + **self.get_dummy_model_init_kwargs(), + device_map={0: "8GB", "cpu": "16GB"}, + ) + + def get_dummy_inputs(self): + batch_size = 1 + seq_len = 16 + height = width = 32 + num_latent_channels = 4 + caption_channels = 8 + + torch.manual_seed(0) + hidden_states = torch.randn( + (batch_size, num_latent_channels, height, width) + ).to(torch_device, dtype=torch.bfloat16) + encoder_hidden_states = torch.randn((batch_size, seq_len, caption_channels)).to( + torch_device, dtype=torch.bfloat16 + ) + timestep = ( + torch.tensor([1.0]) + .to(torch_device, dtype=torch.bfloat16) + .expand(batch_size) + ) + + return { + "hidden_states": hidden_states, + "encoder_hidden_states": encoder_hidden_states, + "timestep": timestep, + } + + def test_model_cpu_offload(self): + init_kwargs = self.get_dummy_init_kwargs() + transformer = self.model_cls.from_pretrained( + self.model_id, + quantization_config=NVIDIAModelOptConfig(**init_kwargs), + subfolder="transformer", + torch_dtype=torch.bfloat16, + ) + pipe = self.pipeline_cls.from_pretrained( + self.model_id, transformer=transformer, torch_dtype=torch.bfloat16 + ) + pipe.enable_model_cpu_offload(device=torch_device) + _ = pipe("a cat holding a sign that says hello", num_inference_steps=2) + + def test_training(self): + quantization_config = NVIDIAModelOptConfig(**self.get_dummy_init_kwargs()) + quantized_model = self.model_cls.from_pretrained( + self.model_id, + subfolder="transformer", + quantization_config=quantization_config, + torch_dtype=torch.bfloat16, + ).to(torch_device) + + for param in quantized_model.parameters(): + param.requires_grad = False + if param.ndim == 1: + param.data = param.data.to(torch.float32) + + for _, module in quantized_model.named_modules(): + if hasattr(module, "to_q"): + module.to_q = LoRALayer(module.to_q, rank=4) + if hasattr(module, "to_k"): + module.to_k = LoRALayer(module.to_k, rank=4) + if hasattr(module, "to_v"): + module.to_v = LoRALayer(module.to_v, rank=4) + + with torch.amp.autocast(str(torch_device), dtype=torch.bfloat16): + inputs = self.get_dummy_inputs() + output = quantized_model(**inputs)[0] + output.norm().backward() + + for module in quantized_model.modules(): + if isinstance(module, LoRALayer): + self.assertTrue(module.adapter[1].weight.grad is not None) + + +class SanaTransformerFP8WeightsTest(ModelOptBaseTesterMixin, unittest.TestCase): + expected_memory_reduction = 0.6 + + def get_dummy_init_kwargs(self): + return {"quant_type": "FP8"} + + +class SanaTransformerINT8WeightsTest(ModelOptBaseTesterMixin, unittest.TestCase): + expected_memory_reduction = 0.6 + _test_torch_compile = True + + def get_dummy_init_kwargs(self): + return {"quant_type": "INT8"} + + +@require_torch_cuda_compatibility(8.0) +class SanaTransformerINT4WeightsTest(ModelOptBaseTesterMixin, unittest.TestCase): + expected_memory_reduction = 0.55 + + def get_dummy_init_kwargs(self): + return { + "quant_type": "INT4", + "block_quantize": 128, + "channel_quantize": -1, + 
"modules_to_not_convert": ["conv", "patch_embed"], + } + + +@require_torch_cuda_compatibility(8.0) +class SanaTransformerNF4WeightsTest(ModelOptBaseTesterMixin, unittest.TestCase): + expected_memory_reduction = 0.65 + + def get_dummy_init_kwargs(self): + return { + "quant_type": "NF4", + "block_quantize": 128, + "channel_quantize": -1, + "scale_block_quantize": 8, + "scale_channel_quantize": -1, + "modules_to_not_convert": ["conv"], + } + + +@require_torch_cuda_compatibility(8.0) +class SanaTransformerNVFP4WeightsTest(ModelOptBaseTesterMixin, unittest.TestCase): + expected_memory_reduction = 0.65 + + def get_dummy_init_kwargs(self): + return { + "quant_type": "NVFP4", + "block_quantize": 128, + "channel_quantize": -1, + "modules_to_not_convert": ["conv"], + } From 395e75b2c713a6cd15ece3dc95d48c0f81179078 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Fri, 22 Aug 2025 09:38:28 +0530 Subject: [PATCH 22/31] addressed PR comments --- docs/source/en/quantization/modelopt.md | 3 +- .../quantizers/modelopt/modelopt_quantizer.py | 2 +- .../quantizers/quantization_config.py | 28 +++++++++++-------- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/docs/source/en/quantization/modelopt.md b/docs/source/en/quantization/modelopt.md index c3c293b17bfa..f76d52096e1e 100644 --- a/docs/source/en/quantization/modelopt.md +++ b/docs/source/en/quantization/modelopt.md @@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> -# Nvidia ModelOpt +# NVIDIA ModelOpt [nvidia_modelopt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed. @@ -19,7 +19,6 @@ Before you begin, make sure you have nvidia_modelopt installed. pip install -U "nvidia_modelopt[hf]" ``` - Quantize a model by passing [`NVIDIAModelOptConfig`] to [`~ModelMixin.from_pretrained`] (you can also load pre-quantized models). This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers. The example below only quantizes the weights to FP8. 
diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index b8058e6f0ad3..17023dd72bd0 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -32,7 +32,7 @@ class NVIDIAModelOptQuantizer(DiffusersQuantizer): use_keep_in_fp32_modules = True requires_calibration = False - required_packages = ["modelopt"] + required_packages = ["nvidia_modelopt"] def __init__(self, quantization_config, **kwargs): super().__init__(quantization_config, **kwargs) diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index c82a91575776..1de2adfada02 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -767,9 +767,13 @@ class NVIDIAModelOptConfig(QuantizationConfigMixin): "FP8": (4, 3), "INT8": 8, "INT4": 4, - # "NF4": 4, # TODO: enable this upon modelopt release https://github.com/NVIDIA/TensorRT-Model-Optimizer/issues/183 + "NF4": 4, "NVFP4": (2,1), } + quanttype_to_scalingbits = { + "NF4": 8, + "NVFP4": (4, 3), + } def __init__( self, @@ -884,15 +888,17 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: quant_cfg["*input_quantizer"]["axis"] = self.channel_quantize quant_cfg["*input_quantizer"]["type"] = "dynamic" - # Only fixed sizes are supported for now in modelopt - if "NF4" in w_type: - quant_cfg["*weight_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.scale_channel_quantize: self.scale_block_quantize}}) - elif "NVFP4" in w_type: - quant_cfg["*weight_quantizer"]["block_sizes"].update({"scale_bits":(4,3), "type": "dynamic"}) - if act_type: - if "NF4" in act_type: - quant_cfg["*input_quantizer"]["block_sizes"].update({"scale_bits":8, "scale_block_sizes": {self.scale_channel_quantize: self.scale_block_quantize}}) - elif "NVFP4" in act_type: - quant_cfg["*input_quantizer"]["block_sizes"].update({"scale_bits":(4,3), "type": "dynamic"}) + # Only fixed scaling sizes are supported for now in modelopt + if self.scale_channel_quantize is not None and self.scale_block_quantize is not None: + if w_type in NVIDIAModelOptConfig.quanttype_to_scalingbits: + quant_cfg["*weight_quantizer"]["block_sizes"].update({ + "scale_bits": NVIDIAModelOptConfig.quanttype_to_scalingbits[w_type], + "scale_block_sizes": {self.scale_channel_quantize: self.scale_block_quantize} + }) + if act_type and act_type in NVIDIAModelOptConfig.quanttype_to_scalingbits: + quant_cfg["*input_quantizer"]["block_sizes"].update({ + "scale_bits": NVIDIAModelOptConfig.quanttype_to_scalingbits[act_type], + "scale_block_sizes": {self.scale_channel_quantize: self.scale_block_quantize} + }) return BASE_CONFIG From bbbc840c4b1fcafd8d1536f8a9ed63240d7c39ea Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Fri, 22 Aug 2025 09:41:36 +0530 Subject: [PATCH 23/31] updates --- tests/quantization/modelopt/test_modelopt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/quantization/modelopt/test_modelopt.py b/tests/quantization/modelopt/test_modelopt.py index c37b89d29085..9f8312438ef6 100644 --- a/tests/quantization/modelopt/test_modelopt.py +++ b/tests/quantization/modelopt/test_modelopt.py @@ -319,5 +319,7 @@ def get_dummy_init_kwargs(self): "quant_type": "NVFP4", "block_quantize": 128, "channel_quantize": -1, + "scale_block_quantize": 8, + "scale_channel_quantize": -1, "modules_to_not_convert": ["conv"], } From c53d25190ea7e58c4c11dd2c91802703390221af Mon 
Sep 17 00:00:00 2001 From: ishan-modi Date: Fri, 22 Aug 2025 09:46:08 +0530 Subject: [PATCH 24/31] code formatting --- src/diffusers/dependency_versions_table.py | 2 +- .../quantizers/modelopt/modelopt_quantizer.py | 15 ++--- .../quantizers/quantization_config.py | 55 +++++++++---------- src/diffusers/utils/import_utils.py | 3 +- tests/quantization/modelopt/test_modelopt.py | 53 ++++++------------ 5 files changed, 52 insertions(+), 76 deletions(-) diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 3bcaf5722682..5bf0d2a85a85 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -39,7 +39,7 @@ "gguf": "gguf>=0.10.0", "torchao": "torchao>=0.7.0", "bitsandbytes": "bitsandbytes>=0.43.3", - "nvidia_modelopt[torch, hf]": "nvidia_modelopt[torch, hf]>=0.27.0", + "nvidia_modelopt[hf]": "nvidia_modelopt[hf]>=0.27.0", "regex": "regex!=2019.12.17", "requests": "requests", "tensorboard": "tensorboard", diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index 17023dd72bd0..ae793a131a07 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -4,7 +4,6 @@ get_module_from_name, is_accelerate_available, is_nvidia_modelopt_available, - is_nvidia_modelopt_version, is_torch_available, logging, ) @@ -94,8 +93,10 @@ def create_quantized_param( if self.pre_quantized: module._parameters[tensor_name] = torch.nn.Parameter(param_value.to(device=target_device)) else: - set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) - mtq.calibrate(module, self.quantization_config.modelopt_config["algorithm"], self.quantization_config.forward_loop) + set_module_tensor_to_device(model, param_name, target_device, param_value, dtype) + mtq.calibrate( + module, self.quantization_config.modelopt_config["algorithm"], self.quantization_config.forward_loop + ) mtq.compress(module) module.weight.requires_grad = False @@ -123,7 +124,7 @@ def _process_model_before_weight_loading( ): # ModelOpt imports diffusers internally. This is here to prevent circular imports import modelopt.torch.opt as mto - + if self.pre_quantized: return @@ -144,14 +145,14 @@ def _process_model_before_weight_loading( def _process_model_after_weight_loading(self, model, **kwargs): # ModelOpt imports diffusers internally. This is here to prevent circular imports from modelopt.torch.opt import ModeloptStateManager - + if self.pre_quantized: return model - + for _, m in model.named_modules(): if hasattr(m, ModeloptStateManager._state_key) and m is not model: ModeloptStateManager.remove_state(m) - + return model @property diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 1de2adfada02..361d787b040d 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -734,9 +734,8 @@ class NVIDIAModelOptConfig(QuantizationConfigMixin): Args: quant_type (`str`): The type of quantization we want to use, following is how to use: - **weightquant_activationquant ==> FP8_FP8** - In the above example we have use FP8 for both weight and activation quantization. - Following are the all the options: + **weightquant_activationquant ==> FP8_FP8** In the above example we have use FP8 for both weight and + activation quantization. 
Following are the all the options: - FP8 - INT8 - INT4 @@ -763,12 +762,13 @@ class NVIDIAModelOptConfig(QuantizationConfigMixin): kwargs (`Dict[str, Any]`, *optional*): Additional parameters which are to be used for calibration. """ + quanttype_to_numbits = { "FP8": (4, 3), "INT8": 8, "INT4": 4, "NF4": 4, - "NVFP4": (2,1), + "NVFP4": (2, 1), } quanttype_to_scalingbits = { "NF4": 8, @@ -779,7 +779,7 @@ def __init__( self, quant_type: str, modules_to_not_convert: Optional[List[str]] = None, - weight_only: bool=True, + weight_only: bool = True, channel_quantize: Optional[int] = None, block_quantize: Optional[int] = None, scale_channel_quantize: Optional[int] = None, @@ -787,7 +787,7 @@ def __init__( algorithm: str = "max", forward_loop: Optional[Callable] = None, modelopt_config: Optional[dict] = None, - **kwargs + **kwargs, ) -> None: self.quant_method = QuantizationMethod.MODELOPT self._normalize_quant_type(quant_type) @@ -808,8 +808,8 @@ def _normalize_quant_type(self, quant_type: str) -> str: """ Validates and normalizes the quantization type string. - Splits the quant_type into weight and activation components, verifies them - against supported types, and replaces unsupported values with safe defaults. + Splits the quant_type into weight and activation components, verifies them against supported types, and + replaces unsupported values with safe defaults. Args: quant_type (str): The input quantization type string (e.g., 'FP8_INT8'). @@ -821,21 +821,15 @@ def _normalize_quant_type(self, quant_type: str) -> str: w_type = parts[0] act_type = parts[1] if len(parts) > 1 else None if len(parts) > 2: - logger.warning( - f"Quantization type {quant_type} is not supported. Picking FP8_INT8 as default" - ) + logger.warning(f"Quantization type {quant_type} is not supported. Picking FP8_INT8 as default") w_type = "FP8" act_type = None else: if w_type not in NVIDIAModelOptConfig.quanttype_to_numbits: - logger.warning( - f"Weight Quantization type {w_type} is not supported. Picking FP8 as default" - ) + logger.warning(f"Weight Quantization type {w_type} is not supported. Picking FP8 as default") w_type = "FP8" if act_type is not None and act_type not in NVIDIAModelOptConfig.quanttype_to_numbits: - logger.warning( - f"Activation Quantization type {act_type} is not supported. Picking INT8 as default" - ) + logger.warning(f"Activation Quantization type {act_type} is not supported. 
Picking INT8 as default") act_type = None self.quant_type = w_type + ("_" + act_type if act_type is not None else "") @@ -877,11 +871,10 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: quant_cfg[k]["num_bits"] = NVIDIAModelOptConfig.quanttype_to_numbits[w_type] if self.block_quantize is not None and self.channel_quantize is not None: - quant_cfg["*weight_quantizer"]["block_sizes"] = { - self.channel_quantize: self.block_quantize - } + quant_cfg["*weight_quantizer"]["block_sizes"] = {self.channel_quantize: self.block_quantize} quant_cfg["*input_quantizer"]["block_sizes"] = { - self.channel_quantize: self.block_quantize, "type": "dynamic" + self.channel_quantize: self.block_quantize, + "type": "dynamic", } elif self.channel_quantize is not None: quant_cfg["*weight_quantizer"]["axis"] = self.channel_quantize @@ -891,14 +884,18 @@ def get_config_from_quant_type(self) -> Dict[str, Any]: # Only fixed scaling sizes are supported for now in modelopt if self.scale_channel_quantize is not None and self.scale_block_quantize is not None: if w_type in NVIDIAModelOptConfig.quanttype_to_scalingbits: - quant_cfg["*weight_quantizer"]["block_sizes"].update({ - "scale_bits": NVIDIAModelOptConfig.quanttype_to_scalingbits[w_type], - "scale_block_sizes": {self.scale_channel_quantize: self.scale_block_quantize} - }) + quant_cfg["*weight_quantizer"]["block_sizes"].update( + { + "scale_bits": NVIDIAModelOptConfig.quanttype_to_scalingbits[w_type], + "scale_block_sizes": {self.scale_channel_quantize: self.scale_block_quantize}, + } + ) if act_type and act_type in NVIDIAModelOptConfig.quanttype_to_scalingbits: - quant_cfg["*input_quantizer"]["block_sizes"].update({ - "scale_bits": NVIDIAModelOptConfig.quanttype_to_scalingbits[act_type], - "scale_block_sizes": {self.scale_channel_quantize: self.scale_block_quantize} - }) + quant_cfg["*input_quantizer"]["block_sizes"].update( + { + "scale_bits": NVIDIAModelOptConfig.quanttype_to_scalingbits[act_type], + "scale_block_sizes": {self.scale_channel_quantize: self.scale_block_quantize}, + } + ) return BASE_CONFIG diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index f26651867b25..1e132c76c1c8 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -841,11 +841,10 @@ def is_optimum_quanto_version(operation: str, version: str): return compare_versions(parse(_optimum_quanto_version), operation, version) - def is_nvidia_modelopt_version(operation: str, version: str): """ Compares the current Nvidia ModelOpt version to a given reference with an operation. 
- + Args: operation (`str`): A string representation of an operator, such as `">"` or `"<="` diff --git a/tests/quantization/modelopt/test_modelopt.py b/tests/quantization/modelopt/test_modelopt.py index 9f8312438ef6..bf21071f888d 100644 --- a/tests/quantization/modelopt/test_modelopt.py +++ b/tests/quantization/modelopt/test_modelopt.py @@ -2,7 +2,7 @@ import tempfile import unittest -from diffusers import SanaPipeline, SanaTransformer2DModel, NVIDIAModelOptConfig +from diffusers import NVIDIAModelOptConfig, SanaPipeline, SanaTransformer2DModel from diffusers.utils import is_nvidia_modelopt_available, is_torch_available from diffusers.utils.testing_utils import ( backend_empty_cache, @@ -16,11 +16,13 @@ torch_device, ) + if is_nvidia_modelopt_available(): import modelopt.torch.quantization as mtq if is_torch_available(): import torch + from ..utils import LoRALayer, get_memory_consumption_stat enable_full_determinism() @@ -69,29 +71,20 @@ def test_modelopt_layers(self): def test_modelopt_memory_usage(self): inputs = self.get_dummy_inputs() inputs = { - k: v.to(device=torch_device, dtype=torch.bfloat16) - for k, v in inputs.items() - if not isinstance(v, bool) + k: v.to(device=torch_device, dtype=torch.bfloat16) for k, v in inputs.items() if not isinstance(v, bool) } unquantized_model = self.model_cls.from_pretrained( self.model_id, torch_dtype=self.torch_dtype, subfolder="transformer" ) unquantized_model.to(torch_device) - unquantized_model_memory = get_memory_consumption_stat( - unquantized_model, inputs - ) + unquantized_model_memory = get_memory_consumption_stat(unquantized_model, inputs) - quantized_model = self.model_cls.from_pretrained( - **self.get_dummy_model_init_kwargs() - ) + quantized_model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) quantized_model.to(torch_device) quantized_model_memory = get_memory_consumption_stat(quantized_model, inputs) - assert ( - unquantized_model_memory / quantized_model_memory - >= self.expected_memory_reduction - ) + assert unquantized_model_memory / quantized_model_memory >= self.expected_memory_reduction def test_keep_modules_in_fp32(self): _keep_in_fp32_modules = self.model_cls._keep_in_fp32_modules @@ -109,9 +102,7 @@ def test_keep_modules_in_fp32(self): def test_modules_to_not_convert(self): init_kwargs = self.get_dummy_model_init_kwargs() quantization_config_kwargs = self.get_dummy_init_kwargs() - quantization_config_kwargs.update( - {"modules_to_not_convert": self.modules_to_not_convert} - ) + quantization_config_kwargs.update({"modules_to_not_convert": self.modules_to_not_convert}) quantization_config = NVIDIAModelOptConfig(**quantization_config_kwargs) init_kwargs.update({"quantization_config": quantization_config}) @@ -159,18 +150,14 @@ def test_serialization(self): with torch.no_grad(): saved_model_output = saved_model(**inputs) - assert torch.allclose( - model_output.sample, saved_model_output.sample, rtol=1e-5, atol=1e-5 - ) + assert torch.allclose(model_output.sample, saved_model_output.sample, rtol=1e-5, atol=1e-5) def test_torch_compile(self): if not self._test_torch_compile: return model = self.model_cls.from_pretrained(**self.get_dummy_model_init_kwargs()) - compiled_model = torch.compile( - model, mode="max-autotune", fullgraph=True, dynamic=False - ) + compiled_model = torch.compile(model, mode="max-autotune", fullgraph=True, dynamic=False) model.to(torch_device) with torch.no_grad(): @@ -183,9 +170,7 @@ def test_torch_compile(self): model_output = model_output.detach().float().cpu().numpy() 
compiled_model_output = compiled_model_output.detach().float().cpu().numpy() - max_diff = numpy_cosine_similarity_distance( - model_output.flatten(), compiled_model_output.flatten() - ) + max_diff = numpy_cosine_similarity_distance(model_output.flatten(), compiled_model_output.flatten()) assert max_diff < 1e-3 def test_device_map_error(self): @@ -203,17 +188,13 @@ def get_dummy_inputs(self): caption_channels = 8 torch.manual_seed(0) - hidden_states = torch.randn( - (batch_size, num_latent_channels, height, width) - ).to(torch_device, dtype=torch.bfloat16) - encoder_hidden_states = torch.randn((batch_size, seq_len, caption_channels)).to( + hidden_states = torch.randn((batch_size, num_latent_channels, height, width)).to( torch_device, dtype=torch.bfloat16 ) - timestep = ( - torch.tensor([1.0]) - .to(torch_device, dtype=torch.bfloat16) - .expand(batch_size) + encoder_hidden_states = torch.randn((batch_size, seq_len, caption_channels)).to( + torch_device, dtype=torch.bfloat16 ) + timestep = torch.tensor([1.0]).to(torch_device, dtype=torch.bfloat16).expand(batch_size) return { "hidden_states": hidden_states, @@ -229,9 +210,7 @@ def test_model_cpu_offload(self): subfolder="transformer", torch_dtype=torch.bfloat16, ) - pipe = self.pipeline_cls.from_pretrained( - self.model_id, transformer=transformer, torch_dtype=torch.bfloat16 - ) + pipe = self.pipeline_cls.from_pretrained(self.model_id, transformer=transformer, torch_dtype=torch.bfloat16) pipe.enable_model_cpu_offload(device=torch_device) _ = pipe("a cat holding a sign that says hello", num_inference_steps=2) From 1ddcc9c5daa6616eb42e36953a9324638891bd76 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Fri, 22 Aug 2025 14:51:52 +0530 Subject: [PATCH 25/31] update --- docs/source/en/quantization/modelopt.md | 6 ++++-- .../quantizers/modelopt/modelopt_quantizer.py | 21 +++++++++++++++++++ .../quantizers/quantization_config.py | 4 ++++ tests/quantization/modelopt/test_modelopt.py | 10 ++++----- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/docs/source/en/quantization/modelopt.md b/docs/source/en/quantization/modelopt.md index f76d52096e1e..a38b3e7ef355 100644 --- a/docs/source/en/quantization/modelopt.md +++ b/docs/source/en/quantization/modelopt.md @@ -57,7 +57,10 @@ image.save("output.png") The `NVIDIAModelOptConfig` class accepts three parameters: - `quant_type`: A string value mentioning one of the quantization types below. -- `modules_to_not_convert`: A list of module full/partial module names for which quantization should not be performed. For example, to not perform any quantization of the [`SanaTransformer2DModel`]'s conv blocks, one would specify: `modules_to_not_convert=["conv"]`. +- `modules_to_not_convert`: A list of module full/partial module names for which quantization should not be performed. For example, to not perform any quantization of the [`SD3Transformer2DModel`]'s pos_embed projection blocks, one would specify: `modules_to_not_convert=["pos_embed.proj.weight"]`. +- `disable_conv_quantization`: A boolean value which when set to `True` disables quantization for all convolutional layers in the model. This is useful as channel and block quantization generally don't work well with convolutional layers (used with INT4, NF4, NVFP4). If you want to disable quantization for specific convolutional layers, use `modules_to_not_convert` instead. +- `algorithm`: The algorithm to use for determining scale, defaults to `"max"`. You can check modelopt documentation for more algorithms and details. 
+- `forward_loop`: The forward loop function to use for calibrating activation during quantization. If not provided, it relies on static scale values computed using the weights only. - `kwargs`: A dict of keyword arguments to pass to the underlying quantization method which will be invoked based on `quant_type`. ## Supported quantization types @@ -76,7 +79,6 @@ The quantization methods supported are as follows: | **NF4** | `nf4 weight only`, `nf4 double block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize + scale_channel_quantize` + `scale_block_quantize` | `channel_quantize = -1 and scale_channel_quantize = -1 are only supported for now` | | **NVFP4** | `nvfp4 weight only`, `nvfp4 block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize` | `channel_quantize = -1 is only supported for now`| -Note - Channel and Block quantization generally don't work well with convolutional layers. Please use the `modules_to_not_convert` argument to skip quantization for convolutional layers. Refer to the [official modelopt documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/) for a better understanding of the available quantization methods and the exhaustive list of configuration options available. diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index ae793a131a07..22fbf36f9041 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -16,6 +16,7 @@ if is_torch_available(): import torch + import torch.nn as nn if is_accelerate_available(): from accelerate.utils import set_module_tensor_to_device @@ -114,6 +115,24 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype" = None) -> "torch.dtype" logger.info("You did not specify `torch_dtype` in `from_pretrained`. Setting it to `torch.float32`.") torch_dtype = torch.float32 return torch_dtype + + def get_conv_param_names(self, model: "ModelMixin") -> List[str]: + """ + Get parameter names for all convolutional layers in a HuggingFace ModelMixin. + Includes Conv1d/2d/3d and ConvTranspose1d/2d/3d. + """ + conv_types = ( + nn.Conv1d, nn.Conv2d, nn.Conv3d, + nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d, + ) + + conv_param_names = [] + for name, module in model.named_modules(): + if isinstance(module, conv_types): + for param_name, _ in module.named_parameters(recurse=False): + conv_param_names.append(f"{name}.{param_name}") + + return conv_param_names def _process_model_before_weight_loading( self, @@ -135,6 +154,8 @@ def _process_model_before_weight_loading( if isinstance(modules_to_not_convert, str): modules_to_not_convert = [modules_to_not_convert] modules_to_not_convert.extend(keep_in_fp32_modules) + if self.quantization_config.disable_conv_quantization: + modules_to_not_convert.extend(self.get_conv_param_names(model)) for module in modules_to_not_convert: self.quantization_config.modelopt_config["quant_cfg"]["*" + module + "*"] = {"enable": False} diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index 361d787b040d..c4d155d9ea42 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -759,6 +759,8 @@ class NVIDIAModelOptConfig(QuantizationConfigMixin): The forward loop function to use for calibration during quantization. 
modelopt_config (`dict`, *optional*, default to `None`): The modelopt config, useful for passing custom configs to modelopt. + disable_conv_quantization (`bool`, *optional*, default to `False`): + If set to `True`, the quantization will be disabled for convolutional layers. kwargs (`Dict[str, Any]`, *optional*): Additional parameters which are to be used for calibration. """ @@ -787,6 +789,7 @@ def __init__( algorithm: str = "max", forward_loop: Optional[Callable] = None, modelopt_config: Optional[dict] = None, + disable_conv_quantization: bool = False, **kwargs, ) -> None: self.quant_method = QuantizationMethod.MODELOPT @@ -803,6 +806,7 @@ def __init__( self.scale_channel_quantize = scale_channel_quantize self.scale_block_quantize = scale_block_quantize self.modelopt_config = self.get_config_from_quant_type() if not modelopt_config else modelopt_config + self.disable_conv_quantization = disable_conv_quantization def _normalize_quant_type(self, quant_type: str) -> str: """ diff --git a/tests/quantization/modelopt/test_modelopt.py b/tests/quantization/modelopt/test_modelopt.py index bf21071f888d..989eafee9d80 100644 --- a/tests/quantization/modelopt/test_modelopt.py +++ b/tests/quantization/modelopt/test_modelopt.py @@ -2,7 +2,7 @@ import tempfile import unittest -from diffusers import NVIDIAModelOptConfig, SanaPipeline, SanaTransformer2DModel +from diffusers import NVIDIAModelOptConfig, StableDiffusion3Pipeline, SD3Transformer2DModel from diffusers.utils import is_nvidia_modelopt_available, is_torch_available from diffusers.utils.testing_utils import ( backend_empty_cache, @@ -32,9 +32,9 @@ @require_big_accelerator @require_accelerate class ModelOptBaseTesterMixin: - model_id = "Efficient-Large-Model/Sana_600M_1024px_diffusers" - model_cls = SanaTransformer2DModel - pipeline_cls = SanaPipeline + model_id = "hf-internal-testing/tiny-sd3-pipe" + model_cls = SD3Transformer2DModel + pipeline_cls = StableDiffusion3Pipeline torch_dtype = torch.bfloat16 expected_memory_reduction = 0.0 keep_in_fp32_module = "" @@ -270,7 +270,7 @@ def get_dummy_init_kwargs(self): "quant_type": "INT4", "block_quantize": 128, "channel_quantize": -1, - "modules_to_not_convert": ["conv", "patch_embed"], + "disable_conv_quantization": True, } From 5df692689ade2c73b90e9c65927d0e341bd38a65 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Fri, 22 Aug 2025 15:08:06 +0530 Subject: [PATCH 26/31] addressed PR comments --- .github/workflows/nightly_tests.yml | 3 +++ src/diffusers/dependency_versions_table.py | 2 +- .../quantizers/modelopt/modelopt_quantizer.py | 18 +++++++++++------- src/diffusers/utils/import_utils.py | 9 +-------- src/diffusers/utils/testing_utils.py | 13 +++++++++++++ tests/quantization/modelopt/test_modelopt.py | 4 +++- 6 files changed, 32 insertions(+), 17 deletions(-) diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index 92165640934d..54caf0e580b4 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -340,6 +340,9 @@ jobs: - backend: "optimum_quanto" test_location: "quanto" additional_deps: [] + - backend: "nvidia_modelopt" + test_location: "modelopt" + additional_deps: ["peft"] runs-on: group: aws-g6e-xlarge-plus container: diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 5bf0d2a85a85..79dc4c50a050 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -39,7 +39,7 @@ "gguf": "gguf>=0.10.0", "torchao": "torchao>=0.7.0", 
"bitsandbytes": "bitsandbytes>=0.43.3", - "nvidia_modelopt[hf]": "nvidia_modelopt[hf]>=0.27.0", + "nvidia_modelopt[hf]": "nvidia_modelopt[hf]>=0.33.1", "regex": "regex!=2019.12.17", "requests": "requests", "tensorboard": "tensorboard", diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index 22fbf36f9041..910faf7a5a6f 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -115,23 +115,27 @@ def update_torch_dtype(self, torch_dtype: "torch.dtype" = None) -> "torch.dtype" logger.info("You did not specify `torch_dtype` in `from_pretrained`. Setting it to `torch.float32`.") torch_dtype = torch.float32 return torch_dtype - + def get_conv_param_names(self, model: "ModelMixin") -> List[str]: """ - Get parameter names for all convolutional layers in a HuggingFace ModelMixin. - Includes Conv1d/2d/3d and ConvTranspose1d/2d/3d. + Get parameter names for all convolutional layers in a HuggingFace ModelMixin. Includes Conv1d/2d/3d and + ConvTranspose1d/2d/3d. """ conv_types = ( - nn.Conv1d, nn.Conv2d, nn.Conv3d, - nn.ConvTranspose1d, nn.ConvTranspose2d, nn.ConvTranspose3d, + nn.Conv1d, + nn.Conv2d, + nn.Conv3d, + nn.ConvTranspose1d, + nn.ConvTranspose2d, + nn.ConvTranspose3d, ) - + conv_param_names = [] for name, module in model.named_modules(): if isinstance(module, conv_types): for param_name, _ in module.named_parameters(recurse=False): conv_param_names.append(f"{name}.{param_name}") - + return conv_param_names def _process_model_before_weight_loading( diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 1e132c76c1c8..c718f7f652c3 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -225,14 +225,7 @@ def _is_package_available(pkg_name: str, get_dist_name: bool = False) -> Tuple[b _flash_attn_available, _flash_attn_version = _is_package_available("flash_attn") _flash_attn_3_available, _flash_attn_3_version = _is_package_available("flash_attn_3") _kornia_available, _kornia_version = _is_package_available("kornia") - -_nvidia_modelopt_available = importlib.util.find_spec("modelopt") is not None -if _nvidia_modelopt_available: - try: - _nvidia_modelopt_version = importlib_metadata.version("nvidia_modelopt") - logger.debug(f"Successfully import nvidia_modelopt version {_nvidia_modelopt_version}") - except importlib_metadata.PackageNotFoundError: - _nvidia_modelopt_available = False +_nvidia_modelopt_available, _nvidia_modelopt_version = _is_package_available("modelopt", get_dist_name=True) def is_torch_available(): diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index a0307c108ad4..0c8f074e1684 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -38,6 +38,7 @@ is_gguf_available, is_kernels_available, is_note_seq_available, + is_nvidia_modelopt_available, is_onnx_available, is_opencv_available, is_optimum_quanto_available, @@ -635,6 +636,18 @@ def decorator(test_case): return decorator +def require_modelopt_version_greater_or_equal(modelopt_version): + def decorator(test_case): + correct_nvidia_modelopt_version = is_nvidia_modelopt_available() and version.parse( + version.parse(importlib.metadata.version("modelopt")).base_version + ) >= version.parse(modelopt_version) + return unittest.skipUnless( + correct_nvidia_modelopt_version, f"Test requires modelopt with version greater than 
{modelopt_version}." + )(test_case) + + return decorator + + def require_kernels_version_greater_or_equal(kernels_version): def decorator(test_case): correct_kernels_version = is_kernels_available() and version.parse( diff --git a/tests/quantization/modelopt/test_modelopt.py b/tests/quantization/modelopt/test_modelopt.py index 989eafee9d80..6b0624a28083 100644 --- a/tests/quantization/modelopt/test_modelopt.py +++ b/tests/quantization/modelopt/test_modelopt.py @@ -2,7 +2,7 @@ import tempfile import unittest -from diffusers import NVIDIAModelOptConfig, StableDiffusion3Pipeline, SD3Transformer2DModel +from diffusers import NVIDIAModelOptConfig, SD3Transformer2DModel, StableDiffusion3Pipeline from diffusers.utils import is_nvidia_modelopt_available, is_torch_available from diffusers.utils.testing_utils import ( backend_empty_cache, @@ -12,6 +12,7 @@ numpy_cosine_similarity_distance, require_accelerate, require_big_accelerator, + require_modelopt_version_greater_or_equal, require_torch_cuda_compatibility, torch_device, ) @@ -31,6 +32,7 @@ @nightly @require_big_accelerator @require_accelerate +@require_modelopt_version_greater_or_equal("0.33.1") class ModelOptBaseTesterMixin: model_id = "hf-internal-testing/tiny-sd3-pipe" model_cls = SD3Transformer2DModel From 0bf90b064ec41b9b391804e75f98f949ea785146 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 26 Aug 2025 18:53:52 +0530 Subject: [PATCH 27/31] addressed PR comments --- .github/workflows/nightly_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly_tests.yml b/.github/workflows/nightly_tests.yml index 54caf0e580b4..479e5503eed2 100644 --- a/.github/workflows/nightly_tests.yml +++ b/.github/workflows/nightly_tests.yml @@ -342,7 +342,7 @@ jobs: additional_deps: [] - backend: "nvidia_modelopt" test_location: "modelopt" - additional_deps: ["peft"] + additional_deps: [] runs-on: group: aws-g6e-xlarge-plus container: From cf054d2f38bb5a0ebbd1f7e11e8c65519e2178a8 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Tue, 26 Aug 2025 20:13:06 +0530 Subject: [PATCH 28/31] addressed PR comments --- src/diffusers/quantizers/auto.py | 3 ++ .../quantizers/modelopt/modelopt_quantizer.py | 1 + .../quantizers/quantization_config.py | 30 +++++++++++++++++-- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/diffusers/quantizers/auto.py b/src/diffusers/quantizers/auto.py index f405c8ec5ec0..070bcd0b2151 100644 --- a/src/diffusers/quantizers/auto.py +++ b/src/diffusers/quantizers/auto.py @@ -141,6 +141,9 @@ def merge_quantization_configs( if isinstance(quantization_config, dict): quantization_config = cls.from_dict(quantization_config) + if isinstance(quantization_config, NVIDIAModelOptConfig): + quantization_config.check_model_patching() + if warning_msg != "": warnings.warn(warning_msg) diff --git a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py index 910faf7a5a6f..534f752321b3 100644 --- a/src/diffusers/quantizers/modelopt/modelopt_quantizer.py +++ b/src/diffusers/quantizers/modelopt/modelopt_quantizer.py @@ -186,4 +186,5 @@ def is_trainable(self): @property def is_serializable(self): + self.quantization_config.check_model_patching(operation="saving") return True diff --git a/src/diffusers/quantizers/quantization_config.py b/src/diffusers/quantizers/quantization_config.py index c4d155d9ea42..bf857956512c 100644 --- a/src/diffusers/quantizers/quantization_config.py +++ b/src/diffusers/quantizers/quantization_config.py @@ -25,6 
+25,7 @@ import inspect import json import os +import warnings from dataclasses import dataclass from enum import Enum from functools import partial @@ -269,7 +270,14 @@ def __init__( if bnb_4bit_quant_storage is None: self.bnb_4bit_quant_storage = torch.uint8 elif isinstance(bnb_4bit_quant_storage, str): - if bnb_4bit_quant_storage not in ["float16", "float32", "int8", "uint8", "float64", "bfloat16"]: + if bnb_4bit_quant_storage not in [ + "float16", + "float32", + "int8", + "uint8", + "float64", + "bfloat16", + ]: raise ValueError( "`bnb_4bit_quant_storage` must be a valid string (one of 'float16', 'float32', 'int8', 'uint8', 'float64', 'bfloat16') " ) @@ -480,7 +488,12 @@ class TorchAoConfig(QuantizationConfigMixin): ``` """ - def __init__(self, quant_type: str, modules_to_not_convert: Optional[List[str]] = None, **kwargs) -> None: + def __init__( + self, + quant_type: str, + modules_to_not_convert: Optional[List[str]] = None, + **kwargs, + ) -> None: self.quant_method = QuantizationMethod.TORCHAO self.quant_type = quant_type self.modules_to_not_convert = modules_to_not_convert @@ -808,6 +821,19 @@ def __init__( self.modelopt_config = self.get_config_from_quant_type() if not modelopt_config else modelopt_config self.disable_conv_quantization = disable_conv_quantization + def check_model_patching(self, operation: str = "loading"): + # ModelOpt imports diffusers internally. This is here to prevent circular imports + from modelopt.torch.opt.plugins.huggingface import _PATCHED_CLASSES + + if len(_PATCHED_CLASSES) == 0: + warning_msg = ( + f"Not {operation} weights in modelopt format. This might cause unreliable behavior." + "Please make sure to run the following code before loading/saving model weights:\n\n" + " from modelopt.torch.opt import enable_huggingface_checkpointing\n" + " enable_huggingface_checkpointing()\n" + ) + warnings.warn(warning_msg) + def _normalize_quant_type(self, quant_type: str) -> str: """ Validates and normalizes the quantization type string. From dd39595756dfc4310f5fb8075099c11ad35be58d Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Mon, 1 Sep 2025 13:48:55 +0530 Subject: [PATCH 29/31] addressed PR comments --- docs/source/en/quantization/modelopt.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/source/en/quantization/modelopt.md b/docs/source/en/quantization/modelopt.md index a38b3e7ef355..06933d47c221 100644 --- a/docs/source/en/quantization/modelopt.md +++ b/docs/source/en/quantization/modelopt.md @@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. --> # NVIDIA ModelOpt -[nvidia_modelopt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed. +[NVIDIA-ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed. Before you begin, make sure you have nvidia_modelopt installed. 
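
The `check_model_patching` guard added to `NVIDIAModelOptConfig` above warns whenever ModelOpt's HuggingFace checkpointing hooks are not active while weights are being loaded or saved. A minimal sketch of the workflow that keeps the warning quiet is shown below; the INT4 settings and the tiny SD3 checkpoint mirror this PR's test fixtures, while the `subfolder` argument and the save path are illustrative placeholders rather than anything prescribed by the patch.

```py
# Sketch only: the load/save flow that check_model_patching() expects.
# Checkpoint, subfolder, and save path are illustrative; INT4 settings mirror the test fixtures.
import torch
from modelopt.torch.opt import enable_huggingface_checkpointing

from diffusers import NVIDIAModelOptConfig, SD3Transformer2DModel

# Patch the HuggingFace save/load hooks first, otherwise check_model_patching()
# emits the "Not loading weights in modelopt format" warning.
enable_huggingface_checkpointing()

quant_config = NVIDIAModelOptConfig(
    quant_type="INT4",
    block_quantize=128,
    channel_quantize=-1,
    disable_conv_quantization=True,
)
transformer = SD3Transformer2DModel.from_pretrained(
    "hf-internal-testing/tiny-sd3-pipe",  # tiny test checkpoint used in this PR's tests
    subfolder="transformer",
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
)

# Saving goes through the same patched hooks; is_serializable re-runs the guard
# with operation="saving" before the checkpoint is written.
transformer.save_pretrained("sd3-transformer-modelopt-int4")  # placeholder path
```

Calling `enable_huggingface_checkpointing()` once per process is sufficient; both the loading path and the `save_pretrained` path then serialize/deserialize weights in the ModelOpt format instead of triggering the warning.
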
@@ -53,6 +53,12 @@ image = pipe( image.save("output.png") ``` +> **Note:** +> +> The quantization methods in NVIDIA-ModelOpt are designed to reduce the memory footprint of model weights using various QAT (Quantization-Aware Training) and PTQ (Post-Training Quantization) techniques while maintaining model performance. However, the actual performance gain during inference depends on the deployment framework (e.g., TRT-LLM, TensorRT) and the specific hardware configuration. +> +> More details can be found [here](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples). + ## NVIDIAModelOptConfig The `NVIDIAModelOptConfig` class accepts three parameters: From 8f601866dbfd19afcba47b02a61d0dc06f0cd8fe Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Mon, 1 Sep 2025 21:05:55 +0530 Subject: [PATCH 30/31] fix docs and dependencies --- docs/source/en/_toctree.yml | 2 ++ setup.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a0ddf8f25654..a97c82796fca 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -188,6 +188,8 @@ title: torchao - local: quantization/quanto title: quanto + - local: quantization/modelopt + title: NVIDIA ModelOpt - title: Model accelerators and hardware isExpanded: false diff --git a/setup.py b/setup.py index 87ce14e1d567..ba3ad8e2b307 100644 --- a/setup.py +++ b/setup.py @@ -132,7 +132,7 @@ "gguf>=0.10.0", "torchao>=0.7.0", "bitsandbytes>=0.43.3", - "nvidia_modelopt[hf]>=0.27.0", + "nvidia_modelopt[hf]>=0.33.1", "regex!=2019.12.17", "requests", "tensorboard", From 1a8806f2a44b60055096bc0416f578f4d7087399 Mon Sep 17 00:00:00 2001 From: ishan-modi Date: Mon, 1 Sep 2025 21:21:09 +0530 Subject: [PATCH 31/31] fixed dependency test --- tests/others/test_dependencies.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/others/test_dependencies.py b/tests/others/test_dependencies.py index a08129a1e9c9..db22f10c4b3c 100644 --- a/tests/others/test_dependencies.py +++ b/tests/others/test_dependencies.py @@ -39,6 +39,8 @@ def test_backend_registration(self): backend = "invisible-watermark" elif backend == "opencv": backend = "opencv-python" + elif backend == "nvidia_modelopt": + backend = "nvidia_modelopt[hf]" assert backend in deps, f"{backend} is not in the deps table!" def test_pipeline_imports(self):
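
The version gate behind `require_modelopt_version_greater_or_equal`, and the reason `test_dependencies.py` has to special-case the backend name, both come down to a module-versus-distribution naming mismatch: the importable package is `modelopt`, while the pinned dependency is `nvidia_modelopt[hf]`. A standalone approximation of that check is sketched below; it is illustrative only and is not the helper that ships in `diffusers.utils`.

```py
# Standalone sketch of the ModelOpt version gate; illustrative, not diffusers' own helper.
import importlib.metadata
import importlib.util

from packaging import version


def modelopt_at_least(min_version: str) -> bool:
    # The importable module is `modelopt`, but the distribution is published as
    # `nvidia-modelopt`, which is why the dependency test maps the backend name
    # onto "nvidia_modelopt[hf]" before looking it up in the deps table.
    if importlib.util.find_spec("modelopt") is None:
        return False
    for dist_name in ("modelopt", "nvidia-modelopt"):
        try:
            installed = importlib.metadata.version(dist_name)
            break
        except importlib.metadata.PackageNotFoundError:
            continue
    else:
        return False
    return version.parse(version.parse(installed).base_version) >= version.parse(min_version)


if __name__ == "__main__":
    print(modelopt_at_least("0.33.1"))
```

The `0.33.1` floor matches the pin introduced in `setup.py` and `dependency_versions_table.py` and the threshold passed to the new test decorator, so a single version constant governs packaging, import checks, and test gating.
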