Commit afd039d
[quantization] Dequant fp8 when cuda or xpu not available (#42511)
* up
* style
* add tests
* update
1 parent fa3cf83 commit afd039d
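In practical terms, a pre-quantized fine-grained FP8 checkpoint can now be loaded on a machine without CUDA or XPU: the quantizer warns and dequantizes the weights at load time instead of raising. A minimal sketch of the user-facing behaviour, based on the new tests (the checkpoint name is the one they use; any fine-grained FP8 checkpoint should behave the same way):

from transformers import AutoModelForCausalLM, AutoTokenizer

# On a CPU-only machine this used to raise RuntimeError; after this change the
# quantizer warns and dequantizes the FP8 weights while loading.
model_id = "hf-internal-testing/Llama-3.2-1B-Instruct-fp8"  # checkpoint used in the new tests
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")

tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("Once upon a time", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=10, do_sample=False)
print(tokenizer.decode(output[0], skip_special_tokens=True))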

File tree: 5 files changed, +134 -32 lines changed

src/transformers/integrations/finegrained_fp8.py

Lines changed: 34 additions & 28 deletions
@@ -14,8 +14,7 @@
 # limitations under the License.
 
 import re
-from collections.abc import Sequence
-from typing import Any
+from typing import Optional
 
 from ..core_model_loading import ConversionOps
 from ..utils import is_accelerate_available, is_torch_accelerator_available, is_torch_available, logging
@@ -549,6 +548,9 @@ def replace_with_fp8_linear(
     quantization_config=None,
 ):
     """Helper function to replace model layers with FP8 versions."""
+    if quantization_config.dequantize:
+        return model
+
     if modules_to_not_convert is None:
         modules_to_not_convert = []
     modules_to_not_convert += ["lm_head"]
@@ -652,41 +654,45 @@ def convert(self, input_dict: torch.Tensor, **kwargs) -> dict[str, torch.Tensor]
 class Fp8Dequantize(ConversionOps):
     """Inverse operation of :class:`Fp8Quantize`. Takes a pair (weight, scale) and reconstructs the fp32 tensor."""
 
-    def __init__(self, block_size: tuple[int, int] | None = None):
-        self.block_size = block_size
+    def __init__(self, hf_quantizer):
+        self.hf_quantizer = hf_quantizer
 
     def convert(
         self,
-        value: Sequence[torch.Tensor] | dict[str, torch.Tensor],
-        *,
-        context: dict[str, Any],
-    ) -> torch.Tensor:
-        if isinstance(value, dict):
-            tensors = list(value.values())
-        else:
-            tensors = list(value) if isinstance(value, Sequence) else [value]
-        if len(tensors) != 2:
-            raise ValueError("Fp8Dequantize expects exactly two tensors: quantized weights and scales.")
-        quantized, scales = tensors
-        if not isinstance(quantized, torch.Tensor) or not isinstance(scales, torch.Tensor):
-            raise TypeError("Fp8Dequantize expects tensors as inputs.")
-
-        quantized_fp32 = quantized.to(torch.float32)
-        rows, cols = quantized_fp32.shape[-2:]
-        block_size = self.block_size
-        if block_size is None:
-            quant_config = context.get("quantization_config")
-            block_size = getattr(quant_config, "weight_block_size", None)
-        if block_size is None:
-            block_size = (rows, cols)
+        input_dict: dict[str, torch.Tensor],
+        model: Optional[torch.nn.Module] = None,
+        full_layer_name: str | None = None,
+        missing_keys=None,
+        **kwargs,
+    ) -> dict[str, torch.Tensor]:
+        if len(input_dict) != 2:
+            # in case of no scales, the weights are not quantized, so we return the weights as is
+            return {
+                full_layer_name: input_dict["weight$"][0]
+                if isinstance(input_dict["weight$"], list)
+                else input_dict["weight$"]
+            }
+        quantized = input_dict["weight$"][0] if isinstance(input_dict["weight$"], list) else input_dict["weight$"]
+        scales = (
+            input_dict["weight_scale_inv"][0]
+            if isinstance(input_dict["weight_scale_inv"], list)
+            else input_dict["weight_scale_inv"]
+        )
+
+        rows, cols = quantized.shape[-2:]
+        block_size = self.hf_quantizer.quantization_config.weight_block_size
+
         block_m, block_n = block_size
         if rows % block_m != 0 or cols % block_n != 0:
             raise ValueError(
                 f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_m}, {block_n})."
             )
 
-        reshaped = quantized_fp32.reshape(-1, rows // block_m, block_m, cols // block_n, block_n)
+        reshaped = quantized.reshape(-1, rows // block_m, block_m, cols // block_n, block_n)
         expanded_scales = scales.to(torch.float32).reshape(-1, rows // block_m, cols // block_n)
         expanded_scales = expanded_scales.unsqueeze(-1).unsqueeze(2)
         dequantized = reshaped * expanded_scales
-        return dequantized.reshape(quantized_fp32.shape)
+
+        return {
+            full_layer_name: dequantized.reshape(quantized.shape),
+        }
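For readers unfamiliar with the reshape/broadcast trick used in `convert` above: each (block_m, block_n) tile of the weight matrix shares one scale, so dequantization is a tile-wise multiply. A standalone sketch of that math (an illustrative helper, not the library code):

import torch


def blockwise_dequantize(quantized: torch.Tensor, scales: torch.Tensor, block_size=(128, 128)) -> torch.Tensor:
    """Multiply each (block_m, block_n) tile of `quantized` by its per-tile scale."""
    block_m, block_n = block_size
    rows, cols = quantized.shape[-2:]
    if rows % block_m or cols % block_n:
        raise ValueError("matrix dimensions must be divisible by the block sizes")
    # Split the matrix into tiles: (-1, rows//block_m, block_m, cols//block_n, block_n)
    tiles = quantized.to(torch.float32).reshape(-1, rows // block_m, block_m, cols // block_n, block_n)
    # One scale per tile, broadcast over the tile dims: (-1, rows//block_m, 1, cols//block_n, 1)
    s = scales.to(torch.float32).reshape(-1, rows // block_m, cols // block_n).unsqueeze(-1).unsqueeze(2)
    return (tiles * s).reshape(quantized.shape)


# A 256x256 weight with 128x128 blocks has a 2x2 grid of scales; the top-left
# tile of the result is the top-left tile of the input times scales[0, 0].
w, s = torch.randn(256, 256), torch.rand(2, 2)
assert torch.allclose(blockwise_dequantize(w, s)[:128, :128], w[:128, :128] * s[0, 0])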

src/transformers/quantizers/auto.py

Lines changed: 10 additions & 2 deletions
@@ -224,7 +224,15 @@ def merge_quantization_configs(
         if (
             isinstance(
                 quantization_config,
-                (GPTQConfig, AwqConfig, AutoRoundConfig, FbgemmFp8Config, CompressedTensorsConfig, Mxfp4Config),
+                (
+                    GPTQConfig,
+                    AwqConfig,
+                    AutoRoundConfig,
+                    FbgemmFp8Config,
+                    CompressedTensorsConfig,
+                    Mxfp4Config,
+                    FineGrainedFP8Config,
+                ),
             )
             and quantization_config_from_args is not None
         ):
@@ -234,7 +242,7 @@
 
             warning_msg += f"However, loading attributes (e.g. {list(loading_attr_dict.keys())}) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored."
 
-            if warning_msg != "" and not isinstance(quantization_config, Mxfp4Config):
+            if warning_msg != "" and not isinstance(quantization_config, (Mxfp4Config, FineGrainedFP8Config)):
                 warnings.warn(warning_msg)
             else:
                 # in the case of mxfp4, we don't want to print the warning message, bit confusing for users
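The effect of adding `FineGrainedFP8Config` to this tuple: when a checkpoint already carries a fine-grained FP8 config and the caller passes one to `from_pretrained`, the loading attributes of the user config (currently just `dequantize`, see `get_loading_attributes` below) override the serialized ones, and the override warning is suppressed, mirroring the existing Mxfp4 behaviour. Roughly, this is the call it enables (a sketch, reusing the checkpoint from the tests):

from transformers import AutoModelForCausalLM, FineGrainedFP8Config

# The checkpoint ships its own FineGrainedFP8Config; the `dequantize` flag from
# the config passed here overrides the serialized value without a warning.
model = AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/Llama-3.2-1B-Instruct-fp8",  # FP8 checkpoint used in the tests
    quantization_config=FineGrainedFP8Config(dequantize=True),
)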

src/transformers/quantizers/quantizer_finegrained_fp8.py

Lines changed: 24 additions & 2 deletions
@@ -38,8 +38,15 @@ def validate_environment(self, *args, **kwargs):
         if not is_accelerate_available():
             raise ImportError("Loading an FP8 quantized model requires accelerate (`pip install accelerate`)")
 
-        if not (torch.cuda.is_available() or is_torch_xpu_available()):
-            raise RuntimeError("No GPU or XPU found. A GPU or XPU is needed for FP8 quantization.")
+        if (not (torch.cuda.is_available() or is_torch_xpu_available())) and not self.quantization_config.dequantize:
+            if self.pre_quantized:
+                logger.warning_once(
+                    "Using FP8 quantized models requires a GPU or XPU, we will default to dequantizing the model to bf16 since no GPU or XPU is available"
+                )
+                self.quantization_config.dequantize = True
+                return
+            else:
+                raise RuntimeError("No GPU or XPU found. A GPU or XPU is needed for FP8 quantization.")
 
         if torch.cuda.is_available():
             compute_capability = torch.cuda.get_device_capability()
@@ -231,3 +238,18 @@ def get_quantize_ops(self):
         from ..integrations.finegrained_fp8 import Fp8Quantize
 
         return Fp8Quantize(self)
+
+    def get_weight_conversions(self):
+        from ..core_model_loading import WeightConverter
+        from ..integrations.finegrained_fp8 import Fp8Dequantize
+
+        if self.pre_quantized and self.quantization_config.dequantize:
+            return [
+                # either use the dollar sign, or permute the source patterns to start matching against the scales first
+                WeightConverter(
+                    source_patterns=["weight$", "weight_scale_inv"],
+                    target_patterns="weight",
+                    operations=[Fp8Dequantize(self)],
+                )
+            ]
+        return []
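To make the new control flow in `validate_environment` explicit, here is a plain-Python paraphrase of the decision it now makes (illustrative only, not the actual method; the function name is made up):

def fp8_loading_mode(has_cuda_or_xpu: bool, pre_quantized: bool, dequantize: bool) -> str:
    """Paraphrase of the branch added to validate_environment above."""
    if dequantize:
        # Explicitly requested via FineGrainedFP8Config(dequantize=True): skip the hardware check.
        return "dequantize while loading"
    if has_cuda_or_xpu:
        return "keep FP8 weights"  # normal path, FP8 kernels on GPU/XPU
    if pre_quantized:
        # New fallback: warn, set quantization_config.dequantize = True, and load in higher precision.
        return "dequantize while loading"
    # Quantizing on the fly still needs an accelerator.
    raise RuntimeError("No GPU or XPU found. A GPU or XPU is needed for FP8 quantization.")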

src/transformers/utils/quantization_config.py

Lines changed: 7 additions & 0 deletions
@@ -1981,6 +1981,8 @@ class FineGrainedFP8Config(QuantizationConfigMixin):
            The scheme used for activation, the defaults and only support scheme for now is "dynamic".
        weight_block_size (`typing.tuple[int, int]`, *optional*, defaults to `(128, 128)`):
            The size of the weight blocks for quantization, default is (128, 128).
+        dequantize (`bool`, *optional*, defaults to `False`):
+            Whether to dequantize the model during loading.
        modules_to_not_convert (`list`, *optional*):
            A list of module names that should not be converted during quantization.
    """
@@ -1989,13 +1991,15 @@ def __init__(
         self,
         activation_scheme: str = "dynamic",
         weight_block_size: tuple[int, int] = (128, 128),
+        dequantize: bool = False,
         modules_to_not_convert: list | None = None,
         **kwargs,
     ):
         self.quant_method = QuantizationMethod.FP8
         self.modules_to_not_convert = modules_to_not_convert
         self.activation_scheme = activation_scheme
         self.weight_block_size = weight_block_size
+        self.dequantize = dequantize
         self.post_init()
 
     def post_init(self):
@@ -2010,6 +2014,9 @@ def post_init(self):
         if self.weight_block_size[0] <= 0 or self.weight_block_size[1] <= 0:
             raise ValueError("weight_block_size must be a tuple of two positive integers")
 
+    def get_loading_attributes(self):
+        return {"dequantize": self.dequantize}
+
 
 class QuarkConfig(QuantizationConfigMixin):
     def __init__(
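A quick sanity check of the new config field and the loading attribute it exposes (the expected values follow directly from the hunk above):

from transformers import FineGrainedFP8Config

config = FineGrainedFP8Config(dequantize=True)
print(config.dequantize)                  # True
print(config.get_loading_attributes())    # {'dequantize': True}
print(FineGrainedFP8Config().dequantize)  # False (the default)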

tests/quantization/finegrained_fp8/test_fp8.py

Lines changed: 59 additions & 0 deletions
@@ -15,8 +15,11 @@
 import gc
 import tempfile
 import unittest
+from contextlib import ExitStack, contextmanager
+from unittest.mock import patch
 
 from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FineGrainedFP8Config, OPTForCausalLM
+from transformers.quantizers.quantizer_finegrained_fp8 import FineGrainedFP8HfQuantizer
 from transformers.testing_utils import (
     backend_empty_cache,
     get_device_properties,
@@ -37,6 +40,15 @@
     from accelerate import init_empty_weights
 
 
+@contextmanager
+def _patch_no_accelerator():
+    with ExitStack() as stack:
+        stack.enter_context(patch("torch.cuda.is_available", return_value=False))
+        if hasattr(torch, "xpu"):
+            stack.enter_context(patch("torch.xpu.is_available", return_value=False))
+        yield
+
+
 @require_torch_accelerator
 class FineGrainedFP8ConfigTest(unittest.TestCase):
     def test_to_dict(self):
@@ -71,9 +83,11 @@ def test_from_dict(self):
 )
 class FP8QuantizerTest(unittest.TestCase):
     model_name = "meta-llama/Llama-3.2-1B"
+    quantized_model_name = "hf-internal-testing/Llama-3.2-1B-Instruct-fp8"
     input_text = "Once upon a time"
     max_new_tokens = 10
     EXPECTED_OUTPUT = "Once upon a time, there was a man who was very rich."
+    EXPECTED_DEQUANTIZED_OUTPUT = "Once upon a time, in a small village nestled in the rolling hills"
     device_map = torch_device
     offload_device_map = {
         "model.embed_tokens": 0,
@@ -152,6 +166,25 @@ def test_quantized_model_conversion(self):
 
         self.assertEqual(nb_linears - 25, nb_fp8_linear)
 
+    def test_quantizer_validation_no_accelerator(self):
+        """Test quantizer validation when CUDA/XPU is not available"""
+        with _patch_no_accelerator():
+            config = FineGrainedFP8Config()
+            quantizer = FineGrainedFP8HfQuantizer(config)
+            quantizer.pre_quantized = False
+
+            with self.assertRaises(RuntimeError):
+                quantizer.validate_environment()
+
+    def test_dequantization_no_accelerator(self):
+        """Test dequantization when CUDA/XPU is not available"""
+        with _patch_no_accelerator():
+            config = FineGrainedFP8Config()
+            quantizer = FineGrainedFP8HfQuantizer(config)
+            quantizer.pre_quantized = True
+            quantizer.validate_environment()
+            self.assertTrue(quantizer.quantization_config.dequantize)
+
     def test_quantized_model(self):
         """
         Simple test that checks if the quantized model is working properly
@@ -162,6 +195,32 @@ def test_quantized_model(self):
         output_tokens = self.tokenizer.decode(output[0], skip_special_tokens=True)
         self.assertEqual(output_tokens, self.EXPECTED_OUTPUT)
 
+    def test_dequantized_model(self):
+        """
+        Simple test that checks if the dequantized model is working properly
+        """
+        quantization_config = FineGrainedFP8Config(dequantize=True)
+        dequantized_model = AutoModelForCausalLM.from_pretrained(
+            self.quantized_model_name, device_map=self.device_map, quantization_config=quantization_config
+        )
+        input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(self.device_map)
+        output = dequantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
+        output_tokens = self.tokenizer.decode(output[0], skip_special_tokens=True)
+        self.assertEqual(output_tokens, self.EXPECTED_DEQUANTIZED_OUTPUT)
+        del dequantized_model
+
+    def test_dequantize_when_no_accelerator(self):
+        """
+        Simple test that checks if the dequantized model is working properly when no accelerator is available
+        """
+        with _patch_no_accelerator():
+            dequantized_model = AutoModelForCausalLM.from_pretrained(self.quantized_model_name, device_map="cpu")
+            input_ids = self.tokenizer(self.input_text, return_tensors="pt").to("cpu")
+            output = dequantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False)
+            output_tokens = self.tokenizer.decode(output[0], skip_special_tokens=True)
+            self.assertEqual(output_tokens, self.EXPECTED_DEQUANTIZED_OUTPUT)
+            del dequantized_model
+
     def test_save_pretrained(self):
         """
         Simple test that checks if the quantized model is working properly after being saved and loaded
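For reference, the new tests can be run on their own with, for example, `python -m pytest tests/quantization/finegrained_fp8/test_fp8.py -k "dequant or no_accelerator" -v`; note that the model-level tests download the FP8 checkpoint from the Hub.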
