From 719e5abb36cab9ae1d6c184e66c45ae62fca7276 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Thu, 9 Oct 2025 15:31:54 +0800 Subject: [PATCH 1/4] fp8 exporting bugfix Signed-off-by: Zhang, Weiwei1 --- auto_round/export/export_to_autoround/export_to_fp8.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index 7f069cb60..1f6cdbc65 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -109,11 +109,9 @@ def pack_layer(layer_name, model, data_type, device=None): torch_dtype = torch.float8_e5m2 info = torch.finfo(torch_dtype) if zp is not None: - q_weight = ( - weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) + zp.to(packing_device) - if isinstance(zp, torch.Tensor) - else zp - ) + if isinstance(zp, torch.Tensor): + zp = zp.to(packing_device) + q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) + zp else: q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1) q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len) @@ -235,3 +233,4 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model + From c2daa79099713a7fcdd45f8a97f3fd5e7170c711 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Thu, 6 Nov 2025 16:52:46 +0800 Subject: [PATCH 2/4] refine exllama backend cuda UT Signed-off-by: Zhang, Weiwei1 --- test/test_cuda/test_exllamav2_backend.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 5c12e0557..38905e9bd 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -12,7 +12,7 @@ from auto_round import AutoRound, AutoRoundConfig from auto_round.eval.evaluation import simple_evaluate_user_model -from auto_round.testing_utils import require_autogptq, require_gptqmodel +from auto_round.testing_utils import require_autogptq, require_gptqmodel, require_package_version_ut class LLMDataLoader: @@ -24,7 +24,7 @@ def __iter__(self): yield torch.ones([1, 10], dtype=torch.long) -class TestAutoRoundMarlinBackend(unittest.TestCase): +class TestAutoRoundexllamaBackend(unittest.TestCase): @classmethod def setUpClass(self): @@ -99,6 +99,7 @@ def test_gptqmodel_exllmav2_4bits_asym(self): shutil.rmtree("./saved", ignore_errors=True) @require_autogptq + @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym(self): model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) @@ -130,6 +131,7 @@ def test_gptq_exllamav2_4bits_sym(self): shutil.rmtree(self.save_folder, ignore_errors=True) @require_autogptq + @require_package_version_ut("torch", "<2.6.0") def test_gptq_exllamav2_4bits_sym_group_size(self): for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!") @@ -166,3 +168,4 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): if __name__ == "__main__": unittest.main() + From 76226fd8bd098ef1fce66e3e7371e00b7089cb2b Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Fri, 7 Nov 2025 11:49:27 +0800 Subject: [PATCH 3/4] fix non_auto_device_map typo Signed-off-by: Zhang, Weiwei1 --- auto_round/utils/device.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index b0ecf9019..9dee028f6 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -623,6 +623,8 @@ def set_non_auto_device_map( infos = device_map.split(",") device_map_dict = {} for info in infos: + if ":" not in info: + continue index = info.find(":") key = info[:index] value = info[index + 1 :] From df3a10cedd0f5655a8c2b391203e549400b8e488 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 7 Nov 2025 03:51:13 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- auto_round/export/export_to_autoround/export_to_fp8.py | 1 - test/test_cuda/test_exllamav2_backend.py | 1 - 2 files changed, 2 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8.py b/auto_round/export/export_to_autoround/export_to_fp8.py index 88f6d750c..8b8a618e2 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8.py +++ b/auto_round/export/export_to_autoround/export_to_fp8.py @@ -228,4 +228,3 @@ def wrapper(name): save_model(model, output_dir, safe_serialization=safe_serialization, dtype=dtype) return model - diff --git a/test/test_cuda/test_exllamav2_backend.py b/test/test_cuda/test_exllamav2_backend.py index 38905e9bd..c489b37b2 100644 --- a/test/test_cuda/test_exllamav2_backend.py +++ b/test/test_cuda/test_exllamav2_backend.py @@ -168,4 +168,3 @@ def test_gptq_exllamav2_4bits_sym_group_size(self): if __name__ == "__main__": unittest.main() -