README.md (1 addition, 1 deletion)
@@ -323,7 +323,7 @@ The support for Gaudi device is limited.
 from transformers import AutoModelForCausalLM, AutoTokenizer

 model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound"
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
auto_round/__main__.py (1 addition, 1 deletion)
@@ -701,7 +701,7 @@ def tune(args):
 logger.error("Cannot find correct gguf file for evaluation, please check.")
 sys.exit(-1)
 model = AutoModelForCausalLM.from_pretrained(
-eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+eval_folder, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype
 )
 model.eval()
 tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file)
auto_round/compressors/diffusion/README.md (1 addition, 1 deletion)
@@ -15,7 +15,7 @@ from diffusers import AutoPipelineForText2Image

 # Load the model
 model_name = "black-forest-labs/FLUX.1-dev"
-pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+pipe = AutoPipelineForText2Image.from_pretrained(model_name, dtype=torch.bfloat16)

 # Quantize the model
 autoround = AutoRound(
auto_round/eval/eval_cli.py (2 additions, 2 deletions)
@@ -174,7 +174,7 @@ def eval(args):
 " but may affect accuracy."
 )
 model = AutoModelForCausalLM.from_pretrained(
-model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+model, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype
 )
 model.eval()
 st = time.time()
@@ -252,7 +252,7 @@ def eval_task_by_task(
 )

 model = AutoModelForCausalLM.from_pretrained(
-model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+model, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype
 )
 model.eval()
 parallelism = False
auto_round/experimental/kv_cache.py (4 additions, 4 deletions)
@@ -263,7 +263,7 @@ def prep_attention_module_for_calibration(module: torch.nn.Module):

 def normalize_static_kv_dtype(static_kv_dtype: Union[str, torch.dtype]) -> torch.dtype:
 valid_dtype_name_lst = ["float16", "bfloat16", "fp8", "float32", "float"]
-valid_torch_dtype = {
+valid_dtype = {
 "float16": torch.float16,
 "bfloat16": torch.bfloat16,
 "fp8": torch.float8_e4m3fn,
@@ -272,13 +272,13 @@ def normalize_static_kv_dtype(static_kv_dtype: Union[str, torch.dtype]) -> torch
 "float": torch.float32,  # Alias for float32
 }
 if static_kv_dtype in valid_dtype_name_lst:
-new_dtype = valid_torch_dtype[static_kv_dtype]
-elif static_kv_dtype in valid_torch_dtype.values():
+new_dtype = valid_dtype[static_kv_dtype]
+elif static_kv_dtype in valid_dtype.values():
 new_dtype = static_kv_dtype
 else:
 raise ValueError(
 f"Invalid static kv dtype: {static_kv_dtype}. "
-f"Valid options are: {', '.join(valid_dtype_name_lst + list(valid_torch_dtype.values()))}."
+f"Valid options are: {', '.join(valid_dtype_name_lst + list(valid_dtype.values()))}."
 )
 return new_dtype

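For context, the helper above accepts either a string name or an actual torch.dtype and returns a concrete torch.dtype, raising on anything else. A minimal usage sketch based on the mapping shown in the diff (the import path is taken from the file header and is otherwise untested here):

```python
import torch

from auto_round.experimental.kv_cache import normalize_static_kv_dtype

# String names resolve through the lookup table; "fp8" maps to float8_e4m3fn.
assert normalize_static_kv_dtype("bfloat16") is torch.bfloat16
assert normalize_static_kv_dtype("fp8") is torch.float8_e4m3fn
# torch.dtype values already present in the table pass through unchanged.
assert normalize_static_kv_dtype(torch.float16) is torch.float16
```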
auto_round/export/export_to_autoround/export_to_fp8.py (4 additions, 4 deletions)
@@ -105,10 +105,10 @@ def pack_layer(layer_name, model, data_type, device=None):
 weight = layer.weight
 weight, orig_shape, pad_len = reshape_pad_tensor_by_group_size(weight, layer.group_size)
 act_scale = layer.act_scale.view(-1) if hasattr(layer, "act_scale") else None
-torch_dtype = torch.float8_e4m3fn
+dtype = torch.float8_e4m3fn
 if "fp8_e5m2" in data_type:
-torch_dtype = torch.float8_e5m2
-info = torch.finfo(torch_dtype)
+dtype = torch.float8_e5m2
+info = torch.finfo(dtype)
 if zp is not None:
 if isinstance(zp, torch.Tensor):
 zp = zp.to(packing_device)
@@ -117,7 +117,7 @@ def pack_layer(layer_name, model, data_type, device=None):
 q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1)
 q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len)
 q_weight = torch.clamp(q_weight, info.min, info.max)
-q_weight = q_weight.to(torch_dtype)
+q_weight = q_weight.to(dtype)
 if type(layer) == torch.nn.Linear:
 in_features = layer.in_features
 out_features = layer.out_features
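The rename leaves the packing math untouched: weights are divided by a per-group scale, clamped to the finfo range of the chosen float8 format, and cast. A self-contained sketch of that core step (shapes, the per-row scale, and the helper name are illustrative, and it needs a PyTorch build with float8 dtypes):

```python
import torch

def quantize_to_fp8(weight: torch.Tensor, scale: torch.Tensor, data_type: str = "fp8") -> torch.Tensor:
    # Pick the float8 format the same way the pack_layer branch above does.
    dtype = torch.float8_e5m2 if "fp8_e5m2" in data_type else torch.float8_e4m3fn
    info = torch.finfo(dtype)
    q = weight / scale.unsqueeze(-1)        # scale per output row
    q = torch.clamp(q, info.min, info.max)  # keep values representable in fp8
    return q.to(dtype)

w = torch.randn(4, 8)
s = w.abs().amax(dim=-1) / torch.finfo(torch.float8_e4m3fn).max
print(quantize_to_fp8(w, s).dtype)  # torch.float8_e4m3fn
```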
auto_round/export/export_to_gguf/convert_hf_to_gguf.py (1 addition, 1 deletion)
@@ -172,7 +172,7 @@ def __init__(

 # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
 if self.ftype == gguf.LlamaFileType.GUESSED:
-# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
+# NOTE: can't use field "dtype" in config.json, because some finetunes lie.
 _, first_tensor = next(self.get_tensors())
 if first_tensor.dtype == torch.float16:
 logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
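The comment rename only tracks the new config.json field name; the heuristic itself still ignores that field and inspects the dtype of the first tensor. A standalone sketch of the idea (the helper name and the bf16/f32 fallbacks are assumptions for illustration, not code from this converter):

```python
import torch

def guess_outtype(first_tensor: torch.Tensor) -> str:
    # Infer a GGUF output type from the first tensor rather than trusting the
    # "dtype" field in config.json, which some finetunes misreport.
    if first_tensor.dtype == torch.float16:
        return "f16"
    if first_tensor.dtype == torch.bfloat16:
        return "bf16"
    return "f32"

print(guess_outtype(torch.zeros(2, dtype=torch.float16)))  # f16
```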
(second fp8 pack_layer implementation; file header not captured in this view)
@@ -73,10 +73,10 @@ def pack_layer(layer_name: str, model: torch.nn.Module, data_type: str, device:
 weight = layer.weight
 weight, orig_shape, pad_len = reshape_pad_tensor_by_group_size(weight, layer.group_size)
 act_scale = layer.act_scale.view(-1) if hasattr(layer, "act_scale") else None
-torch_dtype = torch.float8_e4m3fn
+dtype = torch.float8_e4m3fn
 if "fp8_e5m2" in data_type:
-torch_dtype = torch.float8_e5m2
-info = torch.finfo(torch_dtype)
+dtype = torch.float8_e5m2
+info = torch.finfo(dtype)
 if zp is not None:
 if isinstance(zp, torch.Tensor):
 zp = zp.to(packing_device)
@@ -85,7 +85,7 @@ def pack_layer(layer_name: str, model: torch.nn.Module, data_type: str, device:
 q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1)
 q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len)
 q_weight = torch.clamp(q_weight, info.min, info.max)
-q_weight = q_weight.to(torch_dtype)
+q_weight = q_weight.to(dtype)
 if type(layer) == torch.nn.Linear:
 in_features = layer.in_features
 out_features = layer.out_features
auto_round/export/utils.py (1 addition, 1 deletion)
@@ -58,7 +58,7 @@ def save_model(
 if dtype is not None and dtype != model.dtype and os.path.exists(os.path.join(save_dir, "config.json")):
 with open(config_path, "r") as file:
 data = json.load(file)
-data["torch_dtype"] = str(dtype).split(".")[-1]
+data["dtype"] = str(dtype).split(".")[-1]
 with open(config_path, "w") as file:
 json.dump(data, file, indent=2)
 config_file = "quantization_config.json"
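When the export dtype differs from the model's dtype, save_model now rewrites the dtype entry (previously torch_dtype) in the exported config.json. A small standalone sketch of that rewrite (the path and dtype value are illustrative):

```python
import json
import torch

config_path = "saved_model/config.json"  # illustrative path
dtype = torch.bfloat16

with open(config_path, "r") as f:
    data = json.load(f)
data["dtype"] = str(dtype).split(".")[-1]  # "torch.bfloat16" -> "bfloat16"
with open(config_path, "w") as f:
    json.dump(data, f, indent=2)
```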
auto_round/inference/auto_quantizer.py (4 additions, 4 deletions)
@@ -329,10 +329,10 @@ def validate_environment(self, *args, **kwargs):
 "auto-round` or install from source"
 )

-def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
-if torch_dtype is None:
-torch_dtype = torch.bfloat16
-return torch_dtype
+def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
+if dtype is None:
+dtype = torch.bfloat16
+return dtype

 def post_init_model(self, model):
 """Post-initialization that require device information, for example buffers initialization on device.
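The renamed hook keeps its old default: when the caller does not request a dtype, the quantizer falls back to bfloat16. A tiny standalone illustration of that behavior (the helper name is made up for the example):

```python
import torch

def resolve_load_dtype(requested):
    # Mirrors update_dtype above: default to bfloat16 when nothing is requested.
    return torch.bfloat16 if requested is None else requested

assert resolve_load_dtype(None) is torch.bfloat16
assert resolve_load_dtype(torch.float16) is torch.float16
```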
auto_round/modelling/gpt_oss.py (1 addition, 1 deletion)
@@ -62,7 +62,7 @@ def __init__(self, config: GptOssConfig, original: GptOssMLP):
 super().__init__()
 hidden_size = config.hidden_size
 intermediate_size = config.intermediate_size
-dtype_str = getattr(config, "torch_dtype", None) or getattr(config, "dtype", None)
+dtype_str = getattr(config, "dtype", None)
 dtype = torch.bfloat16 if str(dtype_str).endswith("bfloat16") else torch.float32
 top_k = config.num_experts_per_tok
 self.hidden_size = hidden_size
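The simplified lookup assumes the config now exposes dtype directly, so the old torch_dtype fallback is dropped. A tiny sketch of the resolution with a stand-in config object (a real call receives a GptOssConfig):

```python
import torch
from types import SimpleNamespace

config = SimpleNamespace(dtype="bfloat16")  # stand-in for a GptOssConfig

dtype_str = getattr(config, "dtype", None)
dtype = torch.bfloat16 if str(dtype_str).endswith("bfloat16") else torch.float32
assert dtype is torch.bfloat16
```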
auto_round/utils/model.py (17 additions, 19 deletions)
@@ -195,9 +195,9 @@ def llm_load_model(
 )

 device_str, use_auto_mapping = get_device_and_parallelism(device)
-torch_dtype = "auto"
+dtype = "auto"
 if device_str is not None and "hpu" in device_str:
-torch_dtype = torch.bfloat16
+dtype = torch.bfloat16

 is_glm = bool(re.search("chatglm", pretrained_model_name_or_path.lower()))

@@ -210,7 +210,7 @@ def llm_load_model(
 if _use_hpu_compile_mode():
 model = model_cls.from_pretrained(
 pretrained_model_name_or_path,
-torch_dtype=torch_dtype,
+dtype=dtype,
 attn_implementation="eager",
 trust_remote_code=trust_remote_code,
 device_map="auto" if use_auto_mapping else None,
@@ -219,7 +219,7 @@ def llm_load_model(
 try:
 model = model_cls.from_pretrained(
 pretrained_model_name_or_path,
-torch_dtype=torch_dtype,
+dtype=dtype,
 trust_remote_code=trust_remote_code,
 device_map="auto" if use_auto_mapping else None,
 )
@@ -228,7 +228,7 @@ def llm_load_model(
 orig_func = set_fake_cuda_device_capability()
 model = model_cls.from_pretrained(
 pretrained_model_name_or_path,
-torch_dtype=torch_dtype,
+dtype=dtype,
 trust_remote_code=trust_remote_code,
 device_map="auto" if use_auto_mapping else None,
 )
@@ -241,7 +241,7 @@ def llm_load_model(
 logger.warning(f"fail to load {pretrained_model_name_or_path}, set trust_remote_code to False and retry.")
 model = model_cls.from_pretrained(
 pretrained_model_name_or_path,
-torch_dtype=torch_dtype,
+dtype=dtype,
 trust_remote_code=False,
 device_map="auto" if use_auto_mapping else None,
 )
@@ -256,7 +256,7 @@ def llm_load_model(
 def mllm_load_model(
 pretrained_model_name_or_path,
 device="cpu",
-torch_dtype="auto",
+dtype="auto",
 use_auto_mapping=True,
 trust_remote_code=True,
 model_dtype=None,
@@ -268,9 +268,9 @@ def mllm_load_model(
 from auto_round.utils.device import get_device_and_parallelism, set_fake_cuda_device_capability

 device_str, use_auto_mapping = get_device_and_parallelism(device)
-torch_dtype = "auto"
+dtype = "auto"
 if device_str is not None and "hpu" in device_str:
-torch_dtype = torch.bfloat16
+dtype = torch.bfloat16
 if os.path.isdir(pretrained_model_name_or_path):
 config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json")))
 else:
@@ -306,7 +306,7 @@ def mllm_load_model(
 model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
 pretrained_model_name_or_path,
 trust_remote_code=trust_remote_code,
-torch_dtype=torch_dtype,
+dtype=dtype,
 device_map="auto" if use_auto_mapping else None,
 )
 else:
@@ -318,7 +318,7 @@ def mllm_load_model(
 pretrained_model_name_or_path,
 model_base=None,
 model_name=pretrained_model_name_or_path,
-torch_dtype=torch_dtype,
+dtype=dtype,
 )
 else:
 if architectures.endswith("Model") and hasattr(
@@ -333,7 +333,7 @@ def mllm_load_model(
 model = cls.from_pretrained(
 pretrained_model_name_or_path,
 trust_remote_code=trust_remote_code,
-torch_dtype=torch_dtype,
+dtype=dtype,
 device_map="auto" if use_auto_mapping else None,
 )
 except ValueError as e:
@@ -342,7 +342,7 @@ def mllm_load_model(
 model = cls.from_pretrained(
 pretrained_model_name_or_path,
 trust_remote_code=trust_remote_code,
-torch_dtype=torch_dtype,
+dtype=dtype,
 device_map="auto" if use_auto_mapping else None,
 )
 torch.cuda.get_device_capability = orig_func
@@ -383,7 +383,7 @@ def mllm_load_model(
 def diffusion_load_model(
 pretrained_model_name_or_path: str,
 device: Union[str, torch.device] = "cpu",
-torch_dtype: Union[str, torch.dtype] = "auto",
+dtype: Union[str, torch.dtype] = "auto",
 use_auto_mapping: bool = False,
 trust_remote_code: bool = True,
 model_dtype: str = None,
@@ -393,15 +393,13 @@ def diffusion_load_model(
 from auto_round.utils.device import get_device_and_parallelism

 device_str, use_auto_mapping = get_device_and_parallelism(device)
-torch_dtype = "auto"
+dtype = "auto"
 if device_str is not None and "hpu" in device_str:
-torch_dtype = torch.bfloat16
+dtype = torch.bfloat16

 pipelines = LazyImport("diffusers.pipelines")

-pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained(
-pretrained_model_name_or_path, torch_dtype=torch_dtype
-)
+pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained(pretrained_model_name_or_path, dtype=dtype)
 pipe = _to_model_dtype(pipe, model_dtype)
 model = pipe.transformer
 return pipe, model.to(device)
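All three loaders follow the same pattern after the rename: request dtype="auto" and override it to bfloat16 only when the resolved device string points at Gaudi/HPU. A condensed sketch of that selection (the helper name is illustrative, not part of the file):

```python
import torch

def pick_load_dtype(device_str):
    # "auto" lets transformers keep the checkpoint dtype; HPU prefers bfloat16.
    dtype = "auto"
    if device_str is not None and "hpu" in device_str:
        dtype = torch.bfloat16
    return dtype

print(pick_load_dtype("cpu"))    # auto
print(pick_load_dtype("hpu:0"))  # torch.bfloat16
```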
docs/step_by_step.md (6 additions, 6 deletions)
@@ -489,7 +489,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 model_name = "opensourcerelease/DeepSeek-R1-bf16"

 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, dtype="auto")

 block = model.model.layers
 device_map = {}
@@ -599,7 +599,7 @@ Supports 2, 4, and 8 bits. We recommend using intel-extension-for-pytorch (IPEX)
 from transformers import AutoModelForCausalLM, AutoTokenizer

 model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
@@ -615,7 +615,7 @@ Supports 4 bits only. We recommend using intel-extension-for-pytorch (IPEX) for
 from transformers import AutoModelForCausalLM, AutoTokenizer

 model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
@@ -630,7 +630,7 @@ Supports 2, 3, 4, and 8 bits. We recommend using GPTQModel for 4 and 8 bits infe
 from transformers import AutoModelForCausalLM, AutoTokenizer

 model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
@@ -670,7 +670,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig
 model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
 quantization_config = AutoRoundConfig(backend="ipex")
 model = AutoModelForCausalLM.from_pretrained(
-model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto"
+model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto"
 )
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
@@ -701,7 +701,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig
 model_name = "ybelkada/opt-125m-gptq-4bit"
 quantization_config = AutoRoundConfig()
 model = AutoModelForCausalLM.from_pretrained(
-model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto"
+model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto"
 )
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
test/test_cpu/test_act_quantization.py (5 additions, 5 deletions)
@@ -24,7 +24,7 @@ class TestAutoRoundAct(unittest.TestCase):
 def setUpClass(self):
 self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
 self.save_dir = "./saved"
-self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True)
 self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
 self.llm_dataloader = LLMDataLoader()

@@ -35,7 +35,7 @@ def tearDownClass(self):

 def test_mx_fp4(self):
 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 bits, group_size, sym = 4, 128, True
 autoround = AutoRound(
@@ -54,7 +54,7 @@ def test_mx_fp4(self):

 def test_wint4fp8_dynamic(self):
 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 bits, group_size = 4, 128
 autoround = AutoRound(
@@ -93,7 +93,7 @@ def test_wfp8afp8_static(self):
 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
 from auto_round.wrapper import WrapperWALayer

-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 autoround = AutoRound(
 model,
@@ -114,7 +114,7 @@ def test_wfp8afp8_static(self):
 self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], 30)

 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 autoround = AutoRound(
 model,
test/test_cpu/test_autoopt.py (1 addition, 1 deletion)
@@ -24,7 +24,7 @@ class TestAutoRound(unittest.TestCase):
 @classmethod
 def setUpClass(self):
 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
 self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 self.llm_dataloader = LLMDataLoader()
