README.md (1 addition, 1 deletion)
@@ -323,7 +323,7 @@ The support for Gaudi device is limited.
 from transformers import AutoModelForCausalLM, AutoTokenizer

 model_name = "Intel/DeepSeek-R1-0528-Qwen3-8B-int4-AutoRound"
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
auto_round/__main__.py (1 addition, 1 deletion)
@@ -701,7 +701,7 @@ def tune(args):
 logger.error("Cannot find correct gguf file for evaluation, please check.")
 sys.exit(-1)
 model = AutoModelForCausalLM.from_pretrained(
-eval_folder, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+eval_folder, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype
 )
 model.eval()
 tokenizer = AutoTokenizer.from_pretrained(eval_folder, gguf_file=gguf_file)
auto_round/compressors/diffusion/README.md (1 addition, 1 deletion)
@@ -15,7 +15,7 @@ from diffusers import AutoPipelineForText2Image

 # Load the model
 model_name = "black-forest-labs/FLUX.1-dev"
-pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+pipe = AutoPipelineForText2Image.from_pretrained(model_name, dtype=torch.bfloat16)

 # Quantize the model
 autoround = AutoRound(
auto_round/eval/eval_cli.py (2 additions, 2 deletions)
@@ -174,7 +174,7 @@ def eval(args):
 " but may affect accuracy."
 )
 model = AutoModelForCausalLM.from_pretrained(
-model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+model, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype
 )
 model.eval()
 st = time.time()
@@ -252,7 +252,7 @@ def eval_task_by_task(
 )

 model = AutoModelForCausalLM.from_pretrained(
-model, gguf_file=gguf_file, device_map="auto", torch_dtype=eval_model_dtype
+model, gguf_file=gguf_file, device_map="auto", dtype=eval_model_dtype
 )
 model.eval()
 parallelism = False
auto_round/experimental/kv_cache.py (4 additions, 4 deletions)
@@ -263,7 +263,7 @@ def prep_attention_module_for_calibration(module: torch.nn.Module):

 def normalize_static_kv_dtype(static_kv_dtype: Union[str, torch.dtype]) -> torch.dtype:
 valid_dtype_name_lst = ["float16", "bfloat16", "fp8", "float32", "float"]
-valid_torch_dtype = {
+valid_dtype = {
 "float16": torch.float16,
 "bfloat16": torch.bfloat16,
 "fp8": torch.float8_e4m3fn,
@@ -272,13 +272,13 @@ def normalize_static_kv_dtype(static_kv_dtype: Union[str, torch.dtype]) -> torch
 "float": torch.float32,  # Alias for float32
 }
 if static_kv_dtype in valid_dtype_name_lst:
-new_dtype = valid_torch_dtype[static_kv_dtype]
-elif static_kv_dtype in valid_torch_dtype.values():
+new_dtype = valid_dtype[static_kv_dtype]
+elif static_kv_dtype in valid_dtype.values():
 new_dtype = static_kv_dtype
 else:
 raise ValueError(
 f"Invalid static kv dtype: {static_kv_dtype}. "
-f"Valid options are: {', '.join(valid_dtype_name_lst + list(valid_torch_dtype.values()))}."
+f"Valid options are: {', '.join(valid_dtype_name_lst + list(valid_dtype.values()))}."
 )
 return new_dtype

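For context, the helper above accepts either a string name or an actual torch.dtype and returns a concrete torch.dtype, raising on anything else. A minimal usage sketch based on the mapping shown in the diff (the import path is taken from the file header and is otherwise untested here):

```python
import torch

from auto_round.experimental.kv_cache import normalize_static_kv_dtype

# String names resolve through the lookup table; "fp8" maps to float8_e4m3fn.
assert normalize_static_kv_dtype("bfloat16") is torch.bfloat16
assert normalize_static_kv_dtype("fp8") is torch.float8_e4m3fn
# torch.dtype values already present in the table pass through unchanged.
assert normalize_static_kv_dtype(torch.float16) is torch.float16
```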
auto_round/export/export_to_autoround/export_to_fp8.py (4 additions, 4 deletions)
@@ -105,10 +105,10 @@ def pack_layer(layer_name, model, data_type, device=None):
 weight = layer.weight
 weight, orig_shape, pad_len = reshape_pad_tensor_by_group_size(weight, layer.group_size)
 act_scale = layer.act_scale.view(-1) if hasattr(layer, "act_scale") else None
-torch_dtype = torch.float8_e4m3fn
+dtype = torch.float8_e4m3fn
 if "fp8_e5m2" in data_type:
-torch_dtype = torch.float8_e5m2
-info = torch.finfo(torch_dtype)
+dtype = torch.float8_e5m2
+info = torch.finfo(dtype)
 if zp is not None:
 if isinstance(zp, torch.Tensor):
 zp = zp.to(packing_device)
@@ -117,7 +117,7 @@ def pack_layer(layer_name, model, data_type, device=None):
 q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1)
 q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len)
 q_weight = torch.clamp(q_weight, info.min, info.max)
-q_weight = q_weight.to(torch_dtype)
+q_weight = q_weight.to(dtype)
 if type(layer) == torch.nn.Linear:
 in_features = layer.in_features
 out_features = layer.out_features
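The rename leaves the packing math untouched: weights are divided by a per-group scale, clamped to the finfo range of the chosen float8 format, and cast. A self-contained sketch of that core step (shapes, the per-row scale, and the helper name are illustrative, and it needs a PyTorch build with float8 dtypes):

```python
import torch

def quantize_to_fp8(weight: torch.Tensor, scale: torch.Tensor, data_type: str = "fp8") -> torch.Tensor:
    # Pick the float8 format the same way the pack_layer branch above does.
    dtype = torch.float8_e5m2 if "fp8_e5m2" in data_type else torch.float8_e4m3fn
    info = torch.finfo(dtype)
    q = weight / scale.unsqueeze(-1)        # scale per output row
    q = torch.clamp(q, info.min, info.max)  # keep values representable in fp8
    return q.to(dtype)

w = torch.randn(4, 8)
s = w.abs().amax(dim=-1) / torch.finfo(torch.float8_e4m3fn).max
print(quantize_to_fp8(w, s).dtype)  # torch.float8_e4m3fn
```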
auto_round/export/export_to_gguf/convert_hf_to_gguf.py (1 addition, 1 deletion)
@@ -172,7 +172,7 @@ def __init__(

 # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type
 if self.ftype == gguf.LlamaFileType.GUESSED:
-# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
+# NOTE: can't use field "dtype" in config.json, because some finetunes lie.
 _, first_tensor = next(self.get_tensors())
 if first_tensor.dtype == torch.float16:
 logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
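The comment rename only tracks the new config.json field name; the heuristic itself still ignores that field and inspects the dtype of the first tensor. A standalone sketch of the idea (the helper name and the bf16/f32 fallbacks are assumptions for illustration, not code from this converter):

```python
import torch

def guess_outtype(first_tensor: torch.Tensor) -> str:
    # Infer a GGUF output type from the first tensor rather than trusting the
    # "dtype" field in config.json, which some finetunes misreport.
    if first_tensor.dtype == torch.float16:
        return "f16"
    if first_tensor.dtype == torch.bfloat16:
        return "bf16"
    return "f32"

print(guess_outtype(torch.zeros(2, dtype=torch.float16)))  # f16
```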
(second fp8 pack_layer implementation; file header not captured in this view)
@@ -73,10 +73,10 @@ def pack_layer(layer_name: str, model: torch.nn.Module, data_type: str, device:
 weight = layer.weight
 weight, orig_shape, pad_len = reshape_pad_tensor_by_group_size(weight, layer.group_size)
 act_scale = layer.act_scale.view(-1) if hasattr(layer, "act_scale") else None
-torch_dtype = torch.float8_e4m3fn
+dtype = torch.float8_e4m3fn
 if "fp8_e5m2" in data_type:
-torch_dtype = torch.float8_e5m2
-info = torch.finfo(torch_dtype)
+dtype = torch.float8_e5m2
+info = torch.finfo(dtype)
 if zp is not None:
 if isinstance(zp, torch.Tensor):
 zp = zp.to(packing_device)
@@ -85,7 +85,7 @@ def pack_layer(layer_name: str, model: torch.nn.Module, data_type: str, device:
 q_weight = weight.to(packing_device) / scale.to(packing_device).unsqueeze(-1)
 q_weight = revert_tensor_by_pad(q_weight, orig_shape=orig_shape, pad_len=pad_len)
 q_weight = torch.clamp(q_weight, info.min, info.max)
-q_weight = q_weight.to(torch_dtype)
+q_weight = q_weight.to(dtype)
 if type(layer) == torch.nn.Linear:
 in_features = layer.in_features
 out_features = layer.out_features
auto_round/export/utils.py (1 addition, 1 deletion)
@@ -58,7 +58,7 @@ def save_model(
 if dtype is not None and dtype != model.dtype and os.path.exists(os.path.join(save_dir, "config.json")):
 with open(config_path, "r") as file:
 data = json.load(file)
-data["torch_dtype"] = str(dtype).split(".")[-1]
+data["dtype"] = str(dtype).split(".")[-1]
 with open(config_path, "w") as file:
 json.dump(data, file, indent=2)
 config_file = "quantization_config.json"
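When the export dtype differs from the model's dtype, save_model now rewrites the dtype entry (previously torch_dtype) in the exported config.json. A small standalone sketch of that rewrite (the path and dtype value are illustrative):

```python
import json
import torch

config_path = "saved_model/config.json"  # illustrative path
dtype = torch.bfloat16

with open(config_path, "r") as f:
    data = json.load(f)
data["dtype"] = str(dtype).split(".")[-1]  # "torch.bfloat16" -> "bfloat16"
with open(config_path, "w") as f:
    json.dump(data, f, indent=2)
```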
auto_round/inference/auto_quantizer.py (4 additions, 4 deletions)
@@ -329,10 +329,10 @@ def validate_environment(self, *args, **kwargs):
 "auto-round` or install from source"
 )

-def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
-if torch_dtype is None:
-torch_dtype = torch.bfloat16
-return torch_dtype
+def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
+if dtype is None:
+dtype = torch.bfloat16
+return dtype

 def post_init_model(self, model):
 """Post-initialization that require device information, for example buffers initialization on device.
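The renamed hook keeps its old default: when the caller does not request a dtype, the quantizer falls back to bfloat16. A tiny standalone illustration of that behavior (the helper name is made up for the example):

```python
import torch

def resolve_load_dtype(requested):
    # Mirrors update_dtype above: default to bfloat16 when nothing is requested.
    return torch.bfloat16 if requested is None else requested

assert resolve_load_dtype(None) is torch.bfloat16
assert resolve_load_dtype(torch.float16) is torch.float16
```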
auto_round/modelling/gpt_oss.py (1 addition, 1 deletion)
@@ -62,7 +62,7 @@ def __init__(self, config: GptOssConfig, original: GptOssMLP):
 super().__init__()
 hidden_size = config.hidden_size
 intermediate_size = config.intermediate_size
-dtype_str = getattr(config, "torch_dtype", None) or getattr(config, "dtype", None)
+dtype_str = getattr(config, "dtype", None)
 dtype = torch.bfloat16 if str(dtype_str).endswith("bfloat16") else torch.float32
 top_k = config.num_experts_per_tok
 self.hidden_size = hidden_size
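The simplified lookup assumes the config now exposes dtype directly, so the old torch_dtype fallback is dropped. A tiny sketch of the resolution with a stand-in config object (a real call receives a GptOssConfig):

```python
import torch
from types import SimpleNamespace

config = SimpleNamespace(dtype="bfloat16")  # stand-in for a GptOssConfig

dtype_str = getattr(config, "dtype", None)
dtype = torch.bfloat16 if str(dtype_str).endswith("bfloat16") else torch.float32
assert dtype is torch.bfloat16
```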
auto_round/utils/model.py (17 additions, 19 deletions)
@@ -195,9 +195,9 @@ def llm_load_model(
 )

 device_str, use_auto_mapping = get_device_and_parallelism(device)
-torch_dtype = "auto"
+dtype = "auto"
 if device_str is not None and "hpu" in device_str:
-torch_dtype = torch.bfloat16
+dtype = torch.bfloat16

 is_glm = bool(re.search("chatglm", pretrained_model_name_or_path.lower()))

@@ -210,7 +210,7 @@ def llm_load_model(
 if _use_hpu_compile_mode():
 model = model_cls.from_pretrained(
 pretrained_model_name_or_path,
-torch_dtype=torch_dtype,
+dtype=dtype,
 attn_implementation="eager",
 trust_remote_code=trust_remote_code,
 device_map="auto" if use_auto_mapping else None,
@@ -219,7 +219,7 @@ def llm_load_model(
 try:
 model = model_cls.from_pretrained(
 pretrained_model_name_or_path,
-torch_dtype=torch_dtype,
+dtype=dtype,
 trust_remote_code=trust_remote_code,
 device_map="auto" if use_auto_mapping else None,
 )
@@ -228,7 +228,7 @@ def llm_load_model(
 orig_func = set_fake_cuda_device_capability()
 model = model_cls.from_pretrained(
 pretrained_model_name_or_path,
-torch_dtype=torch_dtype,
+dtype=dtype,
 trust_remote_code=trust_remote_code,
 device_map="auto" if use_auto_mapping else None,
 )
@@ -241,7 +241,7 @@ def llm_load_model(
 logger.warning(f"fail to load {pretrained_model_name_or_path}, set trust_remote_code to False and retry.")
 model = model_cls.from_pretrained(
 pretrained_model_name_or_path,
-torch_dtype=torch_dtype,
+dtype=dtype,
 trust_remote_code=False,
 device_map="auto" if use_auto_mapping else None,
 )
@@ -256,7 +256,7 @@ def llm_load_model(
 def mllm_load_model(
 pretrained_model_name_or_path,
 device="cpu",
-torch_dtype="auto",
+dtype="auto",
 use_auto_mapping=True,
 trust_remote_code=True,
 model_dtype=None,
@@ -268,9 +268,9 @@ def mllm_load_model(
 from auto_round.utils.device import get_device_and_parallelism, set_fake_cuda_device_capability

 device_str, use_auto_mapping = get_device_and_parallelism(device)
-torch_dtype = "auto"
+dtype = "auto"
 if device_str is not None and "hpu" in device_str:
-torch_dtype = torch.bfloat16
+dtype = torch.bfloat16
 if os.path.isdir(pretrained_model_name_or_path):
 config = json.load(open(os.path.join(pretrained_model_name_or_path, "config.json")))
 else:
@@ -306,7 +306,7 @@ def mllm_load_model(
 model: DeepseekVLV2ForCausalLM = AutoModelForCausalLM.from_pretrained(
 pretrained_model_name_or_path,
 trust_remote_code=trust_remote_code,
-torch_dtype=torch_dtype,
+dtype=dtype,
 device_map="auto" if use_auto_mapping else None,
 )
 else:
@@ -318,7 +318,7 @@ def mllm_load_model(
 pretrained_model_name_or_path,
 model_base=None,
 model_name=pretrained_model_name_or_path,
-torch_dtype=torch_dtype,
+dtype=dtype,
 )
 else:
 if architectures.endswith("Model") and hasattr(
@@ -333,7 +333,7 @@ def mllm_load_model(
 model = cls.from_pretrained(
 pretrained_model_name_or_path,
 trust_remote_code=trust_remote_code,
-torch_dtype=torch_dtype,
+dtype=dtype,
 device_map="auto" if use_auto_mapping else None,
 )
 except ValueError as e:
@@ -342,7 +342,7 @@ def mllm_load_model(
 model = cls.from_pretrained(
 pretrained_model_name_or_path,
 trust_remote_code=trust_remote_code,
-torch_dtype=torch_dtype,
+dtype=dtype,
 device_map="auto" if use_auto_mapping else None,
 )
 torch.cuda.get_device_capability = orig_func
@@ -383,7 +383,7 @@ def mllm_load_model(
 def diffusion_load_model(
 pretrained_model_name_or_path: str,
 device: Union[str, torch.device] = "cpu",
-torch_dtype: Union[str, torch.dtype] = "auto",
+dtype: Union[str, torch.dtype] = "auto",
 use_auto_mapping: bool = False,
 trust_remote_code: bool = True,
 model_dtype: str = None,
@@ -393,15 +393,13 @@ def diffusion_load_model(
 from auto_round.utils.device import get_device_and_parallelism

 device_str, use_auto_mapping = get_device_and_parallelism(device)
-torch_dtype = "auto"
+dtype = "auto"
 if device_str is not None and "hpu" in device_str:
-torch_dtype = torch.bfloat16
+dtype = torch.bfloat16

 pipelines = LazyImport("diffusers.pipelines")

-pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained(
-pretrained_model_name_or_path, torch_dtype=torch_dtype
-)
+pipe = pipelines.auto_pipeline.AutoPipelineForText2Image.from_pretrained(pretrained_model_name_or_path, dtype=dtype)
 pipe = _to_model_dtype(pipe, model_dtype)
 model = pipe.transformer
 return pipe, model.to(device)
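All three loaders follow the same pattern after the rename: request dtype="auto" and override it to bfloat16 only when the resolved device string points at Gaudi/HPU. A condensed sketch of that selection (the helper name is illustrative, not part of the file):

```python
import torch

def pick_load_dtype(device_str):
    # "auto" lets transformers keep the checkpoint dtype; HPU prefers bfloat16.
    dtype = "auto"
    if device_str is not None and "hpu" in device_str:
        dtype = torch.bfloat16
    return dtype

print(pick_load_dtype("cpu"))    # auto
print(pick_load_dtype("hpu:0"))  # torch.bfloat16
```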
docs/step_by_step.md (6 additions, 6 deletions)
@@ -489,7 +489,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 model_name = "opensourcerelease/DeepSeek-R1-bf16"

 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, dtype="auto")

 block = model.model.layers
 device_map = {}
@@ -599,7 +599,7 @@ Supports 2, 4, and 8 bits. We recommend using intel-extension-for-pytorch (IPEX)
 from transformers import AutoModelForCausalLM, AutoTokenizer

 model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
@@ -615,7 +615,7 @@ Supports 4 bits only. We recommend using intel-extension-for-pytorch (IPEX) for
 from transformers import AutoModelForCausalLM, AutoTokenizer

 model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
@@ -630,7 +630,7 @@ Supports 2, 3, 4, and 8 bits. We recommend using GPTQModel for 4 and 8 bits infe
 from transformers import AutoModelForCausalLM, AutoTokenizer

 model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", dtype="auto")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
@@ -670,7 +670,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig
 model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
 quantization_config = AutoRoundConfig(backend="ipex")
 model = AutoModelForCausalLM.from_pretrained(
-model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto"
+model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto"
 )
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
@@ -701,7 +701,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig
 model_name = "ybelkada/opt-125m-gptq-4bit"
 quantization_config = AutoRoundConfig()
 model = AutoModelForCausalLM.from_pretrained(
-model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto"
+model_name, device_map="cpu", quantization_config=quantization_config, dtype="auto"
 )
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 text = "There is a girl who likes adventure,"
test/test_cpu/test_act_quantization.py (5 additions, 5 deletions)
@@ -24,7 +24,7 @@ class TestAutoRoundAct(unittest.TestCase):
 def setUpClass(self):
 self.model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
 self.save_dir = "./saved"
-self.model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
+self.model = AutoModelForCausalLM.from_pretrained(self.model_name, dtype="auto", trust_remote_code=True)
 self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
 self.llm_dataloader = LLMDataLoader()

@@ -35,7 +35,7 @@ def tearDownClass(self):

 def test_mx_fp4(self):
 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 bits, group_size, sym = 4, 128, True
 autoround = AutoRound(
@@ -54,7 +54,7 @@ def test_mx_fp4(self):

 def test_wint4fp8_dynamic(self):
 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 bits, group_size = 4, 128
 autoround = AutoRound(
@@ -93,7 +93,7 @@ def test_wfp8afp8_static(self):
 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
 from auto_round.wrapper import WrapperWALayer

-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 autoround = AutoRound(
 model,
@@ -114,7 +114,7 @@ def test_wfp8afp8_static(self):
 self.assertEqual(autoround.model.model.decoder.layers[2].self_attn.k_proj.orig_layer.act_max.shape[0], 30)

 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 autoround = AutoRound(
 model,
test/test_cpu/test_autoopt.py (1 addition, 1 deletion)
@@ -24,7 +24,7 @@ class TestAutoRound(unittest.TestCase):
 @classmethod
 def setUpClass(self):
 model_name = "/tf_dataset/auto_round/models/facebook/opt-125m"
-self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True)
+self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype="auto", trust_remote_code=True)
 self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 self.llm_dataloader = LLMDataLoader()
