diff --git a/.azure-pipelines/scripts/install_nc.sh b/.azure-pipelines/scripts/install_nc.sh
index 38f0ccf557e..c95ce2f9dd7 100644
--- a/.azure-pipelines/scripts/install_nc.sh
+++ b/.azure-pipelines/scripts/install_nc.sh
@@ -11,12 +11,13 @@ if [[ $1 = *"3x_pt"* ]]; then
else
echo -e "\n Install torch CPU ... "
pip install torch==2.7.0 torchvision --index-url https://download.pytorch.org/whl/cpu
- python -m pip install intel-extension-for-pytorch==2.7.0 oneccl_bind_pt==2.7.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+ python -m pip install intel-extension-for-pytorch==2.7.0 oneccl_bind_pt==2.7.0 --index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
python -m pip install --no-cache-dir -r requirements.txt
python setup.py bdist_wheel
fi
pip install --no-deps dist/neural_compressor*.whl --force-reinstall
elif [[ $1 = *"3x_tf"* ]]; then
+ pip install tensorflow==2.19.0
python -m pip install --no-cache-dir -r requirements.txt
python -m pip install --no-cache-dir -r requirements_tf.txt
python setup.py bdist_wheel
diff --git a/.azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh b/.azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh
index 5fbab435634..5f5f2136c64 100644
--- a/.azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh
+++ b/.azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh
@@ -12,7 +12,6 @@ echo "##[section]import check pass"
# install requirements
echo "##[group]set up UT env..."
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
-export PT_HPU_LAZY_MODE=1
sed -i '/^intel_extension_for_pytorch/d' /neural-compressor/test/3x/torch/requirements.txt
sed -i '/^auto_round/d' /neural-compressor/test/3x/torch/requirements.txt
cat /neural-compressor/test/3x/torch/requirements.txt
diff --git a/.azure-pipelines/template/docker-template.yml b/.azure-pipelines/template/docker-template.yml
index da197c0b20a..a7625fc4d60 100644
--- a/.azure-pipelines/template/docker-template.yml
+++ b/.azure-pipelines/template/docker-template.yml
@@ -74,7 +74,7 @@ steps:
- ${{ if eq(parameters.imageSource, 'pull') }}:
- script: |
- docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+ docker pull vault.habana.ai/gaudi-docker/1.22.0/ubuntu24.04/habanalabs/pytorch-installer-2.7.1:latest
displayName: "Pull habana docker image"
- script: |
@@ -95,7 +95,7 @@ steps:
else
docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \
--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \
- -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+ -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.22.0/ubuntu24.04/habanalabs/pytorch-installer-2.7.1:latest
docker exec ${{ parameters.containerName }} bash -c "ln -sf \$(which python3) /usr/bin/python"
fi
echo "Show the container list after docker run ... "
diff --git a/.azure-pipelines/ut-3x-pt-fp8.yml b/.azure-pipelines/ut-3x-pt-fp8.yml
index 74f5a877496..47d18fab624 100644
--- a/.azure-pipelines/ut-3x-pt-fp8.yml
+++ b/.azure-pipelines/ut-3x-pt-fp8.yml
@@ -39,6 +39,7 @@ stages:
jobs:
- job:
displayName: Torch 3x Habana FP8
+ timeoutInMinutes: 120
steps:
- template: template/ut-template.yml
parameters:
@@ -54,6 +55,7 @@ stages:
jobs:
- job:
displayName: Torch 3x Habana FP8 baseline
+ timeoutInMinutes: 120
continueOnError: true
steps:
- template: template/ut-template.yml
diff --git a/README.md b/README.md
index 5e54c01fe32..362f47676cf 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Intel® Neural Compressor
An open-source Python library supporting popular model compression techniques on all mainstream deep learning frameworks (TensorFlow, PyTorch, and ONNX Runtime)
[](https://github.com/intel/neural-compressor)
-[](https://github.com/intel/neural-compressor/releases)
+[](https://github.com/intel/neural-compressor/releases)
[](https://github.com/intel/neural-compressor/blob/master/LICENSE)
[](https://github.com/intel/neural-compressor)
[](https://pepy.tech/project/neural-compressor)
@@ -56,7 +56,7 @@ To try on Intel Gaudi2, docker image with Gaudi Software Stack is recommended, p
Run a container with an interactive shell, [more info](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html#docker-installation)
```
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu24.04/habanalabs/pytorch-installer-2.6.0:latest
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.22.0/ubuntu24.04/habanalabs/pytorch-installer-2.7.1:latest
```
> Note: Starting with Habana software 1.21.0, `PT_HPU_LAZY_MODE=0` is the default. However, most low-precision functions (such as `convert_from_uint4`) do not yet support this mode, so we recommend setting `PT_HPU_LAZY_MODE=1` to maintain compatibility.
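The note above only names the environment variable; below is a minimal sketch of applying it from Python, under the assumption that `PT_HPU_LAZY_MODE` is read when the Habana PyTorch bridge is first imported (setting it via `export` in the shell before launching works equally well).

```
import os

# Keep Gaudi lazy mode enabled (see the note above); the variable must be in the
# environment before the Habana PyTorch bridge is imported, so set it first.
os.environ.setdefault("PT_HPU_LAZY_MODE", "1")

import torch                                   # noqa: E402
import habana_frameworks.torch.core as htcore  # noqa: E402,F401  Habana PyTorch bridge
```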
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/requirements_cpu_woq.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/requirements_cpu_woq.txt
index fe584f6fc27..417511035ba 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/requirements_cpu_woq.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/requirements_cpu_woq.txt
@@ -16,5 +16,5 @@ huggingface_hub
numba
tbb
intel-extension-for-pytorch==2.4.0
---extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+--index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
oneccl_bind_pt==2.4.0
diff --git a/neural_compressor/torch/algorithms/weight_only/save_load.py b/neural_compressor/torch/algorithms/weight_only/save_load.py
index 3c9927734cc..b51af6701a7 100644
--- a/neural_compressor/torch/algorithms/weight_only/save_load.py
+++ b/neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -937,7 +937,10 @@ def _init_hf_model(self, model_class, config):
else: # pragma: no cover
assert False, f'`torch_dtype` can be either `torch.dtype` or `"auto"`, but received {torch_dtype}'
- dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
+ if parse(transformers.__version__) >= parse("4.56.0"):
+ dtype_orig = model_class._set_default_dtype(torch_dtype)
+ else:
+ dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
init_contexts = (
[no_init_weights(_enable=_fast_init)]
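The same 4.56 version gate is repeated verbatim in `modeling_auto.py` below; a small shared helper could express the dispatch once. A minimal sketch, assuming the only relevant change in transformers 4.56 is the rename of the private `_set_default_torch_dtype` helper to `_set_default_dtype` (the helper name `_set_default_dtype_compat` is hypothetical, not part of this PR):

```
from packaging.version import parse

import transformers


def _set_default_dtype_compat(model_class, torch_dtype):
    """Call whichever default-dtype setter the installed transformers provides."""
    # transformers >= 4.56 renamed the private helper; older releases keep the old name.
    if parse(transformers.__version__) >= parse("4.56.0"):
        return model_class._set_default_dtype(torch_dtype)
    return model_class._set_default_torch_dtype(torch_dtype)
```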
diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index a484824d39c..0f452f10746 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -38,7 +38,7 @@
if is_transformers_imported():
import transformers
- SUPPORTED_LAYERS = [nn.Linear, transformers.modeling_utils.Conv1D]
+ SUPPORTED_LAYERS = [nn.Linear, transformers.pytorch_utils.Conv1D]
else:
SUPPORTED_LAYERS = [nn.Conv1d, nn.Linear]
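The patch switches this call site (and the one in `transformers/quantization/utils.py` below) unconditionally to `transformers.pytorch_utils.Conv1D`. If very old transformers releases still had to be supported, a guarded import is one alternative; a sketch only, not what this PR does:

```
from torch import nn

try:
    # Current transformers releases expose the GPT-2 style Conv1D here.
    from transformers.pytorch_utils import Conv1D
except ImportError:  # pragma: no cover
    # Very old releases kept it in modeling_utils.
    from transformers.modeling_utils import Conv1D

SUPPORTED_LAYERS = [nn.Linear, Conv1D]
```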
diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py
index dd17dfe580a..59de626cfaf 100644
--- a/neural_compressor/transformers/models/modeling_auto.py
+++ b/neural_compressor/transformers/models/modeling_auto.py
@@ -393,7 +393,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
# index of the files.
is_sharded = False
sharded_metadata = None
- if transformers.__version__ >= "4.50":
+ if parse(transformers.__version__) >= parse("4.50"):
from transformers.modeling_utils import _get_resolved_checkpoint_files
gguf_file = kwargs.pop("gguf_file", None)
@@ -635,8 +635,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
torch_dtype = torch.float32
else:
assert False, f'`torch_dtype` can be either `torch.dtype` or `"auto"`, but received {torch_dtype}'
-
- dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
+ if parse(transformers.__version__) >= parse("4.56.0"):
+ dtype_orig = model_class._set_default_dtype(torch_dtype)
+ else:
+ dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
if quantization_config.compute_dtype is None:
if use_xpu:
quantization_config.compute_dtype = (
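The first hunk in this file also replaces a plain string comparison of `transformers.__version__` with `packaging.version.parse`. String comparison is lexicographic and gives the wrong answer as soon as version components differ in digit count, which a quick check illustrates:

```
from packaging.version import parse

# Lexicographic comparison: '1' < '5', so "4.100.0" sorts before "4.50".
print("4.100.0" >= "4.50")                  # False -- wrong for a version check
print(parse("4.100.0") >= parse("4.50"))    # True  -- correct semantic ordering
```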
diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py
index 80be5b3764f..ced7ba174cd 100644
--- a/neural_compressor/transformers/quantization/utils.py
+++ b/neural_compressor/transformers/quantization/utils.py
@@ -579,7 +579,7 @@ def set_nontext_module_config(model, to_quant_block_names, config):
set_nontext_module_config(model, to_quant_block_names, config)
for n, m in model.named_modules():
- if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
+ if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.pytorch_utils.Conv1D):
if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
config.modules_to_not_convert.append(n)
print(
diff --git a/test/3x/torch/algorithms/pt2e_quant/test_pt2e_w8a8.py b/test/3x/torch/algorithms/pt2e_quant/test_pt2e_w8a8.py
index 408c7aeb841..5df612dd1b6 100644
--- a/test/3x/torch/algorithms/pt2e_quant/test_pt2e_w8a8.py
+++ b/test/3x/torch/algorithms/pt2e_quant/test_pt2e_w8a8.py
@@ -75,24 +75,18 @@ def test_quantizer_on_llm(self):
model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name)
+ model_config = model.config
tokenizer = AutoTokenizer.from_pretrained(model_name)
- input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
+ inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# example_inputs = (input_ids,)
# model = export_model_for_pt2e_quant(model, example_inputs=example_inputs)
+ attention_mask = inputs.attention_mask
+ input_ids = inputs.input_ids
+
+ from transformers.integrations.executorch import export_with_dynamic_cache
from transformers import DynamicCache
- example_inputs = {
- "input_ids": input_ids,
- "attention_mask": None,
- "past_key_values": DynamicCache(),
- "use_cache": True,
- }
- with torch.no_grad():
- ep = torch.export.export_for_training(
- model,
- (),
- example_inputs,
- strict=False,
- )
+ ep = export_with_dynamic_cache(model, input_ids, attention_mask)
model = ep.module()
model._exported = True
@@ -102,7 +96,12 @@ def test_quantizer_on_llm(self):
prepare_model = w8a8_static_quantizer.prepare(model)
# calibrate
for i in range(2):
- prepare_model(**example_inputs)
+ prepare_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ past_key_values=DynamicCache(config=model_config),
+ use_cache=True,
+ )
# convert
converted_model = w8a8_static_quantizer.convert(prepare_model)
# inference
@@ -110,7 +109,11 @@ def test_quantizer_on_llm(self):
config.freezing = True
opt_model = torch.compile(converted_model)
- out = opt_model(**example_inputs)
+ out = opt_model(input_ids=input_ids,
+ attention_mask=attention_mask,
+ past_key_values=DynamicCache(config=model_config),
+ use_cache=True,
+ )
assert out.logits is not None
@patch("neural_compressor.torch.algorithms.pt2e_quant.core.logger.error")
diff --git a/test/3x/torch/quantization/test_pt2e_quant.py b/test/3x/torch/quantization/test_pt2e_quant.py
index 7bce6b89066..ab426a5a2c7 100644
--- a/test/3x/torch/quantization/test_pt2e_quant.py
+++ b/test/3x/torch/quantization/test_pt2e_quant.py
@@ -206,23 +206,19 @@ def test_prepare_and_convert_on_llm(self, force_not_import_ipex):
model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
- input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
+ model_config = model.config
+ inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# example_inputs = (input_ids,)
- # model = export(model, example_inputs=example_inputs)
+ # model = export_model_for_pt2e_quant(model, example_inputs=example_inputs)
+ attention_mask = inputs.attention_mask
+ input_ids = inputs.input_ids
+
+ from transformers.integrations.executorch import export_with_dynamic_cache
from transformers import DynamicCache
- example_inputs = {
- "input_ids": input_ids,
- "attention_mask": None,
- "past_key_values": DynamicCache(),
- "use_cache": True,
- }
- with torch.no_grad():
- ep = torch.export.export_for_training(
- model,
- (),
- example_inputs,
- strict=False,
- )
+ ep = export_with_dynamic_cache(model, input_ids, attention_mask)
model = ep.module()
model._exported = True
model.dynamic_shapes = None
@@ -232,7 +228,12 @@ def test_prepare_and_convert_on_llm(self, force_not_import_ipex):
prepare_model = prepare(model, quant_config)
# calibrate
for i in range(2):
- prepare_model(**example_inputs)
+ prepare_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ past_key_values=DynamicCache(config=model_config),
+ use_cache=True,
+ )
# convert
converted_model = convert(prepare_model)
# inference
@@ -240,7 +241,12 @@ def test_prepare_and_convert_on_llm(self, force_not_import_ipex):
config.freezing = True
opt_model = torch.compile(converted_model)
- out = opt_model(**example_inputs)
+ out = opt_model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ past_key_values=DynamicCache(config=model_config),
+ use_cache=True,
+ )
assert out.logits is not None
@staticmethod
diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt
index 16b7e508083..b17193fe980 100644
--- a/test/3x/torch/requirements.txt
+++ b/test/3x/torch/requirements.txt
@@ -1,6 +1,6 @@
auto_round
datasets
-deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.21.0
+deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.22.0
expecttest
intel_extension_for_pytorch
numpy