diff --git a/.azure-pipelines/scripts/install_nc.sh b/.azure-pipelines/scripts/install_nc.sh
index 38f0ccf557e..c95ce2f9dd7 100644
--- a/.azure-pipelines/scripts/install_nc.sh
+++ b/.azure-pipelines/scripts/install_nc.sh
@@ -11,12 +11,13 @@ if [[ $1 = *"3x_pt"* ]]; then
     else
         echo -e "\n Install torch CPU ... "
         pip install torch==2.7.0 torchvision --index-url https://download.pytorch.org/whl/cpu
-        python -m pip install intel-extension-for-pytorch==2.7.0 oneccl_bind_pt==2.7.0 --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+        python -m pip install intel-extension-for-pytorch==2.7.0 oneccl_bind_pt==2.7.0 --index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
         python -m pip install --no-cache-dir -r requirements.txt
         python setup.py bdist_wheel
     fi
     pip install --no-deps dist/neural_compressor*.whl --force-reinstall
 elif [[ $1 = *"3x_tf"* ]]; then
+    pip install tensorflow==2.19.0
     python -m pip install --no-cache-dir -r requirements.txt
     python -m pip install --no-cache-dir -r requirements_tf.txt
     python setup.py bdist_wheel
diff --git a/.azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh b/.azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh
index 5fbab435634..5f5f2136c64 100644
--- a/.azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh
+++ b/.azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh
@@ -12,7 +12,6 @@ echo "##[section]import check pass"
 # install requirements
 echo "##[group]set up UT env..."
 export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
-export PT_HPU_LAZY_MODE=1
 sed -i '/^intel_extension_for_pytorch/d' /neural-compressor/test/3x/torch/requirements.txt
 sed -i '/^auto_round/d' /neural-compressor/test/3x/torch/requirements.txt
 cat /neural-compressor/test/3x/torch/requirements.txt
diff --git a/.azure-pipelines/template/docker-template.yml b/.azure-pipelines/template/docker-template.yml
index da197c0b20a..a7625fc4d60 100644
--- a/.azure-pipelines/template/docker-template.yml
+++ b/.azure-pipelines/template/docker-template.yml
@@ -74,7 +74,7 @@ steps:
   - ${{ if eq(parameters.imageSource, 'pull') }}:
       - script: |
-          docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+          docker pull vault.habana.ai/gaudi-docker/1.22.0/ubuntu24.04/habanalabs/pytorch-installer-2.7.1:latest
        displayName: "Pull habana docker image"

  - script: |
@@ -95,7 +95,7 @@ steps:
      else
        docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \
        --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \
-       -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
+       -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.22.0/ubuntu24.04/habanalabs/pytorch-installer-2.7.1:latest
        docker exec ${{ parameters.containerName }} bash -c "ln -sf \$(which python3) /usr/bin/python"
      fi
      echo "Show the container list after docker run ... "
diff --git a/.azure-pipelines/ut-3x-pt-fp8.yml b/.azure-pipelines/ut-3x-pt-fp8.yml
index 74f5a877496..47d18fab624 100644
--- a/.azure-pipelines/ut-3x-pt-fp8.yml
+++ b/.azure-pipelines/ut-3x-pt-fp8.yml
@@ -39,6 +39,7 @@ stages:
     jobs:
       - job:
           displayName: Torch 3x Habana FP8
+          timeoutInMinutes: 120
           steps:
             - template: template/ut-template.yml
               parameters:
@@ -54,6 +55,7 @@ stages:
     jobs:
       - job:
           displayName: Torch 3x Habana FP8 baseline
+          timeoutInMinutes: 120
           continueOnError: true
           steps:
             - template: template/ut-template.yml
diff --git a/README.md b/README.md
index 5e54c01fe32..362f47676cf 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Intel® Neural Compressor

An open-source Python library supporting popular model compression techniques on all mainstream deep learning frameworks (TensorFlow, PyTorch, and ONNX Runtime)

[![python](https://img.shields.io/badge/python-3.8%2B-blue)](https://github.com/intel/neural-compressor)
-[![version](https://img.shields.io/badge/release-3.4.1-green)](https://github.com/intel/neural-compressor/releases)
+[![version](https://img.shields.io/badge/release-3.5-green)](https://github.com/intel/neural-compressor/releases)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/intel/neural-compressor/blob/master/LICENSE)
[![coverage](https://img.shields.io/badge/coverage-85%25-green)](https://github.com/intel/neural-compressor)
[![Downloads](https://static.pepy.tech/personalized-badge/neural-compressor?period=total&units=international_system&left_color=grey&right_color=green&left_text=downloads)](https://pepy.tech/project/neural-compressor)
@@ -56,7 +56,7 @@ To try on Intel Gaudi2, docker image with Gaudi Software Stack is recommended, p
Run a container with an interactive shell, [more info](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html#docker-installation)
```
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu24.04/habanalabs/pytorch-installer-2.6.0:latest
+docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.22.0/ubuntu24.04/habanalabs/pytorch-installer-2.7.1:latest
```
> Note: Since Habana software >= 1.21.0, `PT_HPU_LAZY_MODE=0` is the default setting. However, most low-precision functions (such as `convert_from_uint4`) do not support this setting. Therefore, we recommend setting `PT_HPU_LAZY_MODE=1` to maintain compatibility.
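The README note above still recommends `PT_HPU_LAZY_MODE=1` on Gaudi even though the CI script no longer exports it. A minimal sketch of pinning the variable from Python before any PyTorch/Habana imports (assumption: the variable is read when the Habana bridge initializes, so it has to be set before the imports happen):

```python
# Sketch: opt back into lazy mode before importing torch / Habana modules,
# matching the README recommendation above. Assumes an HPU environment;
# setdefault keeps any value already exported by the launcher.
import os

os.environ.setdefault("PT_HPU_LAZY_MODE", "1")

import torch  # import frameworks only after the variable is set
```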
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/requirements_cpu_woq.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/requirements_cpu_woq.txt
index fe584f6fc27..417511035ba 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/requirements_cpu_woq.txt
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/transformers/weight_only/text-generation/requirements_cpu_woq.txt
@@ -16,5 +16,5 @@ huggingface_hub
 numba
 tbb
 intel-extension-for-pytorch==2.4.0
---extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+--index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
 oneccl_bind_pt==2.4.0
diff --git a/neural_compressor/torch/algorithms/weight_only/save_load.py b/neural_compressor/torch/algorithms/weight_only/save_load.py
index 3c9927734cc..b51af6701a7 100644
--- a/neural_compressor/torch/algorithms/weight_only/save_load.py
+++ b/neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -937,7 +937,10 @@ def _init_hf_model(self, model_class, config):
            else:  # pragma: no cover
                assert False, f'`torch_dtype` can be either `torch.dtype` or `"auto"`, but received {torch_dtype}'

-            dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
+            if parse(transformers.__version__) >= parse("4.56.0"):
+                dtype_orig = model_class._set_default_dtype(torch_dtype)
+            else:
+                dtype_orig = model_class._set_default_torch_dtype(torch_dtype)

        init_contexts = (
            [no_init_weights(_enable=_fast_init)]
diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index a484824d39c..0f452f10746 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -38,7 +38,7 @@
 if is_transformers_imported():
     import transformers

-    SUPPORTED_LAYERS = [nn.Linear, transformers.modeling_utils.Conv1D]
+    SUPPORTED_LAYERS = [nn.Linear, transformers.pytorch_utils.Conv1D]
 else:
     SUPPORTED_LAYERS = [nn.Conv1d, nn.Linear]
diff --git a/neural_compressor/transformers/models/modeling_auto.py b/neural_compressor/transformers/models/modeling_auto.py
index dd17dfe580a..59de626cfaf 100644
--- a/neural_compressor/transformers/models/modeling_auto.py
+++ b/neural_compressor/transformers/models/modeling_auto.py
@@ -393,7 +393,7 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        # index of the files.
        is_sharded = False
        sharded_metadata = None
-        if transformers.__version__ >= "4.50":
+        if parse(transformers.__version__) >= parse("4.50"):
            from transformers.modeling_utils import _get_resolved_checkpoint_files

            gguf_file = kwargs.pop("gguf_file", None)
@@ -635,8 +635,10 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                torch_dtype = torch.float32
            else:
                assert False, f'`torch_dtype` can be either `torch.dtype` or `"auto"`, but received {torch_dtype}'
-
-        dtype_orig = model_class._set_default_torch_dtype(torch_dtype)
+        if parse(transformers.__version__) >= parse("4.56.0"):
+            dtype_orig = model_class._set_default_dtype(torch_dtype)
+        else:
+            dtype_orig = model_class._set_default_torch_dtype(torch_dtype)

        if quantization_config.compute_dtype is None:
            if use_xpu:
                quantization_config.compute_dtype = (
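The `load_low_bit` hunk above also replaces a plain string comparison of `transformers.__version__` with `packaging`'s `parse`, which keeps the `"4.50"` check correct when lexicographic ordering disagrees with semantic version ordering. A short illustration of the difference (hypothetical version strings, not tied to this PR):

```python
# Why the version check uses packaging's parse() instead of comparing strings:
# lexicographic ordering puts "4.9" after "4.50", which is wrong semantically.
from packaging.version import parse

assert "4.9" > "4.50"                # string comparison: misleading result
assert parse("4.9") < parse("4.50")  # parsed comparison: correct ordering
assert parse("4.56.0") >= parse("4.50")
```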
diff --git a/neural_compressor/transformers/quantization/utils.py b/neural_compressor/transformers/quantization/utils.py
index 80be5b3764f..ced7ba174cd 100644
--- a/neural_compressor/transformers/quantization/utils.py
+++ b/neural_compressor/transformers/quantization/utils.py
@@ -579,7 +579,7 @@ def set_nontext_module_config(model, to_quant_block_names, config):
    set_nontext_module_config(model, to_quant_block_names, config)

    for n, m in model.named_modules():
-        if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
+        if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.pytorch_utils.Conv1D):
            if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
                config.modules_to_not_convert.append(n)
                print(
diff --git a/test/3x/torch/algorithms/pt2e_quant/test_pt2e_w8a8.py b/test/3x/torch/algorithms/pt2e_quant/test_pt2e_w8a8.py
index 408c7aeb841..5df612dd1b6 100644
--- a/test/3x/torch/algorithms/pt2e_quant/test_pt2e_w8a8.py
+++ b/test/3x/torch/algorithms/pt2e_quant/test_pt2e_w8a8.py
@@ -75,24 +75,18 @@ def test_quantizer_on_llm(self):

        model_name = "facebook/opt-125m"
        model = AutoModelForCausalLM.from_pretrained(model_name)
+        model_config = model.config
        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
+        inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        # example_inputs = (input_ids,)
        # model = export_model_for_pt2e_quant(model, example_inputs=example_inputs)
+        attention_mask = inputs.attention_mask
+        input_ids = inputs.input_ids
+
+
+        from transformers.integrations.executorch import export_with_dynamic_cache
        from transformers import DynamicCache
-        example_inputs = {
-            "input_ids": input_ids,
-            "attention_mask": None,
-            "past_key_values": DynamicCache(),
-            "use_cache": True,
-        }
-        with torch.no_grad():
-            ep = torch.export.export_for_training(
-                model,
-                (),
-                example_inputs,
-                strict=False,
-            )
+        ep = export_with_dynamic_cache(model, input_ids, attention_mask)
        model = ep.module()
        model._exported = True

@@ -102,7 +96,12 @@ def test_quantizer_on_llm(self):
        prepare_model = w8a8_static_quantizer.prepare(model)
        # calibrate
        for i in range(2):
-            prepare_model(**example_inputs)
+            prepare_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                past_key_values=DynamicCache(config=model_config),
+                use_cache=True,
+            )
        # convert
        converted_model = w8a8_static_quantizer.convert(prepare_model)
        # inference
@@ -110,7 +109,11 @@

        config.freezing = True
        opt_model = torch.compile(converted_model)
-        out = opt_model(**example_inputs)
+        out = opt_model(input_ids=input_ids,
+                        attention_mask=attention_mask,
+                        past_key_values=DynamicCache(config=model_config),
+                        use_cache=True,
+                        )
        assert out.logits is not None

    @patch("neural_compressor.torch.algorithms.pt2e_quant.core.logger.error")
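The reworked `test_pt2e_w8a8.py` above exports the OPT model through `transformers.integrations.executorch.export_with_dynamic_cache` and then calls the exported module with keyword arguments, passing a fresh `DynamicCache` built from the model config on every call. A condensed sketch of that call pattern with the quantizer steps left out (assumes a transformers release that ships the executorch integration used above):

```python
# Condensed sketch of the export + keyword-argument call pattern used in the
# updated tests; the prepare/convert quantization steps are omitted.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache
from transformers.integrations.executorch import export_with_dynamic_cache

model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Export with a dynamic KV cache, then run the exported module with an
# explicit DynamicCache, as the calibration loop above does.
ep = export_with_dynamic_cache(model, inputs.input_ids, inputs.attention_mask)
exported = ep.module()

with torch.no_grad():
    out = exported(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        past_key_values=DynamicCache(config=model.config),
        use_cache=True,
    )
print(out.logits.shape)
```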
diff --git a/test/3x/torch/quantization/test_pt2e_quant.py b/test/3x/torch/quantization/test_pt2e_quant.py
index 7bce6b89066..ab426a5a2c7 100644
--- a/test/3x/torch/quantization/test_pt2e_quant.py
+++ b/test/3x/torch/quantization/test_pt2e_quant.py
@@ -206,23 +206,19 @@ def test_prepare_and_convert_on_llm(self, force_not_import_ipex):
        model_name = "facebook/opt-125m"
        model = AutoModelForCausalLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
+        model = AutoModelForCausalLM.from_pretrained(model_name)
+        model_config = model.config
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        # example_inputs = (input_ids,)
-        # model = export(model, example_inputs=example_inputs)
+        # model = export_model_for_pt2e_quant(model, example_inputs=example_inputs)
+        attention_mask = inputs.attention_mask
+        input_ids = inputs.input_ids
+
+
+        from transformers.integrations.executorch import export_with_dynamic_cache
        from transformers import DynamicCache
-        example_inputs = {
-            "input_ids": input_ids,
-            "attention_mask": None,
-            "past_key_values": DynamicCache(),
-            "use_cache": True,
-        }
-        with torch.no_grad():
-            ep = torch.export.export_for_training(
-                model,
-                (),
-                example_inputs,
-                strict=False,
-            )
+        ep = export_with_dynamic_cache(model, input_ids, attention_mask)
        model = ep.module()
        model._exported = True
        model.dynamic_shapes = None
@@ -232,7 +228,12 @@ def test_prepare_and_convert_on_llm(self, force_not_import_ipex):
        prepare_model = prepare(model, quant_config)
        # calibrate
        for i in range(2):
-            prepare_model(**example_inputs)
+            prepare_model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                past_key_values=DynamicCache(config=model_config),
+                use_cache=True,
+            )
        # convert
        converted_model = convert(prepare_model)
        # inference
@@ -240,7 +241,12 @@

        config.freezing = True
        opt_model = torch.compile(converted_model)
-        out = opt_model(**example_inputs)
+        out = opt_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=DynamicCache(config=model_config),
+            use_cache=True,
+        )
        assert out.logits is not None

    @staticmethod
diff --git a/test/3x/torch/requirements.txt b/test/3x/torch/requirements.txt
index 16b7e508083..b17193fe980 100644
--- a/test/3x/torch/requirements.txt
+++ b/test/3x/torch/requirements.txt
@@ -1,6 +1,6 @@
 auto_round
 datasets
-deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.21.0
+deepspeed @ git+https://github.com/HabanaAI/DeepSpeed.git@1.22.0
 expecttest
 intel_extension_for_pytorch
 numpy
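Earlier hunks in this patch (`neural_compressor/torch/utils/utility.py` and `neural_compressor/transformers/quantization/utils.py`) take `Conv1D` from `transformers.pytorch_utils`, where it is defined in current transformers releases, and feed it into a 32-alignment screen for `modules_to_not_convert`. A toy sketch of that screen, using made-up module sizes purely for illustration:

```python
# Sketch: collect Linear / Conv1D submodules whose weight shapes are not
# 32-aligned, mirroring the modules_to_not_convert screen in this patch.
# The toy model and sizes are illustrative only.
import torch
from transformers.pytorch_utils import Conv1D

toy = torch.nn.Sequential(torch.nn.Linear(70, 64), Conv1D(64, 64))

not_32_aligned = []
for name, module in toy.named_modules():
    if isinstance(module, (torch.nn.Linear, Conv1D)):
        if module.weight.shape[0] % 32 != 0 or module.weight.shape[1] % 32 != 0:
            not_32_aligned.append(name)

print(not_32_aligned)  # ['0'] -- the 70-wide Linear would be excluded from conversion
```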