
Commit 5fb2184

xin3he, Yantom1, ulivne, dudilester, and HolyFalafel authored
Cherry pick Habana software 1.18.0 update (#2025)
Signed-off-by: xinhe3 <xinhe3@habana.ai>
Signed-off-by: Yi Liu <yiliu4@habana.ai>
Signed-off-by: Sun, Xuehao <xuehao.sun@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: yan tomsinsky <ytomsinsky@habana.ai>
Co-authored-by: Uri Livne <ulivne@habana.ai>
Co-authored-by: Dudi Lester <dlester@habana.ai>
Co-authored-by: Danny <dsemiat@habana.ai>
Co-authored-by: Tomer Gafni <tgafni@habana.ai>
Co-authored-by: Eran Geva <egeva@habana.ai>
Co-authored-by: Daniel Ohayon <danielohayon444@gmail.com>
Co-authored-by: Roi Tiefenbrunn <rtiefenbrunn@habana.ai>
Co-authored-by: Kamil Felskowski <kfelskowskix@habana.ai>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent: d6149aa · commit: 5fb2184

File tree

67 files changed: +99756 −945 lines


.azure-pipelines/scripts/models/run_model_trigger_common.sh

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ elif [ "${mode}" == "tuning" ]; then
     cd ${WORK_SOURCE_DIR}/${model_src_dir}
     # for int4 models add "--accuracy" to run tuning after quantize
     if [[ "${model}" == *"int4"* ]]; then
-        sed -i "s|--quantize|--quantize --accuracy --int8|g" run_quant.sh
+        sed -i "s|--quantize|--quantize --accuracy --load|g" run_quant.sh
     fi

     $BOLD_YELLOW && echo "workspace ${WORK_SOURCE_DIR}/${model_src_dir}" && $RESET

.azure-pipelines/scripts/ut/3x/coverage.3x_pt

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ include =
     */neural_compressor/torch/*
 omit =
     */neural_compressor/torch/algorithms/fp8_quant/*
+    */neural_compressor/torch/algorithms/mixed_low_precision/*
     */neural_compressor/torch/amp/*
 exclude_lines =
     pragma: no cover

.azure-pipelines/scripts/ut/3x/coverage.3x_pt_fp8

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@ branch = True
 [report]
 include =
     */neural_compressor/torch/algorithms/fp8_quant/*
+    */neural_compressor/torch/algorithms/mixed_low_precision/*
 exclude_lines =
     pragma: no cover
     raise NotImplementedError

.azure-pipelines/scripts/ut/3x/run_3x_pt_fp8.sh

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,6 @@ sed -i '/^intel_extension_for_pytorch/d' /neural-compressor/test/3x/torch/requirements.txt
 sed -i '/^auto_round/d' /neural-compressor/test/3x/torch/requirements.txt
 cat /neural-compressor/test/3x/torch/requirements.txt
 pip install -r /neural-compressor/test/3x/torch/requirements.txt
-pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.16.0
 pip install pytest-cov
 pip install pytest-html
 pip install pytest-html-merger
@@ -27,6 +26,7 @@ pytest --cov="${inc_path}" -vs --disable-warnings --html=report_1.html --self-contained-html ...
 pytest --cov="${inc_path}" -vs --disable-warnings --html=report_2.html --self-contained-html torch/quantization/weight_only/test_rtn.py 2>&1 | tee -a ${ut_log_name}
 # pytest --cov="${inc_path}" -vs --disable-warnings --html=report_3.html --self-contained-html torch/quantization/weight_only/test_autoround.py 2>&1 | tee -a ${ut_log_name}
 pytest --cov="${inc_path}" -vs --disable-warnings --html=report_4.html --self-contained-html torch/quantization/fp8_quant 2>&1 | tee -a ${ut_log_name}
+pytest --cov="${inc_path}" -vs --disable-warnings --html=report_5.html --self-contained-html torch/algorithms/fp8_quant 2>&1 | tee -a ${ut_log_name}

 mkdir -p report && mv *.html report
 pytest_html_merger -i ./report -o ./report.html

.azure-pipelines/template/docker-template.yml

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ steps:

   - ${{ if eq(parameters.imageSource, 'pull') }}:
     - script: |
-        docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
+        docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
       displayName: "Pull habana docker image"

   - script: |
@@ -95,7 +95,7 @@ steps:
       else
         docker run -dit --disable-content-trust --privileged --name=${{ parameters.containerName }} --shm-size="2g" \
           --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host \
-          -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest
+          -v ${BUILD_SOURCESDIRECTORY}:/neural-compressor vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
       fi
       echo "Show the container list after docker run ... "
       docker ps -a

docs/source/3x/PT_FP8Quant.md

Lines changed: 1 addition & 10 deletions
@@ -20,15 +20,6 @@ Intel Neural Compressor provides general quantization APIs to leverage HPU FP8 c

 ## Supported Parameters

-<style type="text/css">
-.tg {border-collapse:collapse;border-spacing:0;}
-.tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
-  overflow:hidden;padding:10px 5px;word-break:normal;}
-.tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
-  font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
-.tg .tg-fymr{border-color:inherit;font-weight:bold;text-align:left;vertical-align:top}
-.tg .tg-0pky{border-color:inherit;text-align:left;vertical-align:top}
-</style>
 <table class="tg"><thead>
 <tr>
 <th class="tg-fymr">Attribute</th>
@@ -74,7 +65,7 @@ Intel Neural Compressor provides general quantization APIs to leverage HPU FP8 c
 <tr>
 <td class="tg-0pky">scale_method</td>
 <td class="tg-0pky">The method for calculating the scale from the measurement.</td>
-<td class="tg-0pky">- without_scale - Convert to/from FP8 without scaling.<br>- unit_scale - Always use scale of 1.<br>- maxabs_hw (default) - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then aligned to the corresponding HW accelerated scale.<br>- maxabs_pow2 - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then rounded to the power of 2.<br>- maxabs_hw_opt_weight - Scale of model params (weights) is chosen as the scale that provides minimal mean-square-error between quantized and non-quantized weights, from all possible HW accelerated scales. Scale of activations is calculated the same as maxabs_hw.<br>- act_maxabs_pow2_weights_pcs_opt_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_hw_opt_weight. Scale of activations is calculated the same as maxabs_pow2.<br>- act_maxabs_hw_weights_pcs_maxabs_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_pow2. Scale of activations is calculated the same as maxabs_hw.</td>
+<td class="tg-0pky">- unit_scale - Always use scale of 1.<br>- hw_aligned_single_scale - Always use scale that's aligned to the corresponding HW accelerated scale.<br>- maxabs_hw (default) - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then aligned to the corresponding HW accelerated scale.<br>- maxabs_pow2 - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then rounded to the power of 2.<br>- maxabs_hw_opt_weight - Scale of model params (weights) is chosen as the scale that provides minimal mean-square-error between quantized and non-quantized weights, from all possible HW accelerated scales. Scale of activations is calculated the same as maxabs_hw.<br>- act_maxabs_pow2_weights_pcs_opt_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_hw_opt_weight. Scale of activations is calculated the same as maxabs_pow2.<br>- act_maxabs_hw_weights_pcs_maxabs_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_pow2. Scale of activations is calculated the same as maxabs_hw.</td>
 </tr>
 <tr>
 <td class="tg-0pky">measure_exclude</td>
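
Note: the scale_method values documented above are consumed through the FP8Config API that PT_FP8Quant.md describes. The following is a minimal sketch of the measure-then-quantize flow, assuming a Gaudi (HPU) runtime; the keyword names (mode, observer, scale_method, dump_stats_path) mirror the doc's parameter table, so treat the exact signatures as assumptions rather than a verified recipe.

# Hypothetical sketch, not part of this commit: assumes the FP8Config /
# prepare / convert / finalize_calibration API documented in PT_FP8Quant.md
# and an HPU runtime; keyword names may differ between releases.
import torch
from neural_compressor.torch.quantization import (
    FP8Config, prepare, convert, finalize_calibration
)

model = torch.nn.Sequential(torch.nn.Linear(16, 16))  # stand-in model

# Step 1 (MEASURE): attach maxabs observers and run calibration data.
measure_cfg = FP8Config(mode="MEASURE", observer="maxabs",
                        dump_stats_path="./hqt_output/measure")
model = prepare(model, measure_cfg)
for _ in range(4):                       # tiny stand-in calibration loop
    model(torch.randn(2, 16))
finalize_calibration(model)              # flush measurement statistics

# Step 2 (QUANTIZE): convert using a scale_method from the table above,
# e.g. the default maxabs_hw (maxabs stretched to FP8 range, HW-aligned).
quant_cfg = FP8Config(mode="QUANTIZE", scale_method="maxabs_hw",
                      dump_stats_path="./hqt_output/measure")
model = convert(model, quant_cfg)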

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -11,3 +11,4 @@ neural-compressor
 lm_eval==0.4.3
 peft
 optimum-intel
+intel_extension_for_pytorch

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/smooth_quant/run_clm_no_trainer.py

Lines changed: 0 additions & 1 deletion
@@ -217,7 +217,6 @@ def eval_func(model):


 if args.load:
-    # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
     if args.int8 or args.int8_bf16_mixed:
         print("load int8 model")
         from neural_compressor.torch.quantization import load
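
The import in this hunk is the restore half of the save/load pair these example scripts use: the script saves the quantized model, and the new --load flag (see the run_model_trigger_common.sh change above) re-runs it so the saved model is reloaded for accuracy measurement. A minimal hypothetical sketch follows; the single-argument load(path) call and the saved_results path are illustrative assumptions, not taken from the diff.

# Hypothetical sketch, not part of this commit: restores a quantized model
# previously written with user_model.save(args.output_dir).
from neural_compressor.torch.quantization import load

saved_dir = "./saved_results"   # illustrative path, not from the diff
user_model = load(saved_dir)    # rebuild the quantized model (assumed signature)
user_model.eval()               # ready for the accuracy evaluation pass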

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@ einops
 neural-compressor
 lm_eval==0.4.3
 peft
+intel_extension_for_pytorch

examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/ipex/run_clm_no_trainer.py

Lines changed: 0 additions & 1 deletion
@@ -198,7 +198,6 @@ def run_fn(model):
     user_model.save(args.output_dir)

 if args.load:
-    # TODO: we need run_benchmark.sh for loading and remove --accuracy in run_quant.sh, currently run_quant.sh will get fp32 result
     if args.int8 or args.int8_bf16_mixed:
         print("load int8 model")
         from neural_compressor.torch.quantization import load
