* [SW-184941] INC CI, CD and Promotion
Change-Id: I60c420f9776e1bdab7bb9e02e5bcbdb6891bfe52
* [SW-183320] updated setup.py
Change-Id: I592af89486cb1d9e0b5197521c428920197a9103
* [SW-177474] add HQT FP8 porting code
Change-Id: I4676f13a5ed43c444f2ec68675cc41335e7234dd
Signed-off-by: Zhou Yuwen <zyuwen@habana.ai>
* [SW-189361] Fix white list extend
Change-Id: Ic2021c248798fce37710d28014a6d59259c868a3
* [SW-191317] Raise exception according to hqt config object
Change-Id: I06ba8fa912c811c88912987c11e5c12ef328348a
* [SW-184714] Port HQT code into INC
HQT lib content was copied as is under fp8_quant
Tests were copied to 3.x torch location
Change-Id: Iec6e1fa7ac4bf1df1c95b429524c40e32bc13ac9
* [SW-184714] Add internal folder to fp8 quant
This is a folder used for experiments,
not to be used by users
Change-Id: I9e221ae582794e304e95392c0f37638f7bce69bc
* [SW-177468] Removed unused code + cleanup
Change-Id: I4d27c067e87c1a30eb1da9df16a16c46d092c638
* Fix errors in regression_detection
Change-Id: Iee5318bd5593ba349812516eb5641958ece3c438
* [SW-187731] Save orig module as member of patched module
This allows direct usage of the original module methods,
which solves torch compile issue
Change-Id: I464d8bd1bacdfc3cd1f128a67114e1e43f092632
* [SW-190899] Install packages according to configuration
Change-Id: I570b490658f5d2c5399ba1db93f8f52f56449525
* [SW-184689] use finalize_calibration internally for one step flow
Change-Id: Ie0b8b426c951cf57ed7e6e678c86813fb2d05c89
* [SW-191945] align requirement_pt.txt in gerrit INC with Github INC
Change-Id: If5c0dbf21bf989af37a8e29246e4f8760cd215ef
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* [SW-192358] Remove HQT reference in INC
Change-Id: Ic25f9323486596fa2dc6d909cd568a37ab84dd5e
* [SW-191415] update fp8 maxAbs observer using torch.copy_
Change-Id: I3923c832f9a8a2b14e392f3f4719d233a457702f
* [SW-184943] Enhance INC WOQ model loading
- Support loading huggingface WOQ model
- Abstract WeightOnlyLinear base class. Add INCWeightOnlyLinear and HPUWeightOnlyLinear subclasses
- Load woq linear weight module by module
- Save hpu format tensor to reuse it once load it again
Change-Id: I679a42759b49e1f45f52bbb0bdae8580a23d0bcf
* [SW-190303] Implement HPUWeightOnlyLinear class in INC
Change-Id: Ie05c8787e708e2c3559dce24ef0758d6c498ac41
* [SW-192809] fix json_file bug when instantiating FP8Config class
Change-Id: I4a715d0a706efe20ccdb49033755cabbc729ccdc
Signed-off-by: Zhou Yuwen <zyuwen@habana.ai>
* [SW-192931] align setup.py with github INC and remove fp8_convert
Change-Id: Ibbc157646cfcfad64b323ecfd96b9bbda5ba9e2f
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* [SW-192917] Update all HQT logic files with pre-commit check
Change-Id: I119dc8578cb10932fd1a8a674a8bdbf61f978e42
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* update docstring
Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
* add fp8 example and document (#1639)
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* Update settings to be compatible with gerrit
* enhance ut
Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
* move fp8 sample to helloworld folder
Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
* update torch version of habana docker
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update readme demo
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* update WeightOnlyLinear to INCWeightOnlyLinear
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add docstring for FP8Config
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* fix pylint
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* update fp8 test scripts
Signed-off-by: chensuyue <suyue.chen@intel.com>
* delete deps
Signed-off-by: chensuyue <suyue.chen@intel.com>
* update container into v1.17.0
Signed-off-by: chensuyue <suyue.chen@intel.com>
* update docker version
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* update pt ut
Signed-off-by: chensuyue <suyue.chen@intel.com>
* add lib path
Signed-off-by: chensuyue <suyue.chen@intel.com>
* fix dir issue
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* update fp8 test scope
Signed-off-by: chensuyue <suyue.chen@intel.com>
* fix typo
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* update fp8 test scope
Signed-off-by: chensuyue <suyue.chen@intel.com>
* update pre-commit-ci
Signed-off-by: chensuyue <suyue.chen@intel.com>
* work around for hpu
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* fix UT
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* fix parameter
Signed-off-by: chensuyue <suyue.chen@intel.com>
* omit some test
Signed-off-by: chensuyue <suyue.chen@intel.com>
* update main page example to llm loading
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* fix autotune
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
---------
Signed-off-by: Zhou Yuwen <zyuwen@habana.ai>
Signed-off-by: xinhe3 <xinhe3@hababa.ai>
Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: yan tomsinsky <ytomsinsky@habana.ai>
Co-authored-by: Ron Ben Moshe <rbenmoshe@habana.ai>
Co-authored-by: Uri Livne <ulivne@habana.ai>
Co-authored-by: Danny Semiat <dsemiat@habana.ai>
Co-authored-by: smarkovichgolan <smarkovich@habana.ai>
Co-authored-by: Dudi Lester <dlester@habana.ai>
After successfully installing these packages, try your first quantization program.
The following example code demonstrates FP8 Quantization; it is supported by the Intel Gaudi2 AI Accelerator.

To try it on Intel Gaudi2, a docker image with the Gaudi Software Stack is recommended; please refer to the following script for environment setup. More details can be found in the [Gaudi Guide](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#launch-docker-image-that-was-built).

To try INT4 model inference, please directly use [Intel Extension for Transformers](https://github.com/intel/intel-extension-for-transformers), which leverages Intel Neural Compressor for model quantization.
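A minimal sketch of the FP8 calibration-and-quantization flow described above is shown below. It assumes a Gaudi2 (HPU) environment with the Habana PyTorch bridge installed, the `FP8Config`/`prepare`/`convert` API from `neural_compressor.torch.quantization`, and a user-defined calibration function; exact argument names may differ between releases.

```python
import torch
import torchvision.models as models

from neural_compressor.torch.quantization import FP8Config, convert, prepare


def calib_func(model):
    # User-defined calibration: run a few representative batches so the
    # observers can collect maxabs statistics.
    with torch.no_grad():
        for _ in range(10):
            model(torch.randn(1, 3, 224, 224).to("hpu"))


model = models.resnet18().to("hpu")

config = FP8Config(fp8_config="E4M3")  # assumed kwarg selecting the FP8 format
model = prepare(model, config)         # insert measurement hooks
calib_func(model)                      # calibrate on representative data
model = convert(model)                 # convert the measured model to FP8
```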
### Weight-Only Large Language Model Loading (LLMs)

The following example code demonstrates weight-only large language model loading on the Intel Gaudi2 AI Accelerator.
```python
import torch

from neural_compressor.torch.quantization import load

model_name = "TheBloke/Llama-2-7B-GPTQ"
model = load(
    model_name_or_path=model_name,
    format="huggingface",
    device="hpu",
    torch_dtype=torch.bfloat16,
)
```
Intel Neural Compressor will convert the model format from auto-gptq to hpu format on the first load and save hpu_model.safetensors to the local cache directory for the next load, so the first load may take a while.
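Once loaded, the returned object can be used like a regular `transformers` model on HPU. The snippet below is a usage sketch under that assumption; the prompt and generation settings are illustrative only.

```python
import torch
from transformers import AutoTokenizer

from neural_compressor.torch.quantization import load

model_name = "TheBloke/Llama-2-7B-GPTQ"
model = load(
    model_name_or_path=model_name,
    format="huggingface",
    device="hpu",
    torch_dtype=torch.bfloat16,
)

# Pair the loaded weight-only model with the checkpoint's tokenizer and generate.
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer("Once upon a time,", return_tensors="pt").to("hpu")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```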
3. [Get Started with FP8 Quantization](#get-started-with-fp8-quantization)
4. [Examples](#examples)

## Introduction
Floating point 8 (FP8) is a promising data type for low-precision quantization; it provides a data distribution that is completely different from INT8, as shown below.
<div align="center">
  <img src="./imgs/fp8_dtype.png" height="250"/>
</div>
Intel Gaudi2, also known as HPU, provides this data type capability for low-precision quantization, which includes `E4M3` and `E5M2`. For more information about these two data types, please refer to [this paper](https://arxiv.org/abs/2209.05433).
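For intuition, `E4M3` trades exponent range for an extra mantissa bit, while `E5M2` covers a wider range with less precision. The short sketch below only inspects the two formats; it assumes PyTorch 2.1 or newer (where `torch.float8_e4m3fn` and `torch.float8_e5m2` are available) and is independent of the Gaudi-specific flow.

```python
import torch

# Compare the numeric properties of the two FP8 formats.
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    print(f"{dtype}: max={info.max}, smallest normal={info.tiny}, eps={info.eps}")

# Round-tripping through FP8 shows the precision loss relative to BF16.
x = torch.randn(4, dtype=torch.bfloat16)
x_fp8 = x.to(torch.float8_e4m3fn)
print(x, x_fp8.to(torch.bfloat16))
```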
Intel Neural Compressor provides general quantization APIs to leverage HPU FP8 capability; with simple APIs, users can get an 8-bit model with lower memory usage and lower compute cost.
<td class="tg-0pky">The observer to measure the statistics.</td>
52
+
<td class="tg-0pky">maxabs (default), saves all tensors to files.</td>
53
+
</tr>
54
+
<tr>
55
+
<td class="tg-0pky">allowlist</td>
56
+
<td class="tg-0pky">List of nn.Module names or types to quantize. When setting an empty list, all the supported modules will be quantized by default. See Supported Modules. Not setting the list at all is not recommended as it will set the allowlist to these modules only: torch.nn.Linear, torch.nn.Conv2d, and BMM.</td>
<td class="tg-0pky">The mode, measure or quantize, to run HQT with.</td>
67
+
<td class="tg-0pky">MEASURE - Measure statistics of all modules and emit the results to dump_stats_path.<br>QUANTIZE - Quantize and run the model according to the provided measurements.<br>AUTO (default) - Select from [MEASURE, QUANTIZE] automatically.</td>
68
+
</tr>
69
+
<tr>
70
+
<td class="tg-0pky">dump_stats_path</td>
71
+
<td class="tg-0pky">The path to save and load the measurements. The path is created up until the level before last "/". The string after the last / will be used as prefix to all the measurement files that will be created.</td>
<td class="tg-0pky">The method for calculating the scale from the measurement.</td>
77
+
<td class="tg-0pky">- without_scale - Convert to/from FP8 without scaling.<br>- unit_scale - Always use scale of 1.<br>- maxabs_hw (default) - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then aligned to the corresponding HW accelerated scale.<br>- maxabs_pow2 - Scale is calculated to stretch/compress the maxabs measurement to the full-scale of FP8 and then rounded to the power of 2.<br>- maxabs_hw_opt_weight - Scale of model params (weights) is chosen as the scale that provides minimal mean-square-error between quantized and non-quantized weights, from all possible HW accelerated scales. Scale of activations is calculated the same as maxabs_hw.<br>- act_maxabs_pow2_weights_pcs_opt_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_hw_opt_weight. Scale of activations is calculated the same as maxabs_pow2.<br>- act_maxabs_hw_weights_pcs_maxabs_pow2 - Scale of model params (weights) is calculated per-channel of the params tensor. The scale per-channel is calculated the same as maxabs_pow2. Scale of activations is calculated the same as maxabs_hw.</td>
78
+
</tr>
79
+
<tr>
80
+
<td class="tg-0pky">measure_exclude</td>
81
+
<td class="tg-0pky">If this attribute is not defined, the default is OUTPUT. Since most models do not require measuring output tensors, you can exclude it to speed up the measurement process.</td>
82
+
<td class="tg-0pky">NONE - All tensors are measured.<br>OUTPUT (default) - Excludes measurement of output tensors.</td>
83
+
</tr>
84
+
</tbody></table>
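As an illustration of how these attributes fit together, the sketch below builds one config for the measurement pass and one for the quantization pass. It assumes the attributes in the table map one-to-one onto `FP8Config` keyword arguments; the path and prefix are illustrative, so check the installed release for exact names and defaults.

```python
from neural_compressor.torch.quantization import FP8Config

# Measurement pass: collect maxabs statistics and dump them with the prefix
# "measure" under ./fp8_output/ (illustrative path, not a documented default).
measure_config = FP8Config(
    mode="MEASURE",
    observer="maxabs",
    dump_stats_path="./fp8_output/measure",
    measure_exclude="OUTPUT",  # skip output tensors to speed up measurement
)

# Quantization pass: reuse the saved measurements and pick a scaling method.
quant_config = FP8Config(
    mode="QUANTIZE",
    scale_method="maxabs_hw",
    dump_stats_path="./fp8_output/measure",
)
```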
## Get Started with FP8 Quantization

### Demo Usage

```python
from neural_compressor.torch.quantization import (
    FP8Config,
    convert,
    prepare,
)
```
| Large Language Model (LLM) | [Link](https://github.com/HabanaAI/optimum-habana-fork/tree/habana-main/examples/text-generation#running-with-fp8) |

> Note: For LLMs, Optimum-habana provides higher performance based on modified modeling files, so the LLM link above goes to Optimum-habana, which utilizes Intel Neural Compressor for FP8 quantization internally.