From 122bc4c1c6f87ef29394e7bc87b2debe55165b98 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Mon, 13 Oct 2025 13:24:43 +0000 Subject: [PATCH 01/25] Add flux example Signed-off-by: Mengni Wang --- .../torch/algorithms/weight_only/autoround.py | 19 ++++++++++++++++++- .../torch/quantization/algorithm_entry.py | 8 ++++++++ .../torch/quantization/quantize.py | 2 ++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py index 9127f7ba6e5..5fa3b253cfa 100644 --- a/neural_compressor/torch/algorithms/weight_only/autoround.py +++ b/neural_compressor/torch/algorithms/weight_only/autoround.py @@ -100,6 +100,10 @@ def __init__( truncation: bool = False, # 0.7 scheme: Union[str, dict, QuantizationScheme] = "W4A16", + # diffusion + guidance_scale: float = 7.5, + num_inference_steps: int = 50, + generator_seed: int = None, **kwargs, ): """Init a AutQRoundQuantizer object. @@ -172,6 +176,10 @@ def __init__( template (Template): The template to specify process for different mllms. truncation (bool): Activates truncation to cut input sequences longer than `max_length` to `max_length`. scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations. + guidance_scale (float): Control how much the image generation process follows the text prompt. + The more it is, the more closely it follows the prompt (default is 7.5). + num_inference_steps (int): The reference number of denoising steps (default is 50). + generator_seed (int): A seed that controls the initial noise for image generation (default is None). Returns: The quantized model. 
@@ -227,6 +235,9 @@ def __init__( self.device_map = device_map self.quant_lm_head = quant_lm_head self.enable_w4afp8 = self._is_w4afp8() + self.guidance_scale = guidance_scale + self.num_inference_steps = num_inference_steps + self.generator_seed = generator_seed def _is_w4afp8(self) -> bool: return any([v.get("data_type", None) == "fp8_to_int_sym" for v in self.quant_config.values()]) @@ -252,13 +263,16 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): Returns: The quantized model. """ + pipe = kwargs.pop("pipeline", None) tokenizer = getattr(model.orig_model, "tokenizer", None) if tokenizer is not None: delattr(model.orig_model, "tokenizer") - else: + elif pipe is None: tokenizer = "Placeholder" self.dataset = CapturedDataloader(model.args_list, model.kwargs_list) model = model.orig_model + if pipe is not None: + model = pipe rounder = AutoRound( model, layer_config=self.layer_config, @@ -307,6 +321,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): truncation=self.truncation, enable_torch_compile=self.enable_torch_compile, quant_lm_head=self.quant_lm_head, + guidance_scale=self.guidance_scale, + num_inference_steps=self.num_inference_steps, + generator_seed=self.generator_seed, ) if self.enable_w4afp8: diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 4936fbbe213..1f9b50d7339 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -608,6 +608,7 @@ def autoround_quantize_entry( "act_data_type": act_data_type, } layer_config = quant_config.to_dict().get("layer_config", None) + dataset = quant_config.to_dict().get("dataset", "NeelNanda/pile-10k") output_dir = quant_config.to_dict().get("output_dir", "temp_auto_round") enable_full_range = quant_config.enable_full_range batch_size = quant_config.batch_size @@ -642,6 +643,9 @@ def autoround_quantize_entry( scheme = 
quant_config.scheme device_map = quant_config.device_map quant_lm_head = quant_config.quant_lm_head + guidance_scale = quant_config.to_dict().get("guidance_scale", 7.5) + num_inference_steps = quant_config.to_dict().get("num_inference_steps", 50) + generator_seed = quant_config.to_dict().get("generator_seed", None) kwargs.pop("example_inputs") quantizer = get_quantizer( @@ -665,6 +669,7 @@ def autoround_quantize_entry( batch_size=batch_size, amp=amp, lr_scheduler=lr_scheduler, + dataset=dataset, enable_quanted_input=enable_quanted_input, enable_minmax_tuning=enable_minmax_tuning, lr=lr, @@ -694,6 +699,9 @@ def autoround_quantize_entry( scheme=scheme, device_map=device_map, quant_lm_head=quant_lm_head, + guidance_scale=guidance_scale, + num_inference_steps=num_inference_steps, + generator_seed=generator_seed, ) model = quantizer.execute(model=model, mode=mode, *args, **kwargs) model.qconfig = configs_mapping diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 60e138876d5..d9bad24283b 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -228,6 +228,7 @@ def convert( model: torch.nn.Module, quant_config: BaseConfig = None, inplace: bool = True, + **kwargs, ): """Convert the prepared model to a quantized model. 
@@ -284,6 +285,7 @@ def convert( configs_mapping, example_inputs=example_inputs, mode=Mode.CONVERT, + **kwargs, ) setattr(q_model, "is_quantized", True) return q_model From 728f3154c230e6ae3fc817c9bb042b91cc8bd84f Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Wed, 15 Oct 2025 08:29:54 +0000 Subject: [PATCH 02/25] add scripts Signed-off-by: Mengni Wang --- .../diffusion_model/diffusers/flux/README.md | 34 ++++ .../diffusion_model/diffusers/flux/main.py | 181 ++++++++++++++++++ .../diffusers/flux/requirements.txt | 5 + .../diffusers/flux/run_quant.sh | 55 ++++++ 4 files changed, 275 insertions(+) create mode 100644 examples/pytorch/diffusion_model/diffusers/flux/README.md create mode 100644 examples/pytorch/diffusion_model/diffusers/flux/main.py create mode 100644 examples/pytorch/diffusion_model/diffusers/flux/requirements.txt create mode 100644 examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh diff --git a/examples/pytorch/diffusion_model/diffusers/flux/README.md b/examples/pytorch/diffusion_model/diffusers/flux/README.md new file mode 100644 index 00000000000..1049a18e073 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/flux/README.md @@ -0,0 +1,34 @@ +# Step-by-Step + +This example quantizes and validates the accuracy of Flux. + +# Prerequisite + +## 1. Environment + +```shell +pip install -r requirements.txt +# Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.6 release +pip install neural-compressor-pt==3.6 +# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc` for the latest updates before auto-round v0.8.0 release +pip install auto-round==0.8.0 +``` + +## 2. Prepare Model + +```shell +hf download black-forest-labs/FLUX.1-dev --local-dir FLUX.1-dev +``` + +## 3. 
Prepare Dataset
```shell
wget https://github.com/mlcommons/inference/raw/refs/heads/master/text_to_image/coco2014/captions/captions_source.tsv
```

# Run

```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=flux_fp8 --input_model=FLUX.1-dev
```
- topology: supported values are flux_fp8 and flux_mxfp8
- CUDA_VISIBLE_DEVICES: the evaluation file is split into one subset per visible GPU to speed up the evaluation diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py new file mode 100644 index 00000000000..b70feeb270d --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/flux/main.py @@ -0,0 +1,181 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import argparse + +import pandas as pd +import tabulate +import torch + +from diffusers import AutoPipelineForText2Image +from neural_compressor.torch.quantization import ( + AutoRoundConfig, + convert, + prepare, +) +import multiprocessing as mp + +from auto_round.compressors.diffusion.eval import metric_map +from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader + + +def inference_worker(device, eval_file, pipe, image_save_dir): + if device != "cpu": + os.environ["CUDA_VISIBLE_DEVICES"] = str(device) + torch.cuda.set_device(device) + + gen_kwargs = { + "guidance_scale": 7.5, + "num_inference_steps": 50, + "generator": None, + } + + dataloader, _, _ = get_diffusion_dataloader(eval_file, nsamples=-1, bs=1) + prompt_list = [] + image_list = [] + for image_ids, prompts in dataloader: + prompt_list.extend(prompts) + + new_ids = [] + new_prompts = [] + for idx, image_id in enumerate(image_ids): + image_id = image_id.item() + image_list.append(os.path.join(image_save_dir, str(image_id) + ".png")) + + if os.path.exists(os.path.join(image_save_dir, str(image_id) + ".png")): + continue + new_ids.append(image_id) + new_prompts.append(prompts[idx]) + + if len(new_prompts) == 0: + continue + + output = pipe(prompt=new_prompts, **gen_kwargs) + for idx, image_id in enumerate(new_ids): + output.images[idx].save(os.path.join(image_save_dir, str(image_id) + ".png")) + + return prompt_list, image_list + +class BasicArgumentParser(argparse.ArgumentParser): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.add_argument("--model", "--model_name", "--model_name_or_path", + help="model name or path") + + self.add_argument('--scheme', default="MXFP4", type=str, + help="quantizaion scheme.") + + self.add_argument("--quantize", action="store_true") + + self.add_argument("--inference", action="store_true") + + self.add_argument("--dataset", type=str, default="coco2014", + help="the dataset for quantization 
training.") + + self.add_argument("--output_dir", default="./tmp_autoround", type=str, + help="the directory to save quantized model") + + self.add_argument("--eval_dataset", default="captions_source.tsv", type=str, + help="eval datasets") + + self.add_argument("--output_image_path", default="./tmp_imgs", type=str, + help="the directory to save quantized model") + + +def setup_parser(): + parser = BasicArgumentParser() + + parser.add_argument("--iters", "--iter", default=1000, type=int, + help="tuning iters") + + args = parser.parse_args() + return args + + +def tune(args): + model_name = args.model + if model_name[-1] == "/": + model_name = model_name[:-1] + print(f"start to quantize {model_name}") + + use_auto_mapping = True + layer_config = {} + pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16) + model = pipe.transformer + kwargs = {} + if args.scheme == "FP8": + for n, m in model.named_modules(): + if m.__class__.__name__ == "Linear": + layer_config[n] = {"bits": 8, "act_bits": 8, "data_type": "fp", "act_data_type": "fp", "group_size": 0, "act_group_size": 0} + elif args.scheme == "MXFP8": + kwargs["scheme"] = "MXFP8" + + qconfig = AutoRoundConfig( + iters=args.iters, + dataset=args.dataset, + layer_config=layer_config, + num_inference_steps=3, + export_format="fake", + nsamples=128, + batch_size=1, + **kwargs + ) + model = prepare(model, qconfig) + model = convert(model, qconfig, pipeline=pipe) + return model, pipe + +if __name__ == '__main__': + mp.set_start_method('spawn', force=True) + args = setup_parser() + model, pipe = tune(args) + if "--inference" in sys.argv: + if not os.path.exists(args.output_image_path): + os.makedirs(args.output_image_path) + + visible_gpus = torch.cuda.device_count() + + if visible_gpus == 0: + prompt_list, image_list = inference_worker("cpu", args.eval_dataset, pipe, args.output_image_path) + + else: + df = pd.read_csv(args.eval_dataset, sep='\t') + subsut_sample_num = len(df) // 
visible_gpus + for i in range(visible_gpus): + start = i * subsut_sample_num + end = min((i + 1) * subsut_sample_num, len(df)) + df_subset = df.iloc[start : end] + df_subset.to_csv(f"subset_{i}.tsv", sep='\t', index=False) + + pipe.model = model + + with mp.Pool(processes=visible_gpus) as pool: + results = [pool.apply_async(inference_worker, (i, f"subset_{i}.tsv", pipe.to(f"cuda:{i}"), args.output_image_path)) for i in range(visible_gpus)] + outputs = [r.get() for r in results] + + prompt_list = [] + image_list = [] + for output in outputs: + prompt_list.extend(output[0]) + image_list.extend(output[1]) + + print("Evaluations for subset are done! Getting the final accuracy...") + + result = {} + metrics = ["clip", "clip-iqa", "imagereward"] + for metric in metrics: + result.update(metric_map[metric](prompt_list, image_list, pipe.device)) + + print(tabulate.tabulate(result.items(), tablefmt="grid")) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt b/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt new file mode 100644 index 00000000000..cd30494f746 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt @@ -0,0 +1,5 @@ +diffusers +clip +image-reward +torchmetrics +transformers==4.55.0 diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh new file mode 100644 index 00000000000..4c942a332e1 --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh @@ -0,0 +1,55 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo 
"Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + tuned_checkpoint=${tuned_checkpoint:="saved_results"} + + if [ "${topology}" = "flux_fp8" ]; then + extra_cmd="--scheme FP8 --iters 0 --dataset captions_source.tsv" + elif [ "${topology}" = "flux_mxfp8" ]; then + extra_cmd="--scheme MXFP8 --iters 10 --dataset captions_source.tsv" + fi + + python3 main.py \ + --model ${input_model} \ + --output_dir ${tuned_checkpoint} \ + --quantize \ + --inference \ + ${extra_cmd} +} + +main "$@" From b7a922eeb81f231f2f62ab13f6bd972edba987d5 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 15 Oct 2025 16:35:21 +0800 Subject: [PATCH 03/25] Update run_quant.sh --- .../pytorch/diffusion_model/diffusers/flux/run_quant.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh index 4c942a332e1..912839d2865 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh +++ b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh @@ -46,10 +46,10 @@ function run_tuning { python3 main.py \ --model ${input_model} \ - --output_dir ${tuned_checkpoint} \ - --quantize \ - --inference \ - ${extra_cmd} + --output_dir ${tuned_checkpoint} \ + --quantize \ + --inference \ + ${extra_cmd} } main "$@" From 01571a102d97528b979b6f3c85ae3d3024da0f22 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 15 Oct 2025 16:35:54 +0800 Subject: [PATCH 04/25] Update run_quant.sh --- examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh index 912839d2865..6c56735162f 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh +++ 
b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh @@ -41,7 +41,7 @@ function run_tuning { if [ "${topology}" = "flux_fp8" ]; then extra_cmd="--scheme FP8 --iters 0 --dataset captions_source.tsv" elif [ "${topology}" = "flux_mxfp8" ]; then - extra_cmd="--scheme MXFP8 --iters 10 --dataset captions_source.tsv" + extra_cmd="--scheme MXFP8 --iters 1000 --dataset captions_source.tsv" fi python3 main.py \ From cdb368836603cbab31c2779781c06d77057a164c Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 15 Oct 2025 16:48:17 +0800 Subject: [PATCH 05/25] Update main.py --- .../diffusion_model/diffusers/flux/main.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py index b70feeb270d..f03a5b90d4c 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/main.py +++ b/examples/pytorch/diffusion_model/diffusers/flux/main.py @@ -105,16 +105,8 @@ def setup_parser(): return args -def tune(args): - model_name = args.model - if model_name[-1] == "/": - model_name = model_name[:-1] - print(f"start to quantize {model_name}") - - use_auto_mapping = True +def tune(args, model, pipe): layer_config = {} - pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16) - model = pipe.transformer kwargs = {} if args.scheme == "FP8": for n, m in model.named_modules(): @@ -140,7 +132,15 @@ def tune(args): if __name__ == '__main__': mp.set_start_method('spawn', force=True) args = setup_parser() - model, pipe = tune(args) + model_name = args.model + if model_name[-1] == "/": + model_name = model_name[:-1] + pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16) + model = pipe.transformer + + if "--quantize" in sys.argv: + print(f"start to quantize {model_name}") + model, pipe = tune(args, model, pipe) if "--inference" in sys.argv: if not 
os.path.exists(args.output_image_path): os.makedirs(args.output_image_path) From 502f57178c5c2eb03268142f3369ed576e5c24c7 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 15 Oct 2025 16:49:54 +0800 Subject: [PATCH 06/25] Update run_quant.sh --- .../diffusion_model/diffusers/flux/run_quant.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh index 6c56735162f..4e6b372e8d5 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh +++ b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh @@ -39,16 +39,16 @@ function run_tuning { tuned_checkpoint=${tuned_checkpoint:="saved_results"} if [ "${topology}" = "flux_fp8" ]; then - extra_cmd="--scheme FP8 --iters 0 --dataset captions_source.tsv" + extra_cmd="--scheme FP8 --iters 0 --dataset captions_source.tsv --inference --quantize" elif [ "${topology}" = "flux_mxfp8" ]; then - extra_cmd="--scheme MXFP8 --iters 1000 --dataset captions_source.tsv" - fi + extra_cmd="--scheme MXFP8 --iters 1000 --dataset captions_source.tsv --inference --quantize" + else + extra_cmd="--inference" + fi python3 main.py \ --model ${input_model} \ --output_dir ${tuned_checkpoint} \ - --quantize \ - --inference \ ${extra_cmd} } From 40c21f2f22e3a41bbd8d4fb6cd333b190719dd7a Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 15 Oct 2025 16:52:02 +0800 Subject: [PATCH 07/25] Update requirements.txt --- .../diffusion_model/diffusers/flux/requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt b/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt index cd30494f746..52ba33bb163 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt +++ b/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt @@ -1,5 +1,5 @@ -diffusers -clip 
-image-reward -torchmetrics +diffusers==0.35.1 +clip==0.2.0 +image-reward==1.5 +torchmetrics==1.8.2 transformers==4.55.0 From 0c169e8af7218cb90202a500ff45d3680e99b548 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 15 Oct 2025 16:53:31 +0800 Subject: [PATCH 08/25] Update requirements.txt --- examples/pytorch/diffusion_model/diffusers/flux/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt b/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt index 52ba33bb163..1d6637869b3 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt +++ b/examples/pytorch/diffusion_model/diffusers/flux/requirements.txt @@ -1,4 +1,5 @@ diffusers==0.35.1 +pandas==2.2.2 clip==0.2.0 image-reward==1.5 torchmetrics==1.8.2 From 7412bdda9418962053363c21eea5d15f645f505c Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 15 Oct 2025 16:55:46 +0800 Subject: [PATCH 09/25] Update README.md --- examples/pytorch/diffusion_model/diffusers/flux/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/README.md b/examples/pytorch/diffusion_model/diffusers/flux/README.md index 1049a18e073..4095b04ac50 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/README.md +++ b/examples/pytorch/diffusion_model/diffusers/flux/README.md @@ -10,7 +10,7 @@ This example quantizes and validates the accuracy of Flux. 
pip install -r requirements.txt # Use `INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@v3.6rc` for the latest updates before neural-compressor v3.6 release pip install neural-compressor-pt==3.6 -# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc` for the latest updates before auto-round v0.8.0 release +# Use `pip install git+https://github.com/intel/auto-round.git@v0.8.0rc2` for the latest updates before auto-round v0.8.0 release pip install auto-round==0.8.0 ``` From d4097c9fa462d5fe4b79f007bb318c26340865f9 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 15 Oct 2025 16:58:06 +0800 Subject: [PATCH 10/25] Update main.py --- examples/pytorch/diffusion_model/diffusers/flux/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py index f03a5b90d4c..9720cdecf81 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/main.py +++ b/examples/pytorch/diffusion_model/diffusers/flux/main.py @@ -75,7 +75,7 @@ def __init__(self, *args, **kwargs): self.add_argument("--model", "--model_name", "--model_name_or_path", help="model name or path") - self.add_argument('--scheme', default="MXFP4", type=str, + self.add_argument('--scheme', default="MXFP8", type=str, help="quantizaion scheme.") self.add_argument("--quantize", action="store_true") From 82f78d428a0de17b0885676fba329fb18574a67a Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 15 Oct 2025 17:00:07 +0800 Subject: [PATCH 11/25] Update main.py --- .../pytorch/diffusion_model/diffusers/flux/main.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py index 9720cdecf81..371b92a81a1 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/main.py +++ 
b/examples/pytorch/diffusion_model/diffusers/flux/main.py @@ -105,7 +105,8 @@ def setup_parser(): return args -def tune(args, model, pipe): +def tune(args, pipe): + model = pipe.transformer layer_config = {} kwargs = {} if args.scheme == "FP8": @@ -127,7 +128,7 @@ def tune(args, model, pipe): ) model = prepare(model, qconfig) model = convert(model, qconfig, pipeline=pipe) - return model, pipe + return pipe if __name__ == '__main__': mp.set_start_method('spawn', force=True) @@ -136,11 +137,11 @@ def tune(args, model, pipe): if model_name[-1] == "/": model_name = model_name[:-1] pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16) - model = pipe.transformer + if "--quantize" in sys.argv: print(f"start to quantize {model_name}") - model, pipe = tune(args, model, pipe) + pipe = tune(args, pipe) if "--inference" in sys.argv: if not os.path.exists(args.output_image_path): os.makedirs(args.output_image_path) @@ -159,8 +160,6 @@ def tune(args, model, pipe): df_subset = df.iloc[start : end] df_subset.to_csv(f"subset_{i}.tsv", sep='\t', index=False) - pipe.model = model - with mp.Pool(processes=visible_gpus) as pool: results = [pool.apply_async(inference_worker, (i, f"subset_{i}.tsv", pipe.to(f"cuda:{i}"), args.output_image_path)) for i in range(visible_gpus)] outputs = [r.get() for r in results] From 9a0bbd4de97a481b8829248c41bc5ab77fc4f440 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Wed, 15 Oct 2025 17:01:48 +0800 Subject: [PATCH 12/25] Update main.py --- examples/pytorch/diffusion_model/diffusers/flux/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py index 371b92a81a1..38fc518c125 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/main.py +++ b/examples/pytorch/diffusion_model/diffusers/flux/main.py @@ -137,7 +137,6 @@ def tune(args, pipe): if model_name[-1] == "/": model_name = model_name[:-1] pipe 
= AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16) - if "--quantize" in sys.argv: print(f"start to quantize {model_name}") From 5bd640f60e87d9a87da50b5a8216b3adc176034f Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 16 Oct 2025 10:57:54 +0800 Subject: [PATCH 13/25] Update main.py --- examples/pytorch/diffusion_model/diffusers/flux/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py index 38fc518c125..99ff37f2928 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/main.py +++ b/examples/pytorch/diffusion_model/diffusers/flux/main.py @@ -128,6 +128,7 @@ def tune(args, pipe): ) model = prepare(model, qconfig) model = convert(model, qconfig, pipeline=pipe) + delattr(model, "save") return pipe if __name__ == '__main__': From 6d61fac8b25ad410af39f651afb6de39c1547ff4 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 16 Oct 2025 11:01:52 +0800 Subject: [PATCH 14/25] Update README.md --- examples/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/README.md b/examples/README.md index b427d39d2c2..59dd2963dbb 100644 --- a/examples/README.md +++ b/examples/README.md @@ -75,6 +75,12 @@ Intel® Neural Compressor validated examples with multiple compression technique Static Quantization link + + FLUX.1-dev + Text to Image + Quantization (MXFP8+FP8) + link + From 71dedc4d760e8ab453acf8f4d75837ce8bb9e061 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Fri, 17 Oct 2025 03:40:22 +0000 Subject: [PATCH 15/25] fix eval Signed-off-by: Mengni Wang --- .../diffusion_model/diffusers/flux/main.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py index 99ff37f2928..d770cbedcac 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/main.py 
+++ b/examples/pytorch/diffusion_model/diffusers/flux/main.py @@ -30,9 +30,10 @@ from auto_round.compressors.diffusion.eval import metric_map from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader +from torch.multiprocessing import Process, Queue -def inference_worker(device, eval_file, pipe, image_save_dir): +def inference_worker(device, eval_file, pipe, image_save_dir, queue): if device != "cpu": os.environ["CUDA_VISIBLE_DEVICES"] = str(device) torch.cuda.set_device(device) @@ -67,7 +68,7 @@ def inference_worker(device, eval_file, pipe, image_save_dir): for idx, image_id in enumerate(new_ids): output.images[idx].save(os.path.join(image_save_dir, str(image_id) + ".png")) - return prompt_list, image_list + queue.put((prompt_list, image_list)) class BasicArgumentParser(argparse.ArgumentParser): def __init__(self, *args, **kwargs): @@ -160,9 +161,16 @@ def tune(args, pipe): df_subset = df.iloc[start : end] df_subset.to_csv(f"subset_{i}.tsv", sep='\t', index=False) - with mp.Pool(processes=visible_gpus) as pool: - results = [pool.apply_async(inference_worker, (i, f"subset_{i}.tsv", pipe.to(f"cuda:{i}"), args.output_image_path)) for i in range(visible_gpus)] - outputs = [r.get() for r in results] + processes = [] + queue = Queue() + for i in range(visible_gpus): + p = Process(target=inference_worker, args=(i, f"subset_{i}.tsv", pipe.to(f"cuda:{i}"), args.output_image_path, queue)) + p.start() + processes.append(p) + for p in processes: + p.join() + + outputs = [queue.get() for _ in range(visible_gpus)] prompt_list = [] image_list = [] From 1bcbea5a2eb7f3f15c89af79916ae8b073cc35aa Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Fri, 17 Oct 2025 13:22:47 +0800 Subject: [PATCH 16/25] Update README.md --- examples/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/README.md b/examples/README.md index 59dd2963dbb..90251320be4 100644 --- a/examples/README.md +++ b/examples/README.md @@ -15,6 +15,12 @@ 
Intel® Neural Compressor validated examples with multiple compression technique + + FLUX.1-dev + Text to Image + Quantization (MXFP8+FP8) + link + Llama-4-Scout-17B-16E-Instruct Multimodal Modeling @@ -75,12 +81,6 @@ Intel® Neural Compressor validated examples with multiple compression technique Static Quantization link - - FLUX.1-dev - Text to Image - Quantization (MXFP8+FP8) - link - From 1c0c5825aa1ecceec056cd52e36e9323ff6c71b2 Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Fri, 17 Oct 2025 09:40:59 +0000 Subject: [PATCH 17/25] fix script bug Signed-off-by: Mengni Wang --- examples/pytorch/diffusion_model/diffusers/flux/main.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py index d770cbedcac..d73d1f56e40 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/main.py +++ b/examples/pytorch/diffusion_model/diffusers/flux/main.py @@ -33,7 +33,7 @@ from torch.multiprocessing import Process, Queue -def inference_worker(device, eval_file, pipe, image_save_dir, queue): +def inference_worker(device, eval_file, pipe, image_save_dir, queue=None): if device != "cpu": os.environ["CUDA_VISIBLE_DEVICES"] = str(device) torch.cuda.set_device(device) @@ -68,7 +68,10 @@ def inference_worker(device, eval_file, pipe, image_save_dir, queue): for idx, image_id in enumerate(new_ids): output.images[idx].save(os.path.join(image_save_dir, str(image_id) + ".png")) - queue.put((prompt_list, image_list)) + if queue is None: + return prompt_list, image_list + else: + queue.put((prompt_list, image_list)) class BasicArgumentParser(argparse.ArgumentParser): def __init__(self, *args, **kwargs): From decac5ae8bce04969cabf3087e6781509517328d Mon Sep 17 00:00:00 2001 From: Mengni Wang Date: Thu, 23 Oct 2025 13:27:46 +0000 Subject: [PATCH 18/25] update script Signed-off-by: Mengni Wang --- .../diffusion_model/diffusers/flux/README.md | 12 +- 
.../diffusers/flux/dataset_split.py | 22 ++ .../diffusion_model/diffusers/flux/main.py | 191 +++++++++--------- .../diffusers/flux/run_benchmark.sh | 99 +++++++++ .../diffusers/flux/run_quant.sh | 12 +- 5 files changed, 228 insertions(+), 108 deletions(-) create mode 100644 examples/pytorch/diffusion_model/diffusers/flux/dataset_split.py create mode 100644 examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh diff --git a/examples/pytorch/diffusion_model/diffusers/flux/README.md b/examples/pytorch/diffusion_model/diffusers/flux/README.md index 4095b04ac50..bca01cbf32c 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/README.md +++ b/examples/pytorch/diffusion_model/diffusers/flux/README.md @@ -27,8 +27,18 @@ wget https://github.com/mlcommons/inference/raw/refs/heads/master/text_to_image/ # Run +## Quantization + ```bash -CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=flux_fp8 --input_model=FLUX.1-dev +bash run_quant.sh --topology=flux_mxfp8 --input_model=FLUX.1-dev --output_model=mxfp8_model ``` - topology: support flux_fp8 and flux_mxfp8 + + +## Evaluation + +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --topology=flux_mxfp8 --input_model=FLUX.1-dev --quantized_model=mxfp8_model +``` + - CUDA_VISIBLE_DEVICES: split the evaluation file into the number of GPUs' subset to speed up the evaluation diff --git a/examples/pytorch/diffusion_model/diffusers/flux/dataset_split.py b/examples/pytorch/diffusion_model/diffusers/flux/dataset_split.py new file mode 100644 index 00000000000..56015d1f3db --- /dev/null +++ b/examples/pytorch/diffusion_model/diffusers/flux/dataset_split.py @@ -0,0 +1,22 @@ +import argparse +import pandas as pd + +parser = argparse.ArgumentParser() +parser.add_argument('--split_num', type=int) +parser.add_argument('--limit', default=-1, type=int) +parser.add_argument('--input_file', type=str) +parser.add_argument('--output_file', default="subset", type=str) +args = parser.parse_args() + +# load the TSV 
file +df = pd.read_csv(args.input_file, sep='\t') + +if args.limit > 0: + df = df.iloc[0:args.limit] + +num = round(len(df) / args.split_num) +for i in range(args.split_num): + start = i * num + end = min((i + 1) * num, len(df)) + df_subset = df.iloc[start:end] + df_subset.to_csv(f"{args.output_file}_{i}.tsv", sep='\t', index=False) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py index d73d1f56e40..cd55c8063be 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/main.py +++ b/examples/pytorch/diffusion_model/diffusers/flux/main.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json import os import sys import argparse @@ -20,41 +21,52 @@ import tabulate import torch -from diffusers import AutoPipelineForText2Image +from diffusers import AutoPipelineForText2Image, FluxTransformer2DModel +from functools import partial from neural_compressor.torch.quantization import ( AutoRoundConfig, convert, prepare, ) -import multiprocessing as mp - +from auto_round.data_type.mxfp import quant_mx_rceil +from auto_round.data_type.fp8 import quant_fp8_sym +from auto_round.utils import get_block_names, get_module from auto_round.compressors.diffusion.eval import metric_map from auto_round.compressors.diffusion.dataset import get_diffusion_dataloader -from torch.multiprocessing import Process, Queue - -def inference_worker(device, eval_file, pipe, image_save_dir, queue=None): - if device != "cpu": - os.environ["CUDA_VISIBLE_DEVICES"] = str(device) - torch.cuda.set_device(device) +parser = argparse.ArgumentParser( + description="Flux quantization.", formatter_class=argparse.ArgumentDefaultsHelpFormatter +) +parser.add_argument("--model", "--model_name", "--model_name_or_path", help="model name or path") +parser.add_argument('--scheme', default="MXFP8", type=str, help="quantizaion scheme.") 
+parser.add_argument("--quantize", action="store_true") +parser.add_argument("--inference", action="store_true") +parser.add_argument("--accuracy", action="store_true") +parser.add_argument("--dataset", type=str, default="coco2014", help="the dataset for quantization training.") +parser.add_argument("--output_dir", "--quantized_model_path", default="./tmp_autoround", type=str, help="the directory to save quantized model") +parser.add_argument("--eval_dataset", default="captions_source.tsv", type=str, help="eval datasets") +parser.add_argument("--output_image_path", default="./tmp_imgs", type=str, help="the directory to save quantized model") +parser.add_argument("--iters", "--iter", default=1000, type=int, help="tuning iters") +parser.add_argument("--limit", default=-1, type=int, help="limit the number of prompts for evaluation") + +args = parser.parse_args() + + +def inference_worker(eval_file, pipe, image_save_dir): gen_kwargs = { "guidance_scale": 7.5, "num_inference_steps": 50, "generator": None, } - dataloader, _, _ = get_diffusion_dataloader(eval_file, nsamples=-1, bs=1) - prompt_list = [] - image_list = [] + dataloader, _, _ = get_diffusion_dataloader(eval_file, nsamples=args.limit, bs=1) for image_ids, prompts in dataloader: - prompt_list.extend(prompts) new_ids = [] new_prompts = [] for idx, image_id in enumerate(image_ids): image_id = image_id.item() - image_list.append(os.path.join(image_save_dir, str(image_id) + ".png")) if os.path.exists(os.path.join(image_save_dir, str(image_id) + ".png")): continue @@ -68,57 +80,22 @@ def inference_worker(device, eval_file, pipe, image_save_dir, queue=None): for idx, image_id in enumerate(new_ids): output.images[idx].save(os.path.join(image_save_dir, str(image_id) + ".png")) - if queue is None: - return prompt_list, image_list - else: - queue.put((prompt_list, image_list)) - -class BasicArgumentParser(argparse.ArgumentParser): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - 
self.add_argument("--model", "--model_name", "--model_name_or_path", - help="model name or path") - - self.add_argument('--scheme', default="MXFP8", type=str, - help="quantizaion scheme.") - - self.add_argument("--quantize", action="store_true") - - self.add_argument("--inference", action="store_true") - - self.add_argument("--dataset", type=str, default="coco2014", - help="the dataset for quantization training.") - - self.add_argument("--output_dir", default="./tmp_autoround", type=str, - help="the directory to save quantized model") - - self.add_argument("--eval_dataset", default="captions_source.tsv", type=str, - help="eval datasets") - - self.add_argument("--output_image_path", default="./tmp_imgs", type=str, - help="the directory to save quantized model") - -def setup_parser(): - parser = BasicArgumentParser() - - parser.add_argument("--iters", "--iter", default=1000, type=int, - help="tuning iters") - - args = parser.parse_args() - return args - - -def tune(args, pipe): +def tune(): + pipe = AutoPipelineForText2Image.from_pretrained(args.model, torch_dtype=torch.bfloat16) model = pipe.transformer layer_config = {} kwargs = {} if args.scheme == "FP8": for n, m in model.named_modules(): if m.__class__.__name__ == "Linear": - layer_config[n] = {"bits": 8, "act_bits": 8, "data_type": "fp", "act_data_type": "fp", "group_size": 0, "act_group_size": 0} + layer_config[n] = {"bits": 8, "data_type": "fp", "group_size": 0} elif args.scheme == "MXFP8": - kwargs["scheme"] = "MXFP8" + kwargs["scheme"] = { + "bits": 8, + "group_size": 32, + "data_type": "mx_fp", + } qconfig = AutoRoundConfig( iters=args.iters, @@ -128,64 +105,78 @@ def tune(args, pipe): export_format="fake", nsamples=128, batch_size=1, + output_dir=args.output_dir, **kwargs ) model = prepare(model, qconfig) model = convert(model, qconfig, pipeline=pipe) - delattr(model, "save") - return pipe if __name__ == '__main__': - mp.set_start_method('spawn', force=True) - args = setup_parser() - model_name = 
args.model - if model_name[-1] == "/": - model_name = model_name[:-1] - pipe = AutoPipelineForText2Image.from_pretrained(model_name, torch_dtype=torch.bfloat16) - - if "--quantize" in sys.argv: - print(f"start to quantize {model_name}") - pipe = tune(args, pipe) - if "--inference" in sys.argv: + device = "cpu" if torch.cuda.device_count() == 0 else "cuda" + + if args.quantize: + print(f"Start to quantize {args.model}.") + tune() + exit(0) + + if args.inference: + pipe = AutoPipelineForText2Image.from_pretrained(args.model, torch_dtype=torch.bfloat16) + if not os.path.exists(args.output_image_path): os.makedirs(args.output_image_path) - visible_gpus = torch.cuda.device_count() + if os.path.exists(args.output_dir) and os.path.exists(os.path.join(args.output_dir, "diffusion_pytorch_model.safetensors.index.json")): + print(f"Loading quantized model from {args.output_dir}") + model = FluxTransformer2DModel.from_pretrained(args.output_dir, torch_dtype=torch.bfloat16) + + # replace Linear's forward function + if args.scheme == "MXFP8": + def act_qdq_forward(module, x, *args, **kwargs): + qdq_x, _, _ = quant_mx_rceil(x, bits=8, group_size=32, data_type="mx_fp_rceil") + return module.orig_forward(qdq_x, *args, **kwargs) + + all_quant_blocks = get_block_names(model) + + for block_names in all_quant_blocks: + for block_name in block_names: + block = get_module(model, block_name) + for n, m in block.named_modules(): + if m.__class__.__name__ == "Linear": + m.orig_forward = m.forward + m.forward = partial(act_qdq_forward, m) + + if args.scheme == "FP8": + def act_qdq_forward(module, x, *args, **kwargs): + qdq_x, _, _ = quant_fp8_sym(x, group_size=0) + return module.orig_forward(qdq_x, *args, **kwargs) + + for n, m in model.named_modules(): + if m.__class__.__name__ == "Linear": + m.orig_forward = m.forward + m.forward = partial(act_qdq_forward, m) - if visible_gpus == 0: - prompt_list, image_list = inference_worker("cpu", args.eval_dataset, pipe, args.output_image_path) + 
pipe.transformer = model else: - df = pd.read_csv(args.eval_dataset, sep='\t') - subsut_sample_num = len(df) // visible_gpus - for i in range(visible_gpus): - start = i * subsut_sample_num - end = min((i + 1) * subsut_sample_num, len(df)) - df_subset = df.iloc[start : end] - df_subset.to_csv(f"subset_{i}.tsv", sep='\t', index=False) - - processes = [] - queue = Queue() - for i in range(visible_gpus): - p = Process(target=inference_worker, args=(i, f"subset_{i}.tsv", pipe.to(f"cuda:{i}"), args.output_image_path, queue)) - p.start() - processes.append(p) - for p in processes: - p.join() - - outputs = [queue.get() for _ in range(visible_gpus)] - - prompt_list = [] - image_list = [] - for output in outputs: - prompt_list.extend(output[0]) - image_list.extend(output[1]) - - print("Evaluations for subset are done! Getting the final accuracy...") + print("Don't supply quantized_model_path or quantized model doesn't exist, evaluate BF16 accuracy.") + + inference_worker(args.eval_dataset, pipe.to(device), args.output_image_path) + + if args.accuracy: + df = pd.read_csv(args.eval_dataset, sep="\t") + prompt_list = [] + image_list = [] + for index, row in df.iterrows(): + assert "id" in row and "caption" in row + caption_id = row["id"] + caption_text = row["caption"] + if os.path.exists(os.path.join(args.output_image_path, str(caption_id) + ".png")): + prompt_list.append(caption_text) + image_list.append(os.path.join(args.output_image_path, str(caption_id) + ".png")) result = {} metrics = ["clip", "clip-iqa", "imagereward"] for metric in metrics: - result.update(metric_map[metric](prompt_list, image_list, pipe.device)) + result.update(metric_map[metric](prompt_list, image_list, device)) print(tabulate.tabulate(result.items(), tablefmt="grid")) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh new file mode 100644 index 00000000000..8640c45f925 --- /dev/null +++ 
b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh @@ -0,0 +1,99 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --quantized_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + --limit=*) + limit=$(echo $var |cut -f2 -d=) + ;; + --output_image_path=*) + output_image_path=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_tuning +function run_tuning { + dataset_location=${dataset_location:="captions_source.tsv"} + limit=${limit:=-1} + output_image_path=${output_image_path:="./tmp_imgs"} + + if [ "${topology}" = "flux_fp8" ]; then + extra_cmd="--scheme FP8 --inference" + elif [ "${topology}" = "flux_mxfp8" ]; then + extra_cmd="--scheme MXFP8 --inference" + fi + + if ! command -v nvidia-smi &> /dev/null; then + echo "GPU is not available, use CPU for evaluation." 
+ + python3 main.py \ + --model ${input_model} \ + --quantized_model_path ${tuned_checkpoint} \ + --limit ${limit} \ + --output_image_path ${output_image_path} \ + --eval_dataset ${dataset_location} \ + ${extra_cmd} + python3 main.py --output_image_path ${output_image_path} --accuracy + + else + if [ -n "$CUDA_VISIBLE_DEVICES" ]; then + gpu_list="${CUDA_VISIBLE_DEVICES:-}" + IFS=',' read -ra gpu_ids <<< "$gpu_list" + visible_gpus=${#gpu_ids[@]} + else + visible_gpus=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | wc -l) + fi + + echo "visible_gpus: ${visible_gpus}" + python dataset_split.py --split_num ${visible_gpus} --input_file ${dataset_location} --limit ${limit} + + for ((i=0; i Date: Thu, 23 Oct 2025 21:32:55 +0800 Subject: [PATCH 19/25] Update run_benchmark.sh --- .../diffusion_model/diffusers/flux/run_benchmark.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh index 8640c45f925..1e916ee69b4 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh @@ -61,7 +61,7 @@ function run_tuning { --quantized_model_path ${tuned_checkpoint} \ --limit ${limit} \ --output_image_path ${output_image_path} \ - --eval_dataset ${dataset_location} \ + --eval_dataset ${dataset_location} \ ${extra_cmd} python3 main.py --output_image_path ${output_image_path} --accuracy @@ -75,7 +75,7 @@ function run_tuning { fi echo "visible_gpus: ${visible_gpus}" - python dataset_split.py --split_num ${visible_gpus} --input_file ${dataset_location} --limit ${limit} + python dataset_split.py --split_num ${visible_gpus} --input_file ${dataset_location} --limit ${limit} for ((i=0; i Date: Thu, 23 Oct 2025 21:35:30 +0800 Subject: [PATCH 20/25] Update run_benchmark.sh --- .../diffusers/flux/run_benchmark.sh | 24 +++++++++---------- 1 
file changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh index 1e916ee69b4..10e294d2724 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh @@ -22,15 +22,15 @@ function init_params { --input_model=*) input_model=$(echo $var |cut -f2 -d=) ;; - --quantized_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; - --limit=*) - limit=$(echo $var |cut -f2 -d=) - ;; - --output_image_path=*) - output_image_path=$(echo $var |cut -f2 -d=) - ;; + --quantized_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + --limit=*) + limit=$(echo $var |cut -f2 -d=) + ;; + --output_image_path=*) + output_image_path=$(echo $var |cut -f2 -d=) + ;; *) echo "Error: No such parameter: ${var}" exit 1 @@ -68,8 +68,8 @@ function run_tuning { else if [ -n "$CUDA_VISIBLE_DEVICES" ]; then gpu_list="${CUDA_VISIBLE_DEVICES:-}" - IFS=',' read -ra gpu_ids <<< "$gpu_list" - visible_gpus=${#gpu_ids[@]} + IFS=',' read -ra gpu_ids <<< "$gpu_list" + visible_gpus=${#gpu_ids[@]} else visible_gpus=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | wc -l) fi @@ -87,7 +87,7 @@ function run_tuning { --eval_dataset "subset_$i.tsv" \ ${extra_cmd} & program_pid+=($!) - echo "Start (PID: ${program_pid[-1]}, GPU: ${i})" + echo "Start (PID: ${program_pid[-1]}, GPU: ${i})" done wait "${program_pid[@]}" echo "Start calculating final score..." 
From b8652159d57a8e11bda04a79e465006ae971c7cd Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 23 Oct 2025 21:38:28 +0800 Subject: [PATCH 21/25] Update run_quant.sh --- .../diffusion_model/diffusers/flux/run_quant.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh index ea70460c598..d13c3bcf470 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh +++ b/examples/pytorch/diffusion_model/diffusers/flux/run_quant.sh @@ -22,9 +22,9 @@ function init_params { --input_model=*) input_model=$(echo $var |cut -f2 -d=) ;; - --output_model=*) - tuned_checkpoint=$(echo $var |cut -f2 -d=) - ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; *) echo "Error: No such parameter: ${var}" exit 1 @@ -46,8 +46,8 @@ function run_tuning { python3 main.py \ --model ${input_model} \ - --output_dir ${tuned_checkpoint} \ - ${extra_cmd} + --output_dir ${tuned_checkpoint} \ + ${extra_cmd} } main "$@" From 41a7fcae301b545c293f21b2262cdf0edbf32d8b Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 23 Oct 2025 21:46:43 +0800 Subject: [PATCH 22/25] Update main.py --- examples/pytorch/diffusion_model/diffusers/flux/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/main.py b/examples/pytorch/diffusion_model/diffusers/flux/main.py index cd55c8063be..c9e0b98d5c0 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/main.py +++ b/examples/pytorch/diffusion_model/diffusers/flux/main.py @@ -160,7 +160,7 @@ def act_qdq_forward(module, x, *args, **kwargs): else: print("Don't supply quantized_model_path or quantized model doesn't exist, evaluate BF16 accuracy.") - inference_worker(args.eval_dataset, pipe.to(device), args.output_image_path) + inference_worker(args.eval_dataset, pipe.to(device), args.output_image_path) if 
args.accuracy: df = pd.read_csv(args.eval_dataset, sep="\t") From ab609b60b8bb8c601b99c31aecb286227fe6b7e0 Mon Sep 17 00:00:00 2001 From: "Wang, Mengni" Date: Thu, 23 Oct 2025 21:54:26 +0800 Subject: [PATCH 23/25] Update run_benchmark.sh --- .../diffusers/flux/run_benchmark.sh | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh index 10e294d2724..c29d6c6baf1 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh @@ -70,26 +70,33 @@ function run_tuning { gpu_list="${CUDA_VISIBLE_DEVICES:-}" IFS=',' read -ra gpu_ids <<< "$gpu_list" visible_gpus=${#gpu_ids[@]} - else - visible_gpus=$(nvidia-smi --query-gpu=count --format=csv,noheader,nounits | wc -l) - fi + echo "visible_gpus: ${visible_gpus}" - echo "visible_gpus: ${visible_gpus}" - python dataset_split.py --split_num ${visible_gpus} --input_file ${dataset_location} --limit ${limit} + python dataset_split.py --split_num ${visible_gpus} --input_file ${dataset_location} --limit ${limit} - for ((i=0; i Date: Thu, 23 Oct 2025 21:58:39 +0800 Subject: [PATCH 24/25] Update run_benchmark.sh --- .../diffusers/flux/run_benchmark.sh | 64 ++++++++----------- 1 file changed, 25 insertions(+), 39 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh index c29d6c6baf1..ef920c7928d 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh @@ -53,54 +53,40 @@ function run_tuning { extra_cmd="--scheme MXFP8 --inference" fi - if ! command -v nvidia-smi &> /dev/null; then - echo "GPU is not available, use CPU for evaluation." 
+ if [ -n "$CUDA_VISIBLE_DEVICES" ]; then + gpu_list="${CUDA_VISIBLE_DEVICES:-}" + IFS=',' read -ra gpu_ids <<< "$gpu_list" + visible_gpus=${#gpu_ids[@]} + echo "visible_gpus: ${visible_gpus}" - python3 main.py \ - --model ${input_model} \ - --quantized_model_path ${tuned_checkpoint} \ - --limit ${limit} \ - --output_image_path ${output_image_path} \ - --eval_dataset ${dataset_location} \ - ${extra_cmd} - python3 main.py --output_image_path ${output_image_path} --accuracy - - else - if [ -n "$CUDA_VISIBLE_DEVICES" ]; then - gpu_list="${CUDA_VISIBLE_DEVICES:-}" - IFS=',' read -ra gpu_ids <<< "$gpu_list" - visible_gpus=${#gpu_ids[@]} - echo "visible_gpus: ${visible_gpus}" + python dataset_split.py --split_num ${visible_gpus} --input_file ${dataset_location} --limit ${limit} - python dataset_split.py --split_num ${visible_gpus} --input_file ${dataset_location} --limit ${limit} + for ((i=0; i Date: Thu, 23 Oct 2025 21:59:18 +0800 Subject: [PATCH 25/25] Update run_benchmark.sh --- .../pytorch/diffusion_model/diffusers/flux/run_benchmark.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh index ef920c7928d..7fe1006ba69 100644 --- a/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh +++ b/examples/pytorch/diffusion_model/diffusers/flux/run_benchmark.sh @@ -4,7 +4,7 @@ set -x function main { init_params "$@" - run_tuning + run_benchmark } @@ -41,8 +41,8 @@ function init_params { } -# run_tuning -function run_tuning { +# run_benchmark +function run_benchmark { dataset_location=${dataset_location:="captions_source.tsv"} limit=${limit:=-1} output_image_path=${output_image_path:="./tmp_imgs"}