From 825340e6384703b077c8dc19939409bab6cc4e6e Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre
Date: Tue, 2 Dec 2025 14:29:32 -0800
Subject: [PATCH 1/4] [mxfp8 moe training] parallelize along col blocks in
 scale blocked format kernel for groups along K

stack-info: PR: https://github.com/pytorch/ao/pull/3416, branch: danielvegamyhre/stack/85
---
 ...h_triton_mx_block_rearrange_2d_K_groups.py |  229 ++
 log.txt                                       | 1952 +++++++++++++++++
 setup.py                                      |    1 +
 test/prototype/moe_training/test_kernels.py   |   59 +
 .../mx_block_rearrange_2d_K_groups.cu         |  199 ++
 .../csrc/cuda/mx_kernels/mxfp8_extension.cpp  |   77 +
 .../test_mx_block_rearrange_standalone.py     |  217 ++
 .../moe_training/kernels/mxfp8/quant.py       |  188 +-
 .../moe_training/scaled_grouped_mm.py         |    8 +-
 torchao/prototype/mx_formats/kernels.py       |    9 +-
 10 files changed, 2927 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
 create mode 100644 log.txt
 create mode 100644 torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
 create mode 100644 torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py

diff --git a/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
new file mode 100644
index 0000000000..35245f13bf
--- /dev/null
+++ b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
@@ -0,0 +1,229 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import os
+from dataclasses import dataclass
+from typing import List
+
+import torch
+from tabulate import tabulate
+from torch.utils.cpp_extension import load
+from tqdm import tqdm
+
+from benchmarks.utils import benchmark_cuda_function_in_microseconds
+from torchao.prototype.moe_training.kernels.mxfp8 import (
+    triton_mx_block_rearrange_2d_K_groups,
+)
+from torchao.prototype.moe_training.kernels.mxfp8.quant import (
+    triton_mx_block_rearrange_2d_K_groups_naive,
+)
+from torchao.prototype.moe_training.utils import generate_jagged_offs
+
+# Build CUDA kernel directly using torch.utils.cpp_extension.load
+mxfp8_cuda = None
+try:
+    # Get the kernel source directory
+    KERNEL_DIR = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        "..",
+        "..",
+        "..",
+        "..",
+        "torchao",
+        "csrc",
+        "cuda",
+        "mx_kernels",
+    )
+    KERNEL_DIR = os.path.normpath(KERNEL_DIR)
+
+    print("Compiling CUDA kernel...")
+    mxfp8_cuda = load(
+        name="mxfp8_cuda",
+        sources=[
+            os.path.join(KERNEL_DIR, "mxfp8_extension.cpp"),
+            os.path.join(KERNEL_DIR, "mxfp8_cuda.cu"),
+            os.path.join(KERNEL_DIR, "mx_block_rearrange_2d_K_groups.cu"),
+        ],
+        extra_cuda_cflags=[
+            "-O3",
+            "--use_fast_math",
+            "-std=c++17",
+            "-gencode=arch=compute_90,code=sm_90",
+        ],
+        extra_cflags=["-O3", "-std=c++17"],
+        verbose=True,
+    )
+    print("✓ CUDA kernel compilation successful!")
+except (ImportError, RuntimeError) as e:
+    print(f"⚠ CUDA kernel not available: {e}")
+    print("The benchmark will only run 'naive' and 'parallel' Triton versions.\n")
+
+device = torch.device("cuda")
+
+# Needed since changing args to function causes recompiles
+torch._dynamo.config.cache_size_limit = 1000
+
+
+@dataclass(frozen=True)
+class ExperimentConfig:
+    input_shape: tuple[int, int]
+    num_groups: int
+    version: str  # "naive", "parallel", or "cuda"
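+
+
+# For reference, a rough pure-PyTorch sketch of what the benchmarked kernels
+# compute (a minimal, unoptimized illustration, not one of the benchmarked
+# implementations): each contiguous K-group of scale columns is swizzled into
+# the blocked scale layout independently, and the per-group results are
+# concatenated. This assumes `to_blocked` from
+# torchao.prototype.mx_formats.utils implements the target blocked layout.
+def reference_rearrange_2d_K_groups(
+    scales: torch.Tensor, group_end_offsets: torch.Tensor
+) -> torch.Tensor:
+    from torchao.prototype.mx_formats.utils import to_blocked
+
+    outs, start = [], 0
+    for end in group_end_offsets.tolist():
+        # Rearrange this group's slice of scale columns on its own.
+        outs.append(to_blocked(scales[:, start:end]).reshape(-1))
+        start = end
+    return torch.cat(outs)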
+
+
+@dataclass(frozen=True)
+class ExperimentResult:
+    time_us: float
+    mem_bw_gbps: float
+
+
+@dataclass(frozen=True)
+class Experiment:
+    config: ExperimentConfig
+    result: ExperimentResult
+
+
+def get_configs() -> List[ExperimentConfig]:
+    # Llama4 and DSV3 671b shapes. Inputs are the e8m0 scales of activations
+    # quantized along the contraction dim K, which for these grouped GEMMs is
+    # the token dim (total_M) containing all the token groups, hence
+    # K // block_size scale columns.
+    block_size = 32
+    input_shapes = [
+        (5120, 16384 // block_size),
+        (5120, 131072 // block_size),
+        (8192, 16384 // block_size),
+        (8192, 131072 // block_size),
+        (7168, 16384 // block_size),
+        (7168, 131072 // block_size),
+        (2048, 16384 // block_size),
+        (2048, 131072 // block_size),
+    ]
+    num_groups = [8]
+    # Only include the CUDA version if the extension compiled successfully.
+    versions = ["naive", "parallel"]
+    if mxfp8_cuda is not None:
+        versions.append("cuda")
+
+    configs = []
+    for shape, groups, version in itertools.product(
+        input_shapes,
+        num_groups,
+        versions,
+    ):
+        configs.append(
+            ExperimentConfig(
+                input_shape=shape,
+                num_groups=groups,
+                version=version,
+            )
+        )
+    return configs
+
+
+def run_experiment(config: ExperimentConfig) -> ExperimentResult:
+    input_shape, num_groups, version = (
+        config.input_shape,
+        config.num_groups,
+        config.version,
+    )
+    input_tensor = torch.randint(
+        low=0,
+        high=256,
+        size=input_shape,
+        dtype=torch.uint8,
+        device=device,
+    )
+
+    M, Kg = input_shape
+    block_size = 32
+    # Jagged group-end offsets along the scale-column dim; each offset is a
+    # cumulative group boundary that is a multiple of block_size.
+    input_group_offsets = generate_jagged_offs(num_groups, Kg, multiple_of=block_size)
+
+    # Select which kernel to benchmark based on version
+    if version == "naive":
+        kernel_fn = triton_mx_block_rearrange_2d_K_groups_naive
+    elif version == "parallel":
+        kernel_fn = triton_mx_block_rearrange_2d_K_groups
+    elif version == "cuda":
+        kernel_fn = mxfp8_cuda.mx_block_rearrange_2d_K_groups
+    else:
+        raise ValueError(f"Unknown version: {version}")
+
+    # Run kernel once to get output shape
+    out_scales = kernel_fn(
+        input_tensor,
+        input_group_offsets,
+    )
+
+    # Benchmark the kernel
+    assert input_tensor.is_contiguous()
+    time_us = benchmark_cuda_function_in_microseconds(
+        kernel_fn,
+        input_tensor,
+        input_group_offsets,
+    )
+
+    # Calculate memory bandwidth. Both input and output hold e8m0 scales,
+    # i.e. 1 byte per element.
+    bytes_per_input_el = torch.finfo(torch.float8_e8m0fnu).bits / 8
+    bytes_per_output_el = torch.finfo(torch.float8_e8m0fnu).bits / 8
+
+    read_bytes = input_tensor.numel() * bytes_per_input_el
+    write_bytes = out_scales.numel() * bytes_per_output_el
+
+    mem_bw_gbps = ((read_bytes + write_bytes) / 1e9) / (time_us / 1e6)
+
+    return ExperimentResult(
+        time_us=time_us,
+        mem_bw_gbps=mem_bw_gbps,
+    )
+
+
+def print_results(experiments: List[Experiment]):
+    # Group experiments by input shape
+    shapes_dict = {}
+    for exp in experiments:
+        shape_key = exp.config.input_shape
+        if shape_key not in shapes_dict:
+            shapes_dict[shape_key] = {}
+        shapes_dict[shape_key][exp.config.version] = exp.result
+
+    headers = [
+        "kernel_version",
+        "input_shape",
+        "time_us",
+        "mem_bw_gbps",
+        "fastest_version",
+    ]
+
+    rows = []
+    for shape, versions in shapes_dict.items():
+        # Find fastest version for this shape
+        fastest_version = min(versions.items(), key=lambda x: x[1].time_us)[0]
+
+        # Add rows for each version
+        for version, result in versions.items():
+            rows.append(
+                [
+                    version,
+                    f"({shape[0]}, {shape[1]})",
+                    f"{result.time_us:.2f}",
+                    round(result.mem_bw_gbps, 3),
+                    fastest_version,
+                ]
+            )
+
+    print(tabulate(rows, headers=headers))
+
+
+def main():
+    torch.random.manual_seed(123)
+    configs = get_configs()
+    results = []
+    for config in tqdm(configs):
+        result = run_experiment(config)
+        results.append(Experiment(config=config, result=result))
+
+    # Use tabulate to print results
+    print_results(results)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/log.txt b/log.txt
new file mode 100644
index 0000000000..e891bce92e
--- /dev/null
+++ b/log.txt
@@ -0,0 +1,1952 @@
+============================= test session starts ==============================
+platform linux -- Python 3.12.12, pytest-8.4.2, pluggy-1.6.0 -- /home/danvm/.conda/envs/release/bin/python3.12
+cachedir: .pytest_cache
+hypothesis profile 'default'
+rootdir: /home/danvm/ao
+configfile: pyproject.toml
+plugins: hypothesis-6.148.6, anyio-4.12.0
+collecting ... TMA benchmarks will be running without grid constant TMA descriptor.
+NOTE: Using slow Hadamard transform for SpinQuant. For better performance on GPU, install `fast_hadamard_transform`: `pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git`
+collected 10930 items / 9 skipped
+
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config0] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config1] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config2] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config3] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config4] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config5] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config6] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config7] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config8] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config9] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config10] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config11] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config12] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config13] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config14] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config15] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config16] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config17] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config18] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config19] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config20] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config21] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config22] PASSED
+test/core/test_config.py::test_granularity_serialization[granularity0] TMA benchmarks will be running without grid constant TMA descriptor.
+PASSED
+test/core/test_config.py::test_granularity_serialization[granularity1] TMA benchmarks will be running without grid constant TMA descriptor.
+PASSED
+test/core/test_config.py::test_granularity_serialization[granularity2] TMA benchmarks will be running without grid constant TMA descriptor.
+PASSED +test/core/test_config.py::test_disallowed_modules PASSED +test/core/test_config.py::test_version_mismatch PASSED +test/core/test_config.py::test_default_version PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant0 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant1 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant2 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant3 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant4 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant5 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_print_quantized_module FAILED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_register_new_dispatch PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_tensor_core_layout_transpose PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant0 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant1 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant2 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant3 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant4 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant5 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_affine_quantized_intx_static PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_device_apply_quant0 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_device_apply_quant1 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_device_apply_quant2 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_device_apply_quant3 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_device_apply_quant4 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_weights_only PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_alias_device_cpu_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_alias_device_cuda_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_flatten_unflatten_device_cpu_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_flatten_unflatten_device_cuda_bfloat16 FAILED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_matmul_cuda_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_mm_int4wo_cuda_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_slice_and_copy_int4wo_cuda_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_slice_gemlite_cuda_bfloat16 SKIPPED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_slice_gemlite_cuda_float16 SKIPPED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_slice_int4wo_cuda_bfloat16 PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_choose_scale_float8_bounds_float8_e4m3fn_bfloat16 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_choose_scale_float8_bounds_float8_e4m3fn_float32 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_choose_scale_float8_bounds_float8_e5m2_bfloat16 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_choose_scale_float8_bounds_float8_e5m2_float32 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_bfloat16_block_size0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_bfloat16_block_size1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_bfloat16_block_size2 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_bfloat16_block_size3 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_float32_block_size0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_float32_block_size1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_float32_block_size2 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_float32_block_size3 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_bfloat16_block_size0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_bfloat16_block_size1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_bfloat16_block_size2 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_bfloat16_block_size3 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_float32_block_size0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_float32_block_size1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_float32_block_size2 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_float32_block_size3 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_scale_broadcasting PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_expected_kernels_on_gpu_granularity0_float8_config_version_1 frames [('total', 1), ('ok', 1)] +stats [('calls_captured', 1), ('unique_graphs', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 34), ('benchmarking.InductorBenchmarker.benchmark_gpu', 34), ('coordesc_tuning_bench', 34), ('triton_bundler_save_kernel', 24), ('async_compile_cache_miss', 3), 
('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('triton_bundler_save_static_autotuner', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_512_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_expected_kernels_on_gpu_granularity0_float8_config_version_2 frames [('total', 1), ('ok', 1)] +stats [('calls_captured', 1), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 34), ('benchmarking.InductorBenchmarker.benchmark_gpu', 34), ('coordesc_tuning_bench', 34), ('triton_bundler_save_kernel', 24), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 3), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_512_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_expected_kernels_on_gpu_granularity1_float8_config_version_1 frames [('total', 1), ('ok', 1)] +stats [('calls_captured', 1), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 8), ('benchmarking.InductorBenchmarker.benchmark', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('coordesc_tuning_bench', 5), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('async_compile_cache_miss', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_hit', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_512_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_expected_kernels_on_gpu_granularity1_float8_config_version_2 frames [('total', 1), ('ok', 1)] +stats [('calls_captured', 1), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 8), ('benchmarking.InductorBenchmarker.benchmark', 6), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('coordesc_tuning_bench', 6), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('async_compile_cache_miss', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_hit', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_512_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_basic_granularity0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_basic_granularity1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_edge_cases PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_functional_correctness_granularity0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_functional_correctness_granularity1 PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_per_row PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_per_tensor PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_False_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 79), ('benchmarking.InductorBenchmarker.benchmark_gpu', 79), ('coordesc_tuning_bench', 79), ('triton_bundler_save_kernel', 48), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 6), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_256_128', 1), ('aten._scaled_mm.default_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 64), ('benchmarking.InductorBenchmarker.benchmark', 61), ('benchmarking.InductorBenchmarker.benchmark_gpu', 61), ('coordesc_tuning_bench', 57), ('async_compile_cache_miss', 12), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('async_compile_cache_hit', 6), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_4096_64_256', 1), ('aten._scaled_mm.default_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_True_granularity1_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 32), ('benchmarking.InductorBenchmarker.benchmark', 24), ('benchmarking.InductorBenchmarker.benchmark_gpu', 24), ('coordesc_tuning_bench', 21), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_256_128', 1), ('aten._scaled_mm.default_128_128_256', 1)] +PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_True_granularity1_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 32), ('benchmarking.InductorBenchmarker.benchmark', 18), ('benchmarking.InductorBenchmarker.benchmark_gpu', 18), ('coordesc_tuning_bench', 15), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_4096_64_256', 1), ('aten._scaled_mm.default_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_False_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 24), ('benchmarking.InductorBenchmarker.benchmark_gpu', 24), ('coordesc_tuning_bench', 24), ('triton_bundler_save_kernel', 16), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_256_128', 1), ('aten._scaled_mm.default_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 32), ('benchmarking.InductorBenchmarker.benchmark', 19), ('benchmarking.InductorBenchmarker.benchmark_gpu', 19), ('coordesc_tuning_bench', 15), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_4096_64_256', 1), ('aten._scaled_mm.default_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_True_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_True_granularity1_sizes1 PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_False_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 56), ('benchmarking.InductorBenchmarker.benchmark', 45), ('benchmarking.InductorBenchmarker.benchmark_gpu', 45), ('coordesc_tuning_bench', 39), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_128_256_128', 1), ('aten.mm_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 48), ('benchmarking.InductorBenchmarker.benchmark', 39), ('benchmarking.InductorBenchmarker.benchmark_gpu', 39), ('coordesc_tuning_bench', 34), ('pattern_matcher_nodes', 5), ('pattern_matcher_count', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('removed_pointless_view_pair', 1), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_4096_64_256', 1), ('aten.mm_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_True_granularity1_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 57), ('benchmarking.InductorBenchmarker.benchmark_gpu', 57), ('triton_bundler_save_kernel', 56), ('coordesc_tuning_bench', 51), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_128_256_128', 1), ('aten.mm_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_True_granularity1_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 48), ('benchmarking.InductorBenchmarker.benchmark', 32), ('benchmarking.InductorBenchmarker.benchmark_gpu', 32), ('coordesc_tuning_bench', 27), 
('pattern_matcher_nodes', 5), ('pattern_matcher_count', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('removed_pointless_view_pair', 1), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_4096_64_256', 1), ('aten.mm_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_False_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 69), ('benchmarking.InductorBenchmarker.benchmark_gpu', 69), ('coordesc_tuning_bench', 69), ('triton_bundler_save_kernel', 48), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 6), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_256_128', 1), ('aten._scaled_mm.default_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 64), ('benchmarking.InductorBenchmarker.benchmark', 64), ('benchmarking.InductorBenchmarker.benchmark_gpu', 64), ('coordesc_tuning_bench', 60), ('async_compile_cache_miss', 12), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('async_compile_cache_hit', 6), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_4096_64_256', 1), ('aten._scaled_mm.default_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_True_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_True_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_False_granularity1_sizes0 PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 18), ('benchmarking.InductorBenchmarker.benchmark_gpu', 18), ('coordesc_tuning_bench', 18), ('triton_bundler_save_kernel', 16), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_256_128', 1), ('aten._scaled_mm.default_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 32), ('benchmarking.InductorBenchmarker.benchmark', 25), ('benchmarking.InductorBenchmarker.benchmark_gpu', 25), ('coordesc_tuning_bench', 21), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_4096_64_256', 1), ('aten._scaled_mm.default_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_True_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_True_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_False_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 20), ('benchmarking.InductorBenchmarker.benchmark_gpu', 20), ('coordesc_tuning_bench', 20), ('triton_bundler_save_kernel', 16), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info 
[('aten.mm_128_256_128', 1), ('aten.mm_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 48), ('benchmarking.InductorBenchmarker.benchmark', 41), ('benchmarking.InductorBenchmarker.benchmark_gpu', 41), ('coordesc_tuning_bench', 36), ('pattern_matcher_nodes', 5), ('pattern_matcher_count', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('removed_pointless_view_pair', 1), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_4096_64_256', 1), ('aten.mm_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_True_granularity1_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 26), ('benchmarking.InductorBenchmarker.benchmark_gpu', 26), ('coordesc_tuning_bench', 26), ('triton_bundler_save_kernel', 16), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_128_256_128', 1), ('aten.mm_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_True_granularity1_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 48), ('benchmarking.InductorBenchmarker.benchmark', 39), ('benchmarking.InductorBenchmarker.benchmark_gpu', 39), ('coordesc_tuning_bench', 34), ('pattern_matcher_nodes', 5), ('pattern_matcher_count', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('removed_pointless_view_pair', 1), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_4096_64_256', 1), ('aten.mm_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_weight_dimension_warning ToyLinearModel( + (linear1): Linear(in_features=10, out_features=25, bias=False, weight=Tensor: ) + (linear2): Linear(in_features=25, out_features=10, bias=False, weight=Tensor: ) +) +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_invalid_granularity PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mismatched_granularity PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape0_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape0_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape1_bias_False PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape1_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape2_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape2_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape3_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape3_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape4_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape4_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape0_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape0_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape1_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape1_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape2_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape2_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape3_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape3_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape4_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape4_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape0_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape0_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape1_bias_False PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape1_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape2_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape2_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape3_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape3_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape4_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape4_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_per_row_with_float32 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_preprocess_scale_3d_reshape PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_quantize_dequantize_fp8_inductor_float8_e4m3fn_bfloat16 frames [('total', 2), ('ok', 2)] +stats [('calls_captured', 2), ('unique_graphs', 2)] +aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] +inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2)] +graph_break [] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_quantize_dequantize_fp8_inductor_float8_e4m3fn_float32 frames [('total', 2), ('ok', 2)] +stats [('calls_captured', 2), ('unique_graphs', 2)] +aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] +inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2)] +graph_break [] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_quantize_dequantize_fp8_inductor_float8_e5m2_bfloat16 frames [('total', 2), ('ok', 2)] +stats [('calls_captured', 2), ('unique_graphs', 2)] +aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] +inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2)] +graph_break [] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_quantize_dequantize_fp8_inductor_float8_e5m2_float32 frames [('total', 2), ('ok', 2)] +stats [('calls_captured', 2), ('unique_graphs', 2)] +aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] +inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2)] +graph_break [] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_serialization_mode_dynamic PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_serialization_mode_static PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_serialization_mode_weight-only PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_unsupported_granularity PASSED 
+test/dtypes/test_affine_quantized_tensor_parallel.py::TestInt8woAffineQuantizedTensorParallel::test_tp_bfloat16 I1204 14:49:48.793000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 3994849 +I1204 14:49:48.795000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 3994853 +I1204 14:49:48.797000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 3994857 +I1204 14:49:48.799000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 3994860 +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +NCCL version 2.27.5+cuda12.9 +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +PASSED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestInt8woAffineQuantizedTensorParallel::test_tp_float16 I1204 14:50:02.048000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4001302 +I1204 14:50:02.050000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4001308 +I1204 14:50:02.058000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4001314 +I1204 14:50:02.060000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4001329 +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. 
+/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +NCCL version 2.27.5+cuda12.9 +PASSED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestInt8woAffineQuantizedTensorParallel::test_tp_float32 I1204 14:50:18.788000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4019773 +I1204 14:50:18.790000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4019776 +I1204 14:50:18.792000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4019780 +I1204 14:50:18.795000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4019786 +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. 
+/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +NCCL version 2.27.5+cuda12.9 +PASSED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestInt4woAffineQuantizedTensorParallel::test_tp_bfloat16 SKIPPED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestGemliteLayoutTensorParallel::test_tp_gemlite_float16 SKIPPED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestInt8dqAffineQuantizedTensorParallel::test_tp_bfloat16 I1204 14:50:35.226000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4034897 +I1204 14:50:35.228000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4034904 +I1204 14:50:35.230000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4034908 +I1204 14:50:35.232000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4034913 +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. 
+/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1566: UserWarning: Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1566: UserWarning: Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1566: UserWarning: Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1566: UserWarning: Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +NCCL version 2.27.5+cuda12.9 +PASSED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8woAffineQuantizedTensorParallel::test_tp_bfloat16 I1204 14:50:53.577000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4050466 +I1204 14:50:53.579000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4050467 +I1204 14:50:53.581000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4050469 +I1204 14:50:53.583000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4050471 +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. 
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception:
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last):
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     getattr(self, test_name)()
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     fn()
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     method(*args, **kwargs)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     test(self, **param_kwargs)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     raise e
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     func(self, *args, **kwargs)  # type: ignore[misc]
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 210, in test_tp
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return self._test_tp(dtype)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     up_dist = self.colwise_shard(up_quant, mesh)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)])
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return _FromTorchTensor.apply(  # pyre-ignore[16]: autograd func
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return super().apply(*args, **kwargs)  # type: ignore[misc]
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     input.view_as(input),
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return func(*args, **kwargs)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return _func(f, types, args, kwargs)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     scale_shape.append(qdata.shape[i] // self.block_size[i])
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]                        ~~~~~~~~~~~^^^
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir:
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8woAffineQuantizedTensorParallel.test_tp_bfloat16
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 1 with exit code: 10
+[the identical traceback was raised on ranks 3, 2, and 0 (pids 4050471, 4050469, 4050466), each exiting with exit code: 10]
+Process 1 terminated with exit code 10, terminating remaining processes.
+FAILED
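Aside on the failure above (not part of the log): the traceback suggests float8_tensor.py line 866 indexes self.block_size with every dimension index of qdata, so the IndexError fires when block_size has fewer entries than the loop expects. A minimal sketch of that failure mode, with illustrative shapes and a hypothetical 1-D block_size — stand-ins, not the actual Float8Tensor internals:

import torch

# Illustrative stand-ins: a 2-D quantized shard paired with a block_size
# tuple that has only one entry.
qdata = torch.zeros(256, 512)
block_size = (512,)

scale_shape = []
for i in range(qdata.ndim):
    # At i == 1 this indexes past the end of the 1-element block_size tuple,
    # raising "IndexError: tuple index out of range" as in the log above.
    scale_shape.append(qdata.shape[i] // block_size[i])

The same traceback repeats below for the float16 and float32 variants of the test.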
+test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8woAffineQuantizedTensorParallel::test_tp_float16 I1204 14:50:59.619000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4052343
+I1204 14:50:59.621000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4052344
+I1204 14:50:59.625000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4052346
+I1204 14:50:59.627000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4052348
+TMA benchmarks will be running without grid constant TMA descriptor.
+[ranks 0, 3, and 1 (pids 4052343, 4052348, 4052344) raised the identical IndexError traceback shown above, here for test_tp_float16 (repro: python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8woAffineQuantizedTensorParallel.test_tp_float16); each exited with exit code: 10]
+Process 0 terminated with exit code 10, terminating remaining processes.
+FAILED
+test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8woAffineQuantizedTensorParallel::test_tp_float32 I1204 14:51:05.970000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4055063
+I1204 14:51:05.972000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4055068
+I1204 14:51:05.974000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4055071
+I1204 14:51:05.976000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4055074
+TMA benchmarks will be running without grid constant TMA descriptor.
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception:
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last):
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]     getattr(self, test_name)()
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]     fn()
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]     method(*args, **kwargs)
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]     test(self, **param_kwargs)
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]     raise e
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File
"/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] func(self, *args, **kwargs) # type: ignore[misc] +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 210, in test_tp +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return self._test_tp(dtype) +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] up_dist = self.colwise_shard(up_quant, mesh) +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)]) +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return _FromTorchTensor.apply( # pyre-ignore[16]: autograd func +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return super().apply(*args, **kwargs) # type: ignore[misc] +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] input.view_as(input), +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__ +E1204 14:51:12.993000 4055068 
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return func(*args, **kwargs)
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs)
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return _func(f, types, args, kwargs)
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] scale_shape.append(qdata.shape[i] // self.block_size[i])
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ~~~~~~~~~~~^^^
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir:
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8woAffineQuantizedTensorParallel.test_tp_float32
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 1 with exit code: 10
+[... identical IndexError tracebacks from processes 2, 3, and 0 (pids 4055071, 4055074, 4055063) omitted ...]
+Process 1 terminated with exit code 10, terminating remaining processes.
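Every rank fails on the same line of float8_tensor.py: scale_shape.append(qdata.shape[i] // self.block_size[i]), reached when DTensor.from_local dispatches view_as on the quantized local shard. Below is a minimal sketch of that failure mode, assuming the loop index runs over the length of block_size while qdata has lower rank; the helper name infer_scale_shape and the shapes are illustrative assumptions, not the torchao implementation or the test's real sizes.

    # Hypothetical sketch (not the torchao code) of the logged IndexError:
    # if block_size has more entries than qdata has dimensions, indexing
    # qdata.shape[i] walks off the end of the shape tuple.
    import torch

    def infer_scale_shape(qdata: torch.Tensor, block_size: tuple[int, ...]) -> list[int]:
        scale_shape = []
        for i in range(len(block_size)):
            # Raises IndexError: tuple index out of range once i >= qdata.ndim
            scale_shape.append(qdata.shape[i] // block_size[i])
        return scale_shape

    qdata = torch.zeros(256, 128, dtype=torch.float8_e4m3fn)
    print(infer_scale_shape(qdata, (1, 128)))   # ranks match: [256, 1]

    try:
        infer_scale_shape(qdata[0], (1, 128))   # 1-D shard vs. 2-D block_size
    except IndexError as e:
        print(f"IndexError: {e}")               # tuple index out of range

The same mismatch would explain why all four ranks and every dtype variant fail identically: the block_size metadata kept by the quantized tensor does not shrink with the tensor rank seen during view_as dispatch.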
+FAILED
+test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8dqTensorAffineQuantizedTensorParallel::test_tp_bfloat16 I1204 14:51:13.816000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4061064
+I1204 14:51:13.818000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4061080
+I1204 14:51:13.820000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4061085
+I1204 14:51:13.821000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4061089
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception:
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last):
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] getattr(self, test_name)()
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] fn()
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] method(*args, **kwargs)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] test(self, **param_kwargs)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] raise e
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] func(self, *args, **kwargs) # type: ignore[misc]
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 223, in test_tp
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return self._test_tp(dtype)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] up_dist = self.colwise_shard(up_quant, mesh)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)])
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return _FromTorchTensor.apply( # pyre-ignore[16]: autograd func
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return super().apply(*args, **kwargs) # type: ignore[misc]
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] input.view_as(input),
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return func(*args, **kwargs)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return _func(f, types, args, kwargs)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] scale_shape.append(qdata.shape[i] // self.block_size[i])
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ~~~~~~~~~~~^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir:
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8dqTensorAffineQuantizedTensorParallel.test_tp_bfloat16
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 1 with exit code: 10
+[... identical IndexError tracebacks from processes 3 and 2 (pids 4061089, 4061085) omitted ...]
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception:
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last):
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] getattr(self, test_name)()
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] fn()
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] method(*args, **kwargs)
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] test(self, **param_kwargs)
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] raise e
"/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] func(self, *args, **kwargs) # type: ignore[misc] +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 223, in test_tp +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] return self._test_tp(dtype) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] up_dist = self.colwise_shard(up_quant, mesh) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)]) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] return _FromTorchTensor.apply( # pyre-ignore[16]: autograd func +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] return super().apply(*args, **kwargs) # type: ignore[misc] +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] input.view_as(input), +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__ +E1204 14:51:18.235000 4061064 
site-packages/torch/testing/_internal/common_distributed.py:935] return func(*args, **kwargs) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] return _func(f, types, args, kwargs) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] scale_shape.append(qdata.shape[i] // self.block_size[i]) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ~~~~~~~~~~~^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir: +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8dqTensorAffineQuantizedTensorParallel.test_tp_bfloat16 +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 0 with exit code: 10 +Process 1 terminated with exit code 10, terminating remaining processes. +FAILED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8dqTensorAffineQuantizedTensorParallel::test_tp_float16 I1204 14:51:18.662000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4064730 +I1204 14:51:18.664000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4064731 +I1204 14:51:18.666000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4064733 +I1204 14:51:18.668000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4064735 +TMA benchmarks will be running without grid constant TMA descriptor. 
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+[... identical IndexError tracebacks from processes 0-3 (pids 4064730, 4064731, 4064735, 4064733) omitted; each exits with code 10 ...]
+Process 0 terminated with exit code 10, terminating remaining processes.
+FAILED
+test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8dqTensorAffineQuantizedTensorParallel::test_tp_float32 I1204 14:51:26.309000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4070526
+I1204 14:51:26.311000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4070527
+I1204 14:51:26.313000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4070529
+I1204 14:51:26.315000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4070531
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+[... identical IndexError tracebacks from processes 0 (pid 4070526) and 3 (pid 4070531) omitted; each exits with code 10 ...]
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception: +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last): +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] getattr(self, test_name)() +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] fn() +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] method(*args, **kwargs) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] test(self, **param_kwargs) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] raise e +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] func(self, *args, **kwargs) # type: ignore[misc] +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 223, in test_tp +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return self._test_tp(dtype) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] up_dist = self.colwise_shard(up_quant, mesh) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File 
"/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)]) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return _FromTorchTensor.apply( # pyre-ignore[16]: autograd func +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return super().apply(*args, **kwargs) # type: ignore[misc] +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] input.view_as(input), +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return func(*args, **kwargs) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return _func(f, types, args, kwargs) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File 
"/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] scale_shape.append(qdata.shape[i] // self.block_size[i]) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ~~~~~~~~~~~^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir: +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8dqTensorAffineQuantizedTensorParallel.test_tp_float32 +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 2 with exit code: 10 +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception: +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last): +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] getattr(self, test_name)() +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] fn() +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] method(*args, **kwargs) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] test(self, **param_kwargs) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] raise e +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File 
"/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] func(self, *args, **kwargs) # type: ignore[misc] +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 223, in test_tp +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] return self._test_tp(dtype) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] up_dist = self.colwise_shard(up_quant, mesh) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)]) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] return _FromTorchTensor.apply( # pyre-ignore[16]: autograd func +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] return super().apply(*args, **kwargs) # type: ignore[misc] +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] input.view_as(input), +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__ +E1204 14:51:30.883000 4070527 
site-packages/torch/testing/_internal/common_distributed.py:935] return func(*args, **kwargs) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] return _func(f, types, args, kwargs) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] scale_shape.append(qdata.shape[i] // self.block_size[i]) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ~~~~~~~~~~~^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir: +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8dqTensorAffineQuantizedTensorParallel.test_tp_float32 +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 1 with exit code: 10 +Process 0 terminated with exit code 10, terminating remaining processes. +FAILED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8dqRowAffineQuantizedTensorParallel::test_tp_bfloat16 I1204 14:51:31.250000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4074998 +I1204 14:51:31.255000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4074999 +I1204 14:51:31.258000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4075002 +I1204 14:51:31.259000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4075003 +TMA benchmarks will be running without grid constant TMA descriptor. 
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception:
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]     scale_shape.append(qdata.shape[i] // self.block_size[i])
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]                        ~~~~~~~~~~~^^^
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir:
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]     python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8dqRowAffineQuantizedTensorParallel.test_tp_bfloat16
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 3 with exit code: 10
+E1204 14:51:37.436000 4074998 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 0 with exit code: 10
+E1204 14:51:37.475000 4075002 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 2 with exit code: 10
+E1204 14:51:37.530000 4074999 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 1 with exit code: 10
+Process 0 terminated with exit code 10, terminating remaining processes.
+FAILED +test/dtypes/test_bitpacking.py::test_CPU[0-1] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-2] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-3] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-4] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-5] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-6] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-7] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-1] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-2] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-3] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-4] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-5] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-6] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-7] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-1] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-2] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-3] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-4] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-5] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-6] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-7] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-1] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-2] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-3] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-4] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-5] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-6] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-7] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-1] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-2] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-3] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-4] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-5] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-6] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-7] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-1] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-2] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-3] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-4] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-5] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-6] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-7] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-1] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-2] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-3] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-4] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-5] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-6] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-7] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-1] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-2] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-3] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-4] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-5] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-6] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-7] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-1] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-2] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-3] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-4] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-5] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-6] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-7] PASSED 
diff --git a/setup.py b/setup.py
index 9d2d7bce1c..9136371a4d 100644
--- a/setup.py
+++ b/setup.py
@@ -702,6 +702,7 @@ def get_extensions():
         mxfp8_sources = [
             os.path.join(mxfp8_extension_dir, "mxfp8_extension.cpp"),
             os.path.join(mxfp8_extension_dir, "mxfp8_cuda.cu"),
+            os.path.join(mxfp8_extension_dir, "mx_block_rearrange_2d_K_groups.cu"),
         ]

         # Only add the extension if the source files exist AND we are building for sm100
diff --git a/test/prototype/moe_training/test_kernels.py b/test/prototype/moe_training/test_kernels.py
index ecd4cefe6a..d59eb62906 100644
--- a/test/prototype/moe_training/test_kernels.py
+++ b/test/prototype/moe_training/test_kernels.py
@@ -354,3 +354,62 @@ def test_cuda_mx_dim1_3d_numerics(E, N, K, input_dtype, scaling_mode):
     # Check quantized values
     torch.testing.assert_close(y_d1, y_d1_ref, rtol=0, atol=0)
     assert y_d1.stride() == y_d1_ref.stride(), "quantized tensor strides do not match"
+
+
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="MXFP8 requires CUDA capability 10.0 or greater",
+)
+@pytest.mark.parametrize("m", [256, 512, 1024, 5120])
+@pytest.mark.parametrize("total_k", [512, 1024, 2048, 4096, 8192, 16384])
+@pytest.mark.parametrize("n_groups", [1, 4, 8, 16])
+def test_cuda_mx_block_rearrange_2d_K_groups(
+    m: int,
+    total_k: int,
+    n_groups: int,
+):
+    """
+    Test CUDA kernel for mx_block_rearrange_2d_K_groups against the Triton reference.
+    This kernel rearranges E8M0 scales to block-scaled swizzle format for cuBLAS Tmem.
+    """
+    from torchao.prototype import mxfp8_cuda
+
+    device = "cuda"
+    block_size = 32
+    input_data = torch.randn(m, total_k, device=device)
+
+    e8m0_scales, _ = to_mx(
+        input_data, elem_dtype=torch.float8_e4m3fn, block_size=block_size
+    )
+
+    # Generate group end offsets along total_K, then divide by block_size to get scale group end offsets
+    input_group_offsets = generate_jagged_offs(
+        n_groups, total_k, multiple_of=block_size, device=device
+    )
+    scale_group_offsets = input_group_offsets // block_size
+
+    # Triton reference implementation
+    triton_out_scales = triton_mx_block_rearrange_2d_K_groups(
+        e8m0_scales,
+        scale_group_offsets,
+    )
+
+    # CUDA kernel implementation
+    cuda_out_scales = mxfp8_cuda.mx_block_rearrange_2d_K_groups(
+        e8m0_scales.view(torch.uint8),
+        scale_group_offsets,
+    )
+
+    # Check that outputs match
+    assert torch.equal(triton_out_scales, cuda_out_scales.view(torch.float8_e8m0fnu)), (
+        "CUDA and Triton blocked scales not equal"
+    )
+
+    # Verify output shape
+    expected_rows = ((m + 127) // 128) * 128  # Padded to multiple of 128
+    expected_cols = (
+        e8m0_scales.size(1) + n_groups * 4
+    )  # Original cols + padding per group
+    assert cuda_out_scales.shape == (expected_rows, expected_cols), (
+        f"Output shape mismatch: expected {(expected_rows, expected_cols)}, got {cuda_out_scales.shape}"
+    )
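To make the shape arithmetic the test asserts concrete, here is a minimal Python sketch; the helper name and the example sizes are hypothetical, not part of the patch:

# Illustrative only: mirrors the shape math asserted in the test above.
def blocked_scale_shape(m: int, total_k: int, n_groups: int, block_size: int = 32):
    scale_cols = total_k // block_size       # one e8m0 scale per 32 input cols
    padded_rows = ((m + 127) // 128) * 128   # rows padded to a multiple of 128
    padded_cols = scale_cols + n_groups * 4  # upper-bound padding of 4 cols per group
    return padded_rows, padded_cols

# e.g. m=256, total_k=2048, n_groups=4 -> scales are 256x64, output is 256x80
assert blocked_scale_shape(256, 2048, 4) == (256, 80)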
+ """ + from torchao.prototype import mxfp8_cuda + + device = "cuda" + block_size = 32 + input_data = torch.randn(m, total_k, device=device) + + e8m0_scales, _ = to_mx( + input_data, elem_dtype=torch.float8_e4m3fn, block_size=block_size + ) + + # Generate group end offsets along total_K, then divide by block_size to get scale group end offsets + input_group_offsets = generate_jagged_offs( + n_groups, total_k, multiple_of=block_size, device=device + ) + scale_group_offsets = input_group_offsets // block_size + + # Triton reference implementation + triton_out_scales = triton_mx_block_rearrange_2d_K_groups( + e8m0_scales, + scale_group_offsets, + ) + + # CUDA kernel implementation + cuda_out_scales = mxfp8_cuda.mx_block_rearrange_2d_K_groups( + e8m0_scales.view(torch.uint8), + scale_group_offsets, + ) + + # Check that outputs match + assert torch.equal(triton_out_scales, cuda_out_scales.view(torch.float8_e8m0fnu)), ( + "CUDA and Triton blocked scales not equal" + ) + + # Verify output shape + expected_rows = ((m + 127) // 128) * 128 # Padded to multiple of 128 + expected_cols = ( + e8m0_scales.size(1) + n_groups * 4 + ) # Original cols + padding per group + assert cuda_out_scales.shape == (expected_rows, expected_cols), ( + f"Output shape mismatch: expected {(expected_rows, expected_cols)}, got {cuda_out_scales.shape}" + ) diff --git a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu new file mode 100644 index 0000000000..c97baec10d --- /dev/null +++ b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu @@ -0,0 +1,199 @@ +#include +#include +#include +#include + +#define BLOCK_ROWS 128 +#define BLOCK_COLS 4 + +// Helper function to compute ceil division +__device__ __forceinline__ int ceil_div(int a, int b) { + return (a + b - 1) / b; +} + +// Helper function to compute the start index of a group after padding +__device__ __forceinline__ int compute_output_group_start_col( + int group_id, + const int32_t* input_group_end_offsets, + int num_groups, + int padding_size +) { + int start_idx = 0; + + // Compute prefix sum of padded group sizes + for (int i = 0; i < group_id; i++) { + int prev_offset = (i > 0) ? input_group_end_offsets[i - 1] : 0; + int curr_offset = input_group_end_offsets[i]; + int group_size = curr_offset - prev_offset; + int padded_size = ceil_div(group_size, padding_size) * padding_size; + start_idx += padded_size; + } + + return start_idx; +} + +// Compute destination index for swizzled block layout +// For a 128x4 block: r_div_32 = row / 32, r_mod_32 = row % 32 +// Swizzle: dest = r_mod_32 * 16 + r_div_32 * 4 + col +__device__ __forceinline__ int compute_swizzled_index(int row, int col) { + int r_div_32 = row / 32; + int r_mod_32 = row % 32; + return r_mod_32 * 16 + r_div_32 * 4 + col; +} + +__global__ void mx_block_rearrange_2d_K_groups_naive_kernel( + const uint8_t* __restrict__ scales_ptr, + int scales_stride_dim0, + int scale_rows, + int scale_cols, + int padded_rows, + const int32_t* __restrict__ input_group_end_offsets, + uint8_t* __restrict__ output_scales_ptr, + int output_stride_per_block, + int num_groups +) { + const int group_id = blockIdx.x; + const int block_row_id = blockIdx.y; + const int tid = threadIdx.x; // 128 threads, each handles one row + + // Shared memory for one 128x4 block + __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS]; + + // Get start/end cols of this input group + int input_group_start_col = (group_id > 0) ? 
+  int input_group_start_col = (group_id > 0) ? input_group_end_offsets[group_id - 1] : 0;
+  int input_group_end_col = input_group_end_offsets[group_id];
+  int num_cols_in_group = input_group_end_col - input_group_start_col;
+
+  // Get output group start column
+  int output_group_start_col = compute_output_group_start_col(
+      group_id,
+      input_group_end_offsets,
+      num_groups,
+      4);  // scaling factor column padding size
+
+  // Compute base offset for this group in output
+  int out_group_base_offset = output_group_start_col * padded_rows;
+
+  // Compute stride per row of blocks in this group
+  int num_col_blocks_in_group = ceil_div(num_cols_in_group, BLOCK_COLS);
+  int stride_per_row_of_blocks_in_group = num_col_blocks_in_group * output_stride_per_block;
+
+  // Each thread handles one row
+  int input_row = block_row_id * BLOCK_ROWS + tid;
+
+  // Loop through column blocks in this group
+  int curr_input_start_col = input_group_start_col;
+  int curr_out_col_block = 0;
+
+  while (curr_input_start_col < input_group_end_col) {
+    // Calculate how many columns to load for this block
+    int cols_remaining = input_group_end_col - curr_input_start_col;
+    int cols_to_load = min(BLOCK_COLS, cols_remaining);
+
+    // Load data for this row using vectorized loads when possible
+    uint32_t row_data = 0;
+
+    if (input_row < scale_rows && curr_input_start_col < input_group_end_col) {
+      int input_offset = input_row * scales_stride_dim0 + curr_input_start_col;
+      const uint8_t* input_ptr = scales_ptr + input_offset;
+
+      // Check alignment and available columns within this group
+      uintptr_t ptr_addr = reinterpret_cast<uintptr_t>(input_ptr);
+
+      if (cols_to_load >= 4 && ptr_addr % 4 == 0 && curr_input_start_col + 4 <= input_group_end_col) {
+        // 4-byte aligned and have 4 columns within group: use uint32_t load
+        row_data = *reinterpret_cast<const uint32_t*>(input_ptr);
+      } else {
+        // Byte-by-byte loads for unaligned or partial blocks
+        uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
+        for (int i = 0; i < cols_to_load && (curr_input_start_col + i) < input_group_end_col; i++) {
+          row_bytes[i] = input_ptr[i];
+        }
+      }
+    }
+
+    // Write to swizzled positions in shared memory
+    uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
+
+    #pragma unroll
+    for (int col = 0; col < BLOCK_COLS; col++) {
+      int swizzled_idx = compute_swizzled_index(tid, col);
+      smem_block[swizzled_idx] = row_bytes[col];
+    }
+
+    __syncthreads();
+
+    // Write from shared memory to global memory
+    // Calculate the output offset for this specific block
+    int offset_in_group = block_row_id * stride_per_row_of_blocks_in_group +
+                          curr_out_col_block * output_stride_per_block;
+    int final_offset = out_group_base_offset + offset_in_group;
+
+    // Each thread writes 4 bytes (one row of the 128x4 block)
+    uint8_t* output_ptr = output_scales_ptr + final_offset + tid * BLOCK_COLS;
+
+    // Check output alignment for vectorized write
+    uintptr_t out_ptr_addr = reinterpret_cast<uintptr_t>(output_ptr);
+    if (out_ptr_addr % 4 == 0) {
+      // Aligned: use uint32_t store
+      *reinterpret_cast<uint32_t*>(output_ptr) =
+          *reinterpret_cast<const uint32_t*>(&smem_block[tid * BLOCK_COLS]);
+    } else {
+      // Unaligned: byte by byte
+      const uint8_t* smem_ptr = &smem_block[tid * BLOCK_COLS];
+      #pragma unroll
+      for (int i = 0; i < BLOCK_COLS; i++) {
+        output_ptr[i] = smem_ptr[i];
+      }
+    }
+
+    __syncthreads();
+
+    // Advance to next column block
+    curr_input_start_col += BLOCK_COLS;
+    curr_out_col_block += 1;
+  }
+}
+
+// Host function to launch the kernel
+namespace mxfp8 {
+
+void launch_mx_block_rearrange_2d_K_groups(
+    const uint8_t* scales_ptr,
+    int scales_stride_dim0,
+    int scale_rows,
+    int scale_cols,
+    int padded_rows,
+    const int32_t* input_group_end_offsets,
+    uint8_t* output_scales_ptr,
+    int num_groups,
+    cudaStream_t stream
+) {
+  int num_row_blocks = (scale_rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
+
+  // Grid parallelizes over (num_groups, num_row_blocks)
+  // Each thread block loops through column blocks within its group
+  dim3 grid(num_groups, num_row_blocks);
+  dim3 block(128);  // 128 threads, each handling one row
+
+  int output_stride_per_block = BLOCK_ROWS * BLOCK_COLS;
+
+  mx_block_rearrange_2d_K_groups_naive_kernel<<<grid, block, 0, stream>>>(
+      scales_ptr,
+      scales_stride_dim0,
+      scale_rows,
+      scale_cols,
+      padded_rows,
+      input_group_end_offsets,
+      output_scales_ptr,
+      output_stride_per_block,
+      num_groups
+  );
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("CUDA Error: %s\n", cudaGetErrorString(err));
+  }
+}
+
+} // namespace mxfp8
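The swizzle formula above is easiest to verify in scalar form. A minimal Python sketch of the same 128x4 block permutation (illustrative only, not part of the patch):

# Illustrative reference for compute_swizzled_index: within a 128x4 block,
# rows sharing the same (row % 32) land 16 bytes apart, grouped by row // 32.
def swizzle_block(block):  # block: 128 rows of 4 scale bytes each
    out = [0] * (128 * 4)
    for row in range(128):
        for col in range(4):
            dest = (row % 32) * 16 + (row // 32) * 4 + col
            out[dest] = block[row][col]
    return out  # a bijective permutation of the 512 bytes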
diff --git a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
index d445fcad4d..55db12c40e 100644
--- a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
+++ b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
@@ -25,6 +25,17 @@ void mxfp8_quantize_3d_cuda(const torch::Tensor &input,
                             const std::string &fp8_format,
                             const std::string &scaling_mode);

+void launch_mx_block_rearrange_2d_K_groups(
+    const uint8_t* scales_ptr,
+    int scales_stride_dim0,
+    int scale_rows,
+    int scale_cols,
+    int padded_rows,
+    const int32_t* input_group_end_offsets,
+    uint8_t* output_scales_ptr,
+    int num_groups,
+    cudaStream_t stream);
+
 // Helper for tensor validation
 void check_cuda_tensor(const torch::Tensor &t, const char *name) {
   TORCH_CHECK(t.is_cuda(), name, " must be a CUDA tensor");
@@ -177,6 +188,66 @@ mxfp8_quantize_3d(torch::Tensor input, int64_t scale_dim_n,
   return std::make_tuple(output_colwise, scales_colwise);
 }

+// Python wrapper for mx_block_rearrange_2d_K_groups
+torch::Tensor mx_block_rearrange_2d_K_groups(
+    torch::Tensor scales_tensor,
+    torch::Tensor input_group_end_offsets) {
+
+  // Validate inputs
+  check_cuda_tensor(scales_tensor, "scales_tensor");
+  check_cuda_tensor(input_group_end_offsets, "input_group_end_offsets");
+
+  TORCH_CHECK(scales_tensor.dim() == 2, "scales_tensor must be 2D");
+  TORCH_CHECK(scales_tensor.scalar_type() == torch::kUInt8 ||
+              scales_tensor.scalar_type() == torch::kFloat8_e8m0fnu,
+              "scales_tensor must be uint8 or e8m0");
+  TORCH_CHECK(input_group_end_offsets.scalar_type() == torch::kInt32,
+              "input_group_end_offsets must be int32");
+  TORCH_CHECK(input_group_end_offsets.dim() == 1,
+              "input_group_end_offsets must be 1D");
+
+  c10::cuda::CUDAGuard device_guard(scales_tensor.device());
+
+  const int rows = scales_tensor.size(0);
+  const int cols = scales_tensor.size(1);
+  const int num_groups = input_group_end_offsets.size(0);
+  TORCH_CHECK(num_groups <= 32, "num_groups must be <= 32");
+
+  // Calculate blocks needed
+  const int BLOCK_ROWS = 128;
+  const int BLOCK_COLS = 4;
+  const int num_row_blocks = (rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
+  const int padded_rows = num_row_blocks * BLOCK_ROWS;
+
+  // Padding per group is variable/data dependent, so pad each group by upper bound
+  const int padded_cols = cols + num_groups * BLOCK_COLS;
+
+  // Create output tensor
+  auto output = torch::zeros({padded_rows, padded_cols},
+                             torch::TensorOptions()
+                                 .dtype(scales_tensor.scalar_type())
+                                 .device(scales_tensor.device()));
+
+  // Get raw pointers
+  const uint8_t* scales_ptr = scales_tensor.data_ptr<uint8_t>();
+  const int32_t* offsets_ptr = input_group_end_offsets.data_ptr<int32_t>();
+  uint8_t* output_ptr = output.data_ptr<uint8_t>();
+
+  // Launch kernel
+  launch_mx_block_rearrange_2d_K_groups(
+      scales_ptr,
+      scales_tensor.stride(0),
+      rows,
+      cols,
+      padded_rows,
+      offsets_ptr,
+      output_ptr,
+      num_groups,
+      at::cuda::getCurrentCUDAStream());
+
+  return output;
+}
+
 } // namespace mxfp8

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
@@ -192,4 +263,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         py::arg("input"), py::arg("scale_dim_n") = 32,
         py::arg("fp8_format") = "e4m3", py::arg("scaling_mode") = "floor");
+
+  m.def("mx_block_rearrange_2d_K_groups",
+        &mxfp8::mx_block_rearrange_2d_K_groups,
+        "Rearrange E8M0 scales to block-scaled swizzle format for cuBLAS Tmem",
+        py::arg("scales_tensor"),
+        py::arg("input_group_end_offsets"));
 }
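For reference, the padded group-start computation that compute_output_group_start_col performs on device is a prefix sum of group sizes rounded up to the 4-column block. A small Python sketch of the same arithmetic (names and sizes are hypothetical):

# Illustrative Python equivalent of compute_output_group_start_col.
def output_group_starts(group_end_offsets, pad=4):
    starts, total, prev = [], 0, 0
    for end in group_end_offsets:
        starts.append(total)
        size = end - prev
        total += -(-size // pad) * pad  # ceil_div(size, pad) * pad
        prev = end
    return starts

# e.g. ends [5, 13, 16] -> sizes [5, 8, 3] -> padded [8, 8, 4] -> starts [0, 8, 16]
assert output_group_starts([5, 13, 16]) == [0, 8, 16]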
diff --git a/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
new file mode 100644
index 0000000000..9b779c655c
--- /dev/null
+++ b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
@@ -0,0 +1,217 @@
+"""
+Standalone test for mx_block_rearrange_2d_K_groups CUDA kernel.
+Uses torch.utils.cpp_extension.load for quick compilation and iteration.
+
+Usage:
+    python test_mx_block_rearrange_standalone.py
+"""
+
+import os
+import sys
+
+import torch
+from torch.utils.cpp_extension import load
+
+# Get the directory where this script is located
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Load the CUDA extension
+print("Compiling CUDA kernel...")
+mx_block_rearrange = load(
+    name="mx_block_rearrange_2d_K_groups",
+    sources=[
+        os.path.join(SCRIPT_DIR, "mxfp8_extension.cpp"),
+        os.path.join(SCRIPT_DIR, "mxfp8_cuda.cu"),
+        os.path.join(SCRIPT_DIR, "mx_block_rearrange_2d_K_groups.cu"),
+    ],
+    extra_cuda_cflags=[
+        "-O3",
+        "--use_fast_math",
+        "-std=c++17",
+        "-gencode=arch=compute_90,code=sm_90",
+    ],
+    extra_cflags=["-O3", "-std=c++17"],
+    verbose=True,
+)
+
+print("✓ Compilation successful!")
+
+
+def benchmark_kernel(kernel_fn, *args, warmup=10, iterations=100):
+    """Benchmark a kernel function and return average time in microseconds."""
+    # Warmup
+    for _ in range(warmup):
+        kernel_fn(*args)
+    torch.cuda.synchronize()
+
+    # Benchmark
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+    for _ in range(iterations):
+        kernel_fn(*args)
+    end_event.record()
+
+    torch.cuda.synchronize()
+    elapsed_ms = start_event.elapsed_time(end_event)
+    return (elapsed_ms / iterations) * 1000  # Convert to microseconds
+
+
+def test_kernel():
+    print("\n" + "=" * 80)
+    print("Testing mx_block_rearrange_2d_K_groups kernel")
+    print("=" * 80)
+
+    # Try importing the Triton reference implementation
+    try:
+        ao_root = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "..", "..", ".."))
+        sys.path.insert(0, ao_root)
+
+        from torchao.prototype.moe_training.kernels.mxfp8 import (
+            triton_mx_block_rearrange_2d_K_groups,
+        )
+        from torchao.prototype.moe_training.kernels.mxfp8.quant import (
+            triton_mx_block_rearrange_2d_K_groups_naive,
+        )
+        from torchao.prototype.moe_training.utils import generate_jagged_offs
+        from torchao.prototype.mx_formats.mx_tensor import to_mx
+
+        has_triton = True
+        print("✓ Triton reference implementation available")
+    except ImportError as e:
+        print(f"⚠ Triton reference not available: {e}")
+        has_triton = False
+
+    # Test parameters - use larger size for meaningful benchmarks
+    device = "cuda"
+    m, total_k = 5120, 16384
+    n_groups = 8
+    block_size = 32
+
+    print("\nTest configuration:")
+    print(f"  Matrix size: {m} x {total_k}")
+    print(f"  Number of groups: {n_groups}")
+
+    # Generate test data
+    print("\nGenerating test data...")
+    torch.manual_seed(42)
+    input_data = torch.randn(m, total_k, device=device)
+
+    if has_triton:
+        e8m0_scales, _ = to_mx(
+            input_data, elem_dtype=torch.float8_e4m3fn, block_size=block_size
+        )
+
+        input_group_offsets = generate_jagged_offs(
+            n_groups, total_k, multiple_of=block_size, device=device
+        )
+        scale_group_offsets = input_group_offsets // block_size
+
+        print(f"  Scales shape: {e8m0_scales.shape}")
+    else:
+        return False
+
+    # Calculate memory bandwidth metrics
+    bytes_per_element = 1
+    input_bytes = e8m0_scales.numel() * bytes_per_element
+
+    # Test CUDA kernel
+    print("\n" + "-" * 80)
+    print("Running CUDA kernel...")
+    cuda_out_scales = mx_block_rearrange.mx_block_rearrange_2d_K_groups(
+        e8m0_scales.view(torch.uint8),
+        scale_group_offsets,
+    )
+    print("✓ CUDA kernel completed successfully")
+
+    output_bytes = cuda_out_scales.numel() * bytes_per_element
+    total_bytes = input_bytes + output_bytes
+
+    # Compare with Triton reference
+    print("\n" + "-" * 80)
+    print("Running Triton reference kernels...")
+    triton_naive_out = triton_mx_block_rearrange_2d_K_groups_naive(
+        e8m0_scales,
+        scale_group_offsets,
+    )
+    print("✓ Triton kernel completed successfully")
+
+    # Verify correctness
+    cuda_out_e8m0 = cuda_out_scales.view(torch.float8_e8m0fnu)
+    if not torch.equal(triton_naive_out, cuda_out_e8m0):
+        print("✗ CUDA and Triton naive outputs differ!")
+        return False
+    print("✓ All outputs are IDENTICAL!")
+
+    # Benchmark section
+    print("\n" + "=" * 80)
+    print("BENCHMARKING MEMORY BANDWIDTH")
+    print("=" * 80)
+
+    print("\nBenchmarking kernels (100 iterations each)...")
+
+    # Benchmark Triton naive
+    triton_naive_time_us = benchmark_kernel(
+        triton_mx_block_rearrange_2d_K_groups_naive,
+        e8m0_scales,
+        scale_group_offsets,
+    )
+    triton_naive_bw_gbps = (total_bytes / 1e9) / (triton_naive_time_us / 1e6)
+
+    # Benchmark Triton parallel
+    triton_parallel_time_us = benchmark_kernel(
+        triton_mx_block_rearrange_2d_K_groups,
+        e8m0_scales,
+        scale_group_offsets,
+    )
+    triton_parallel_bw_gbps = (total_bytes / 1e9) / (triton_parallel_time_us / 1e6)
+
+    # Benchmark CUDA
+    cuda_time_us = benchmark_kernel(
+        mx_block_rearrange.mx_block_rearrange_2d_K_groups,
+        e8m0_scales.view(torch.uint8),
+        scale_group_offsets,
+    )
+    cuda_bw_gbps = (total_bytes / 1e9) / (cuda_time_us / 1e6)
+
+    # Print results
+    print("\nResults:")
+    print(f"  Input size: {input_bytes / 1e6:.2f} MB")
+    print(f"  Output size: {output_bytes / 1e6:.2f} MB")
+    print(f"  Total I/O: {total_bytes / 1e6:.2f} MB\n")
+    print(f"{'Kernel':<25} {'Time (μs)':<15} {'Bandwidth (GB/s)':<20} {'Speedup':<10}")
+    print("-" * 70)
+    print(
+        f"{'Triton Naive':<25} {triton_naive_time_us:<15.2f} {triton_naive_bw_gbps:<20.2f} {'1.00x':<10}"
+    )
+    print(
+        f"{'Triton Parallel':<25} {triton_parallel_time_us:<15.2f} {triton_parallel_bw_gbps:<20.2f} {triton_naive_time_us / triton_parallel_time_us:<10.2f}x"
+    )
+    print(
+        f"{'CUDA (Optimized)':<25} {cuda_time_us:<15.2f} {cuda_bw_gbps:<20.2f} {triton_naive_time_us / cuda_time_us:<10.2f}x"
+    )
+    print()
+
+    # Highlight best performer
+    best_bw = max(triton_naive_bw_gbps, triton_parallel_bw_gbps, cuda_bw_gbps)
+    if cuda_bw_gbps == best_bw:
+        print("🏆 CUDA kernel achieves highest memory bandwidth!")
+    elif triton_parallel_bw_gbps == best_bw:
+        print("🏆 Triton parallel kernel achieves highest memory bandwidth!")
bandwidth!") + else: + print("🏆 Triton naive kernel achieves highest memory bandwidth!") + + return True + + +if __name__ == "__main__": + success = test_kernel() + + print("\n" + "=" * 80) + if success: + print("🎉 ALL TESTS PASSED!") + sys.exit(0) + else: + print("❌ TESTS FAILED") + sys.exit(1) diff --git a/torchao/prototype/moe_training/kernels/mxfp8/quant.py b/torchao/prototype/moe_training/kernels/mxfp8/quant.py index 24915d6359..f6a9a045d1 100644 --- a/torchao/prototype/moe_training/kernels/mxfp8/quant.py +++ b/torchao/prototype/moe_training/kernels/mxfp8/quant.py @@ -484,12 +484,12 @@ def triton_scale_swizzle_per_group_3d( ) -@triton_op("torchao::triton_mx_block_rearrange_2d_K_groups", mutates_args={}) -def triton_mx_block_rearrange_2d_K_groups( +def triton_mx_block_rearrange_2d_K_groups_naive( scales_tensor: torch.Tensor, input_group_end_offsets: torch.Tensor, ) -> torch.Tensor: """ + Naive version with while loop (before optimization). Rearranges an E8M0 tensor scale to block-scaled swizzle format on a per group basis, where the groups are along the contraction dimension of the GEMM. @@ -499,7 +499,6 @@ def triton_mx_block_rearrange_2d_K_groups( Args: scales_tensor: Input tensor containing e8m0 scales for each logical group of a target tensor. input_group_end_offsets: tensor of int32 values representing group end indexes for the input scales - output_group_start_offsets: tensor of int32 values representing pre-computed group start indexes after blocked format padding Returns: - Rearranged tensor in block-scaled swizzle format """ @@ -522,8 +521,7 @@ def triton_mx_block_rearrange_2d_K_groups( BLOCK_ROWS, BLOCK_COLS = 128, 4 output_stride_per_block = BLOCK_ROWS * BLOCK_COLS - # We parallelize per group and per row block. - # Cols per group is variable, so we just loop through col blocks for each group. + # Naive grid - only parallelize by group and row grid = lambda META: ( num_groups, num_row_blocks, @@ -547,6 +545,186 @@ def triton_mx_block_rearrange_2d_K_groups( return output +@triton_op("torchao::triton_mx_block_rearrange_2d_K_groups", mutates_args={}) +def triton_mx_block_rearrange_2d_K_groups( + scales_tensor: torch.Tensor, + input_group_end_offsets: torch.Tensor, +) -> torch.Tensor: + """ + Parallel version (parallelized over column blocks). + Rearranges an E8M0 tensor scale to block-scaled swizzle format on a per group basis, + where the groups are along the contraction dimension of the GEMM. + + This format is suitable for Tmem as described in NVIDIA documentation: + https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout + + Args: + scales_tensor: Input tensor containing e8m0 scales for each logical group of a target tensor. + input_group_end_offsets: tensor of int32 values representing group end indexes for the input scales + Returns: + - Rearranged tensor in block-scaled swizzle format + """ + assert scales_tensor.ndim == 2, "scales tensor must be 2d" + assert scales_tensor.element_size() == 1, ( + "Expected element size to be 1 byte (8 bits)" + ) + rows, cols = scales_tensor.shape + # Calculate blocks needed + num_groups = input_group_end_offsets.shape[0] + num_row_blocks = ceil_div(rows, 128) + padded_rows = num_row_blocks * 128 + + # Padding needing per group is variable/data dependent, so we just pad each group by + # the upper bound of 4 cols to avoid a d2h sync caused by iterating over each group. 
+    padded_cols = cols + num_groups * 4
+    output = scales_tensor.new_zeros((padded_rows, padded_cols))
+
+    # Output block stride for the rearranged format
+    BLOCK_ROWS, BLOCK_COLS = 128, 4
+    output_stride_per_block = BLOCK_ROWS * BLOCK_COLS
+
+    # Calculate column blocks for the ORIGINAL input tensor (before padding)
+    # by ceil-dividing the number of columns by BLOCK_COLS
+    total_col_blocks = (cols + BLOCK_COLS - 1) // BLOCK_COLS
+
+    # Compute per-group column block counts on GPU for the kernel to use
+    zero = torch.zeros(
+        1, dtype=input_group_end_offsets.dtype, device=scales_tensor.device
+    )
+    group_sizes = torch.diff(input_group_end_offsets, prepend=zero)
+    group_col_block_counts = (group_sizes + BLOCK_COLS - 1) // BLOCK_COLS
+
+    # We parallelize over all column blocks across all groups and row blocks
+    grid = lambda META: (
+        total_col_blocks,
+        num_row_blocks,
+    )
+    wrap_triton(triton_scale_swizzle_2d_K_groups_parallel)[grid](
+        scales_tensor.view(torch.uint8),
+        scales_tensor.stride(0),
+        scales_tensor.stride(1),
+        rows,
+        cols,
+        padded_rows,
+        input_group_end_offsets,
+        group_col_block_counts,
+        output.view(torch.uint8),
+        output_stride_per_block,
+        num_groups=num_groups,
+        BLOCK_ROWS=BLOCK_ROWS,
+        BLOCK_COLS=BLOCK_COLS,
+    )
+    return output
+
+
+@triton.jit
+def triton_scale_swizzle_2d_K_groups_parallel(
+    scales_ptr,  # (M, total_K//block_size)
+    scales_stride_dim0,
+    scales_stride_dim1,
+    scale_rows,
+    scale_cols,
+    padded_rows,
+    orig_offsets,  # (num_groups,)
+    group_col_block_counts,  # (num_groups,) - number of column blocks per group
+    output_scales_ptr,
+    output_stride_per_block,
+    num_groups: tl.constexpr,
+    BLOCK_ROWS: tl.constexpr,
+    BLOCK_COLS: tl.constexpr,
+):
+    """
+    Parallel version that parallelizes over column blocks.
+    Each thread block processes exactly one (row_block, col_block) pair.
+    Uses a vectorized cumulative-sum search to find which group a column block belongs to.
+ """ + col_block_pid = tl.program_id(0) + row_block_pid = tl.program_id(1) + + # Vectorized search to find which group this column block belongs to + # Load all group block counts at once + group_indices = tl.arange(0, num_groups) + all_block_counts = tl.load(group_col_block_counts + group_indices) + + # Compute cumulative sums to get start/end positions of each group + # cumsum_inclusive[i] = total blocks from group 0 to i (inclusive) + cumsum_inclusive = tl.cumsum(all_block_counts, axis=0) + + # cumsum_exclusive[i] = total blocks before group i (exclusive) + # For i > 0: cumsum_exclusive[i] = cumsum_inclusive[i] - all_block_counts[i] + # For i == 0: cumsum_exclusive[i] = 0 + cumsum_exclusive = tl.where( + group_indices > 0, cumsum_inclusive - all_block_counts, 0 + ) + + # Find which group col_block_pid belongs to + # A block belongs to group i if: cumsum_exclusive[i] <= block_id < cumsum_inclusive[i] + is_in_group = (col_block_pid >= cumsum_exclusive) & ( + col_block_pid < cumsum_inclusive + ) + + # Extract the group_pid (sum of indices where condition is true) + group_pid = tl.sum(tl.where(is_in_group, group_indices, 0)) + + # Extract the local column block offset within the group + local_col_block = tl.sum(tl.where(is_in_group, col_block_pid - cumsum_exclusive, 0)) + + # Load group offset boundaries + input_group_start_col = tl.load( + orig_offsets + group_pid - 1, mask=group_pid > 0, other=0 + ) + input_group_end_col = tl.load(orig_offsets + group_pid) + + # Compute input column offset for this specific column block + curr_input_start_col = input_group_start_col + local_col_block * BLOCK_COLS + + # Early exit if beyond group boundary + if curr_input_start_col >= input_group_end_col: + return + + # Calculate this group's start col after blocked format padding + output_group_start_col = _blocked_group_start_idx( + group_pid, orig_offsets, num_groups, 4 + ) + + row_offs = tl.arange(0, BLOCK_ROWS)[:, None] + col_offs = tl.arange(0, BLOCK_COLS)[None, :] + + # Read block of input scales + block_row_offs = row_block_pid * BLOCK_ROWS + row_offs + block_col_offs = curr_input_start_col + col_offs + block_offs = ( + block_row_offs * scales_stride_dim0 + block_col_offs * scales_stride_dim1 + ) + mask = (block_row_offs < scale_rows) & (block_col_offs < input_group_end_col) + input_scales = tl.load(scales_ptr + block_offs, mask=mask, other=0.0) + scales_flat = tl.reshape(input_scales, (BLOCK_ROWS * BLOCK_COLS)) + + # Compute output offset + out_group_base_offset = output_group_start_col * padded_rows + + num_cols_in_group = input_group_end_col - input_group_start_col + num_col_blocks_in_group = tl.cdiv(num_cols_in_group, BLOCK_COLS) + stride_per_row_of_blocks_in_group = ( + num_col_blocks_in_group * output_stride_per_block + ) + + offset_in_group = ( + row_block_pid * stride_per_row_of_blocks_in_group + + local_col_block * output_stride_per_block + ) + final_offset = out_group_base_offset + offset_in_group + + # Apply swizzling and write + dest_indices_flat = _dest_indices_for_block( + row_offs, col_offs, BLOCK_ROWS, BLOCK_COLS + ) + tl.store( + output_scales_ptr + final_offset + dest_indices_flat, + scales_flat, + ) + + @triton.jit def triton_scale_swizzle_2d_K_groups( scales_ptr, # (M, total_K//block_size) diff --git a/torchao/prototype/moe_training/scaled_grouped_mm.py b/torchao/prototype/moe_training/scaled_grouped_mm.py index 3a4ad43b4f..4cb6525c16 100644 --- a/torchao/prototype/moe_training/scaled_grouped_mm.py +++ b/torchao/prototype/moe_training/scaled_grouped_mm.py @@ -26,13 +26,13 @@ 
     _is_column_major,
 )
 from torchao.prototype.mx_formats.config import (
+    KernelPreference,
     MXFP8Dim1CastKernelChoice,
     ScaleCalculationMode,
 )
 from torchao.prototype.mx_formats.kernels import triton_to_mxfp8_dim0
 from torchao.prototype.mx_formats.mx_tensor import to_mx
 from torchao.prototype.mx_formats.utils import _to_mxfp8_dim1_kernel_wrapper
-from torchao.quantization.quantize_.common import KernelPreference

 logger: logging.Logger = logging.getLogger(__name__)

@@ -412,7 +412,7 @@ def backward(ctx, grad_out: torch.Tensor):
             block_size,
             elem_dtype=torch.float8_e4m3fn,
             hp_dtype=grad_out.dtype,
-            kernel_preference=KernelPreference.AUTO,  # Not used
+            kernel_preference=KernelPreference.AUTO,
             cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
             scale_calculation_mode=scale_calculation_mode,
         )
@@ -428,7 +428,7 @@ def backward(ctx, grad_out: torch.Tensor):
             block_size,
             elem_dtype=torch.float8_e4m3fn,
             hp_dtype=A.dtype,
-            kernel_preference=KernelPreference.AUTO,  # Not used
+            kernel_preference=KernelPreference.AUTO,
             cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
             scale_calculation_mode=scale_calculation_mode,
         )
@@ -475,7 +475,7 @@ def _to_mxfp8_dim1_3d(
         block_size,
         elem_dtype=torch.float8_e4m3fn,
         hp_dtype=B_reshaped.dtype,
-        kernel_preference=KernelPreference.AUTO,  # Not used
+        kernel_preference=KernelPreference.AUTO,
         cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
         scale_calculation_mode=scaling_mode,
     )
diff --git a/torchao/prototype/mx_formats/kernels.py b/torchao/prototype/mx_formats/kernels.py
index b4cd192244..72a19e2c86 100644
--- a/torchao/prototype/mx_formats/kernels.py
+++ b/torchao/prototype/mx_formats/kernels.py
@@ -626,9 +626,10 @@ def triton_mxfp8_dequant_dim0(
     scale_block_size: int = 32,
 ) -> torch.Tensor:
     assert scale_block_size == 32, "scale_block_size must be 32 for now"
-    assert out_dtype in (torch.bfloat16, torch.float32), (
-        "out_dtype must be bf16 or fp32"
-    )
+    assert out_dtype in (
+        torch.bfloat16,
+        torch.float32,
+    ), "out_dtype must be bf16 or fp32"

     # Input shape must be 2D.
     orig_shape = e4m3_data.shape
@@ -1055,6 +1056,7 @@ def _(scale_tensor):
         padded_cols = n_col_blocks * 4

         return scale_tensor.new_empty((padded_rows, padded_cols))
+
 else:

     def triton_to_mxfp8_dim0(
@@ -1216,6 +1218,7 @@ def custom_mxfp8_quantize_cuda_dim1_sharding(
         rule_for_input_sharded_dim1,
     ]
     return acceptable_shardings
+
 else:

     def mxfp8_quantize_cuda(

From daf9ffdf1c051583134b63beda9c58caeb0d5ff7 Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre
Date: Sat, 6 Dec 2025 23:01:12 -0800
Subject: [PATCH 2/4] stick with loop for now

---
 .../mx_block_rearrange_2d_K_groups.cu | 43 +++-----------
 1 file changed, 6 insertions(+), 37 deletions(-)

diff --git a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
index c97baec10d..38eb4159c5 100644
--- a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
+++ b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
@@ -2,15 +2,12 @@
 #include <cuda_runtime.h>
 #include <cstdint>
 #include <cstdio>
-
 #define BLOCK_ROWS 128
 #define BLOCK_COLS 4
-
 // Helper function to compute ceil division
 __device__ __forceinline__ int ceil_div(int a, int b) {
   return (a + b - 1) / b;
 }
-
 // Helper function to compute the start index of a group after padding
 __device__ __forceinline__ int compute_output_group_start_col(
     int group_id,
@@ -19,7 +16,6 @@ __device__ __forceinline__ int compute_output_group_start_col(
     int padding_size
 ) {
   int start_idx = 0;
-
   // Compute prefix sum of padded group sizes
   for (int i = 0; i < group_id; i++) {
     int prev_offset = (i > 0) ? input_group_end_offsets[i - 1] : 0;
@@ -28,10 +24,8 @@ __device__ __forceinline__ int compute_output_group_start_col(
     int padded_size = ceil_div(group_size, padding_size) * padding_size;
     start_idx += padded_size;
   }
-
   return start_idx;
 }
-
 // Compute destination index for swizzled block layout
 // For a 128x4 block: r_div_32 = row / 32, r_mod_32 = row % 32
 // Swizzle: dest = r_mod_32 * 16 + r_div_32 * 4 + col
@@ -40,7 +34,6 @@ __device__ __forceinline__ int compute_swizzled_index(int row, int col) {
   int r_mod_32 = row % 32;
   return r_mod_32 * 16 + r_div_32 * 4 + col;
 }
-
 __global__ void mx_block_rearrange_2d_K_groups_naive_kernel(
     const uint8_t* __restrict__ scales_ptr,
     int scales_stride_dim0,
@@ -55,83 +48,65 @@ __global__ void mx_block_rearrange_2d_K_groups_naive_kernel(
   const int group_id = blockIdx.x;
   const int block_row_id = blockIdx.y;
   const int tid = threadIdx.x;  // 128 threads, each handles one row
-
   // Shared memory for one 128x4 block
   __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS];
-
   // Get start/end cols of this input group
   int input_group_start_col = (group_id > 0) ? input_group_end_offsets[group_id - 1] : 0;
   int input_group_end_col = input_group_end_offsets[group_id];
   int num_cols_in_group = input_group_end_col - input_group_start_col;
-
   // Get output group start column
   int output_group_start_col = compute_output_group_start_col(
       group_id,
       input_group_end_offsets,
       num_groups,
       4);  // scaling factor column padding size
-
   // Compute base offset for this group in output
   int out_group_base_offset = output_group_start_col * padded_rows;
-
   // Compute stride per row of blocks in this group
   int num_col_blocks_in_group = ceil_div(num_cols_in_group, BLOCK_COLS);
   int stride_per_row_of_blocks_in_group = num_col_blocks_in_group * output_stride_per_block;
-
   // Each thread handles one row
   int input_row = block_row_id * BLOCK_ROWS + tid;
-
   // Loop through column blocks in this group
   int curr_input_start_col = input_group_start_col;
   int curr_out_col_block = 0;
-
   while (curr_input_start_col < input_group_end_col) {
     // Calculate how many columns to load for this block
     int cols_remaining = input_group_end_col - curr_input_start_col;
     int cols_to_load = min(BLOCK_COLS, cols_remaining);
-
     // Load data for this row using vectorized loads when possible
     uint32_t row_data = 0;
-
     if (input_row < scale_rows && curr_input_start_col < input_group_end_col) {
       int input_offset = input_row * scales_stride_dim0 + curr_input_start_col;
       const uint8_t* input_ptr = scales_ptr + input_offset;
-
       // Check alignment and available columns within this group
       uintptr_t ptr_addr = reinterpret_cast<uintptr_t>(input_ptr);
-
       if (cols_to_load >= 4 && ptr_addr % 4 == 0 && curr_input_start_col + 4 <= input_group_end_col) {
         // 4-byte aligned and have 4 columns within group: use uint32_t load
-        row_data = *reinterpret_cast<const uint32_t*>(input_ptr);
+        row_data = __ldg(reinterpret_cast<const uint32_t*>(input_ptr));
       } else {
         // Byte-by-byte loads for unaligned or partial blocks
         uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
         for (int i = 0; i < cols_to_load && (curr_input_start_col + i) < input_group_end_col; i++) {
-          row_bytes[i] = input_ptr[i];
+          row_bytes[i] = __ldg(input_ptr + i);
         }
       }
     }
-
     // Write to swizzled positions in shared memory
     uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
     #pragma unroll
     for (int col = 0; col < BLOCK_COLS; col++) {
       int swizzled_idx = compute_swizzled_index(tid, col);
       smem_block[swizzled_idx] = row_bytes[col];
     }
-
     __syncthreads();
-
     // Write from shared memory to global memory
     // Calculate the output offset for this specific block
     int offset_in_group = block_row_id * stride_per_row_of_blocks_in_group +
                           curr_out_col_block * output_stride_per_block;
     int final_offset = out_group_base_offset + offset_in_group;
-
     // Each thread writes 4 bytes (one row of the 128x4 block)
     uint8_t* output_ptr = output_scales_ptr + final_offset + tid * BLOCK_COLS;
-
     // Check output alignment for vectorized write
     uintptr_t out_ptr_addr = reinterpret_cast<uintptr_t>(output_ptr);
     if (out_ptr_addr % 4 == 0) {
@@ -146,18 +121,17 @@ __global__ void mx_block_rearrange_2d_K_groups_naive_kernel(
       output_ptr[i] = smem_ptr[i];
     }
   }
-
-  __syncthreads();
-
   // Advance to next column block
   curr_input_start_col += BLOCK_COLS;
   curr_out_col_block += 1;
+  // Only sync if there's another iteration
+  if (curr_input_start_col < input_group_end_col) {
+    __syncthreads();
+  }
 }
}
-
 // Host function to launch the kernel
 namespace mxfp8 {
-
 void launch_mx_block_rearrange_2d_K_groups(
     const uint8_t* scales_ptr,
     int scales_stride_dim0,
@@ -170,14 +144,11 @@ void launch_mx_block_rearrange_2d_K_groups(
     cudaStream_t stream
 ) {
   int num_row_blocks = (scale_rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
-
   // Grid parallelizes over (num_groups, num_row_blocks)
   // Each thread block loops through column blocks within its group
   dim3 grid(num_groups, num_row_blocks);
   dim3 block(128);  // 128 threads, each handling one row
-
   int output_stride_per_block = BLOCK_ROWS * BLOCK_COLS;
-
   mx_block_rearrange_2d_K_groups_naive_kernel<<<grid, block, 0, stream>>>(
       scales_ptr,
       scales_stride_dim0,
       scale_rows,
       scale_cols,
       padded_rows,
       input_group_end_offsets,
       output_scales_ptr,
       output_stride_per_block,
       num_groups
   );
-
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     printf("CUDA Error: %s\n", cudaGetErrorString(err));
   }
 }
-
 } // namespace mxfp8
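Before the next patch, it helps to see why a parallel grid over column blocks must be over-allocated: summing ceil(group_size / 4) over groups can exceed ceil(total_cols / 4) by up to one block per group, since each group may end with a partial block. A small Python check of that bound (hypothetical sizes, not part of the patch):

# Illustrative check of the per-group column block count versus the upper bound.
def blocks_needed(group_ends, block_cols=4):
    prev, total = 0, 0
    for end in group_ends:
        total += -(-(end - prev) // block_cols)  # ceil_div(group_size, 4)
        prev = end
    return total

group_ends = [5, 13, 16]  # sizes 5, 8, 3 -> ceil: 2 + 2 + 1 = 5 blocks
assert blocks_needed(group_ends) == 5                      # exceeds ceil(16/4) = 4
assert blocks_needed(group_ends) <= -(-16 // 4) + len(group_ends)  # bound: 4 + 3 = 7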
From a5d83e317d5f26e44bc12da64840b2b300eff8bf Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre
Date: Sun, 7 Dec 2025 11:38:28 -0800
Subject: [PATCH 3/4] cuda parallel idea working now

---
 ...h_triton_mx_block_rearrange_2d_K_groups.py |  29 ++-
 .../mx_block_rearrange_2d_K_groups.cu         | 237 +++++++++++++++---
 .../csrc/cuda/mx_kernels/mxfp8_extension.cpp  |  81 +++++-
 .../test_mx_block_rearrange_standalone.py    |  83 +++---
 4 files changed, 352 insertions(+), 78 deletions(-)

diff --git a/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
index 35245f13bf..f473166e4b 100644
--- a/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
+++ b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
@@ -52,7 +52,7 @@
             "-O3",
             "--use_fast_math",
             "-std=c++17",
-            "-gencode=arch=compute_90,code=sm_90",
+            "-gencode=arch=compute_100,code=sm_100",
         ],
         extra_cflags=["-O3", "-std=c++17"],
         verbose=True,
@@ -101,7 +101,7 @@ def get_configs() -> List[ExperimentConfig]:
         (2048, 131072 // block_size),
     ]
     num_groups = [8]
-    versions = ["naive", "parallel", "cuda"]
+    versions = ["triton_naive", "triton_parallel", "cuda_parallel", "cuda_naive"]

     configs = []
     for shape, groups, version in itertools.product(
@@ -138,12 +138,18 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult:
     input_group_offsets = generate_jagged_offs(num_groups, Kg, multiple_of=block_size)

     # Select which kernel to benchmark based on version
-    if version == "naive":
+    if version == "triton_naive":
         kernel_fn = triton_mx_block_rearrange_2d_K_groups_naive
-    elif version == "parallel":
+    elif version == "triton_parallel":
         kernel_fn = triton_mx_block_rearrange_2d_K_groups
-    elif version == "cuda":
+    elif version == "cuda_parallel":
+        if mxfp8_cuda is None:
+            raise RuntimeError("CUDA kernel not available")
         kernel_fn = mxfp8_cuda.mx_block_rearrange_2d_K_groups
+    elif version == "cuda_naive":
+        if mxfp8_cuda is None:
+            raise RuntimeError("CUDA kernel not available")
+        kernel_fn = mxfp8_cuda.mx_block_rearrange_2d_K_groups_naive
     else:
         raise ValueError(f"Unknown version: {version}")

@@ -191,6 +197,7 @@ def print_results(experiments: List[Experiment]):
         "time_us",
         "mem_bw_gbps",
         "fastest_version",
+        "speedup_vs_triton_naive",
     ]

     rows = []
@@ -198,8 +205,19 @@ def print_results(experiments: List[Experiment]):
         # Find fastest version for this shape
         fastest_version = min(versions.items(), key=lambda x: x[1].time_us)[0]

+        # Get naive baseline time for speedup calculation
+        naive_time_us = (
+            versions.get("triton_naive").time_us if "triton_naive" in versions else None
+        )
+
         # Add rows for each version
         for version, result in versions.items():
+            # Calculate speedup vs naive
+            speedup_str = ""
+            if naive_time_us and naive_time_us > 0:
+                speedup = naive_time_us / result.time_us
+                speedup_str = f"{speedup:.2f}x"
+
             rows.append(
                 [
                     version,
                     f"{result.time_us:.2f}",
                     round(result.mem_bw_gbps, 3),
                     fastest_version,
+                    speedup_str,
                 ]
             )
diff --git a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
index 38eb4159c5..a904e472ed 100644
--- a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
+++ b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
@@ -2,13 +2,48 @@
 #include <cuda_runtime.h>
 #include <cstdint>
 #include <cstdio>
+
 #define BLOCK_ROWS 128
 #define BLOCK_COLS 4
-// Helper function to compute ceil division
+
 __device__ __forceinline__ int ceil_div(int a, int b) {
   return (a + b - 1) / b;
 }
-// Helper function to compute the start index of a group after padding
+
+__device__ void find_group_and_local_offset(
+    int col_block_pid,
+    const int32_t* __restrict__ input_group_end_offsets,
+    int num_groups,
+    int* __restrict__ smem_cumsum,
+    int& group_id,
+    int& local_col_block
+) {
+  if (threadIdx.x == 0) {
+    int cumsum = 0;
+    for (int g = 0; g < num_groups; g++) {
+      int input_group_start = (g > 0) ? input_group_end_offsets[g - 1] : 0;
+      int input_group_end = input_group_end_offsets[g];
+      int group_size = input_group_end - input_group_start;
+      int num_col_blocks = ceil_div(group_size, BLOCK_COLS);
+      cumsum += num_col_blocks;
+      smem_cumsum[g] = cumsum;
+    }
+  }
+  __syncthreads();
+
+  group_id = 0;
+  int cumsum_before = 0;
+  for (int g = 0; g < num_groups; g++) {
+    int cumsum_at_g = smem_cumsum[g];
+    if (col_block_pid < cumsum_at_g) {
+      group_id = g;
+      local_col_block = col_block_pid - cumsum_before;
+      break;
+    }
+    cumsum_before = cumsum_at_g;
+  }
+}
+
 __device__ __forceinline__ int compute_output_group_start_col(
     int group_id,
     const int32_t* input_group_end_offsets,
@@ -16,7 +51,6 @@ __device__ __forceinline__ int compute_output_group_start_col(
     int padding_size
 ) {
   int start_idx = 0;
-  // Compute prefix sum of padded group sizes
   for (int i = 0; i < group_id; i++) {
     int prev_offset = (i > 0) ? input_group_end_offsets[i - 1] : 0;
     int curr_offset = input_group_end_offsets[i];
@@ -26,9 +60,7 @@ __device__ __forceinline__ int compute_output_group_start_col(
     int padded_size = ceil_div(group_size, padding_size) * padding_size;
     start_idx += padded_size;
   }
   return start_idx;
 }
-// Compute destination index for swizzled block layout
-// For a 128x4 block: r_div_32 = row / 32, r_mod_32 = row % 32
-// Swizzle: dest = r_mod_32 * 16 + r_div_32 * 4 + col
+
 __device__ __forceinline__ int compute_swizzled_index(int row, int col) {
   int r_div_32 = row / 32;
   int r_mod_32 = row % 32;
@@ -47,92 +79,176 @@ __global__ void mx_block_rearrange_2d_K_groups_naive_kernel(
 ) {
   const int group_id = blockIdx.x;
   const int block_row_id = blockIdx.y;
-  const int tid = threadIdx.x;  // 128 threads, each handles one row
-  // Shared memory for one 128x4 block
+  const int tid = threadIdx.x;
+
   __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS];
-  // Get start/end cols of this input group
+
   int input_group_start_col = (group_id > 0) ? input_group_end_offsets[group_id - 1] : 0;
   int input_group_end_col = input_group_end_offsets[group_id];
   int num_cols_in_group = input_group_end_col - input_group_start_col;
-  // Get output group start column
+
   int output_group_start_col = compute_output_group_start_col(
-      group_id,
-      input_group_end_offsets,
-      num_groups,
-      4);  // scaling factor column padding size
-  // Compute base offset for this group in output
+      group_id, input_group_end_offsets, num_groups, 4);
+
   int out_group_base_offset = output_group_start_col * padded_rows;
-  // Compute stride per row of blocks in this group
+
   int num_col_blocks_in_group = ceil_div(num_cols_in_group, BLOCK_COLS);
   int stride_per_row_of_blocks_in_group = num_col_blocks_in_group * output_stride_per_block;
-  // Each thread handles one row
+
   int input_row = block_row_id * BLOCK_ROWS + tid;
-  // Loop through column blocks in this group
+
   int curr_input_start_col = input_group_start_col;
   int curr_out_col_block = 0;
+
   while (curr_input_start_col < input_group_end_col) {
-    // Calculate how many columns to load for this block
     int cols_remaining = input_group_end_col - curr_input_start_col;
     int cols_to_load = min(BLOCK_COLS, cols_remaining);
-    // Load data for this row using vectorized loads when possible
+
     uint32_t row_data = 0;
     if (input_row < scale_rows && curr_input_start_col < input_group_end_col) {
       int input_offset = input_row * scales_stride_dim0 + curr_input_start_col;
       const uint8_t* input_ptr = scales_ptr + input_offset;
-      // Check alignment and available columns within this group
+
       uintptr_t ptr_addr = reinterpret_cast<uintptr_t>(input_ptr);
       if (cols_to_load >= 4 && ptr_addr % 4 == 0 && curr_input_start_col + 4 <= input_group_end_col) {
         row_data = __ldg(reinterpret_cast<const uint32_t*>(input_ptr));
       } else {
         uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
         for (int i = 0; i < cols_to_load && (curr_input_start_col + i) < input_group_end_col; i++) {
           row_bytes[i] = __ldg(input_ptr + i);
         }
       }
     }
-    // Write to swizzled positions in shared memory
+
     uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
     #pragma unroll
     for (int col = 0; col < BLOCK_COLS; col++) {
       int swizzled_idx = compute_swizzled_index(tid, col);
       smem_block[swizzled_idx] = row_bytes[col];
     }
+
     __syncthreads();
-    // Write from shared memory to global memory
-    // Calculate the output offset for this specific block
+
     int offset_in_group = block_row_id * stride_per_row_of_blocks_in_group +
                           curr_out_col_block * output_stride_per_block;
     int final_offset = out_group_base_offset + offset_in_group;
-    // Each thread writes 4 bytes (one row of the 128x4 block)
+
     uint8_t* output_ptr = output_scales_ptr + final_offset + tid * BLOCK_COLS;
-    // Check output alignment for vectorized write
     uintptr_t out_ptr_addr = reinterpret_cast<uintptr_t>(output_ptr);
-    if (out_ptr_addr % 4 == 0) {
-      // Aligned: use uint32_t store
+
+    if (out_ptr_addr % 4 == 0 && cols_to_load >= 4) {
       *reinterpret_cast<uint32_t*>(output_ptr) =
           *reinterpret_cast<const uint32_t*>(&smem_block[tid * BLOCK_COLS]);
     } else {
-      // Unaligned: byte by byte
       const uint8_t* smem_ptr = &smem_block[tid * BLOCK_COLS];
       #pragma unroll
-      for (int i = 0; i < BLOCK_COLS; i++) {
+      for (int i = 0; i < cols_to_load; i++) {
         output_ptr[i] = smem_ptr[i];
       }
     }
-    // Advance to next column block
+
     curr_input_start_col += BLOCK_COLS;
     curr_out_col_block += 1;
-    // Only sync if there's another iteration
+
     if (curr_input_start_col < input_group_end_col) {
       __syncthreads();
     }
   }
 }
-// Host function to launch the kernel
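+// Worked example of the mapping performed by find_group_and_local_offset above
+// (hypothetical sizes): with group end offsets {5, 13} and BLOCK_COLS = 4, the
+// group sizes are {5, 8}, the per-group block counts ceil_div to {2, 2}, and
+// the cumulative counts are {2, 4}; flat col_block_pid 0-1 then maps to group 0
+// (local blocks 0-1) and pid 2-3 maps to group 1 (local blocks 0-1).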
+__global__ void mx_block_rearrange_2d_K_groups_parallel_kernel(
+    const uint8_t* __restrict__ scales_ptr,
+    int scales_stride_dim0,
+    int scale_rows,
+    int scale_cols,
+    int padded_rows,
+    const int32_t* __restrict__ input_group_end_offsets,
+    uint8_t* __restrict__ output_scales_ptr,
+    int output_stride_per_block,
+    int num_groups
+) {
+  const int col_block_pid = blockIdx.x;
+  const int row_block_pid = blockIdx.y;
+  const int tid = threadIdx.x;
+
+  __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS];
+  __shared__ int smem_cumsum[32];
+
+  int group_id, local_col_block;
+  find_group_and_local_offset(
+      col_block_pid,
+      input_group_end_offsets,
+      num_groups,
+      smem_cumsum,
+      group_id,
+      local_col_block
+  );
+
+  int input_group_start_col = (group_id > 0) ? input_group_end_offsets[group_id - 1] : 0;
+  int input_group_end_col = input_group_end_offsets[group_id];
+  int curr_input_start_col = input_group_start_col + local_col_block * BLOCK_COLS;
+
+  if (curr_input_start_col >= input_group_end_col) {
+    return;
+  }
+
+  int input_row = row_block_pid * BLOCK_ROWS + tid;
+  int cols_remaining = input_group_end_col - curr_input_start_col;
+  int cols_to_load = min(BLOCK_COLS, cols_remaining);
+
+  uint32_t row_data = 0;
+  if (input_row < scale_rows && curr_input_start_col < input_group_end_col) {
+    int input_offset = input_row * scales_stride_dim0 + curr_input_start_col;
+    const uint8_t* input_ptr = scales_ptr + input_offset;
+
+    uintptr_t ptr_addr = reinterpret_cast<uintptr_t>(input_ptr);
+    if (cols_to_load >= 4 && ptr_addr % 4 == 0 && curr_input_start_col + 4 <= input_group_end_col) {
+      row_data = __ldg(reinterpret_cast<const uint32_t*>(input_ptr));
+    } else {
+      uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
+      for (int i = 0; i < cols_to_load && (curr_input_start_col + i) < input_group_end_col; i++) {
+        row_bytes[i] = __ldg(input_ptr + i);
+      }
+    }
+  }
+
+  uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
+  #pragma unroll
+  for (int col = 0; col < BLOCK_COLS; col++) {
+    int swizzled_idx = compute_swizzled_index(tid, col);
+    smem_block[swizzled_idx] = row_bytes[col];
+  }
+
+  __syncthreads();
+
+  int output_group_start_col = compute_output_group_start_col(
+      group_id, input_group_end_offsets, num_groups, 4
+  );
+  int out_group_base_offset = output_group_start_col * padded_rows;
+
+  int num_cols_in_group = input_group_end_col - input_group_start_col;
+  int num_col_blocks_in_group = ceil_div(num_cols_in_group, BLOCK_COLS);
+  int stride_per_row_of_blocks_in_group = num_col_blocks_in_group * output_stride_per_block;
+
+  int offset_in_group = row_block_pid * stride_per_row_of_blocks_in_group +
+                        local_col_block * output_stride_per_block;
+  int final_offset = out_group_base_offset + offset_in_group;
+
+  uint8_t* output_ptr = output_scales_ptr + final_offset + tid * BLOCK_COLS;
+  uintptr_t out_ptr_addr = reinterpret_cast<uintptr_t>(output_ptr);
+
+  if (out_ptr_addr % 4 == 0 && cols_to_load >= 4) {
+    *reinterpret_cast<uint32_t*>(output_ptr) =
+        *reinterpret_cast<const uint32_t*>(&smem_block[tid * BLOCK_COLS]);
+  } else {
+    const uint8_t* smem_ptr = &smem_block[tid * BLOCK_COLS];
+    #pragma unroll
+    for (int i = 0; i < cols_to_load; i++) {
+      output_ptr[i] = smem_ptr[i];
+    }
+  }
+}
 namespace mxfp8 {

-void launch_mx_block_rearrange_2d_K_groups(
+void launch_mx_block_rearrange_2d_K_groups_naive(
     const uint8_t* scales_ptr,
     int scales_stride_dim0,
     int scale_rows,
@@ -144,11 +260,11 @@ void launch_mx_block_rearrange_2d_K_groups(
     cudaStream_t stream
 ) {
   int num_row_blocks = (scale_rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
-  // Grid parallelizes over (num_groups, num_row_blocks)
-  // Each thread block loops through column blocks within its group
-  dim3 grid(num_groups, num_row_blocks);
-  dim3 block(128);  // 128 threads, each handling one row
   int output_stride_per_block = BLOCK_ROWS * BLOCK_COLS;
+
+  dim3 grid(num_groups, num_row_blocks);
+  dim3 block(128);
+
   mx_block_rearrange_2d_K_groups_naive_kernel<<<grid, block, 0, stream>>>(
       scales_ptr,
       scales_stride_dim0,
       scale_rows,
       scale_cols,
       padded_rows,
       input_group_end_offsets,
       output_scales_ptr,
       output_stride_per_block,
       num_groups
   );
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("CUDA Error (naive): %s\n", cudaGetErrorString(err));
+  }
+}
+
+void launch_mx_block_rearrange_2d_K_groups(
+    const uint8_t* scales_ptr,
+    int scales_stride_dim0,
+    int scale_rows,
+    int scale_cols,
+    int padded_rows,
+    const int32_t* input_group_end_offsets,
+    uint8_t* output_scales_ptr,
+    int num_groups,
+    cudaStream_t stream
+) {
+  int num_row_blocks = (scale_rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
+  int output_stride_per_block = BLOCK_ROWS * BLOCK_COLS;
+
+  // Over-allocate column blocks, to avoid the d2h sync that computing the exact
+  // block count on device and copying it to the host for the grid would require.
+  // Maximum is: total_col_blocks + num_groups (one extra per group for padding)
+  int total_col_blocks = (scale_cols + BLOCK_COLS - 1) / BLOCK_COLS + num_groups;
+
+  dim3 grid(total_col_blocks, num_row_blocks);
+  dim3 block(128);
+
+  mx_block_rearrange_2d_K_groups_parallel_kernel<<<grid, block, 0, stream>>>(
+      scales_ptr,
+      scales_stride_dim0,
+      scale_rows,
+      scale_cols,
+      padded_rows,
+      input_group_end_offsets,
+      output_scales_ptr,
+      output_stride_per_block,
+      num_groups
+  );
+
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     printf("CUDA Error: %s\n", cudaGetErrorString(err));
diff --git a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
index 55db12c40e..2690ba1449 100644
--- a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
+++ b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
@@ -36,6 +36,17 @@ void launch_mx_block_rearrange_2d_K_groups(
     int num_groups,
     cudaStream_t stream);

+void launch_mx_block_rearrange_2d_K_groups_naive(
+    const uint8_t* scales_ptr,
+    int scales_stride_dim0,
+    int scale_rows,
+    int scale_cols,
+    int padded_rows,
+    const int32_t* input_group_end_offsets,
+    uint8_t* output_scales_ptr,
+    int num_groups,
+    cudaStream_t stream);
+
 // Helper for tensor validation
 void check_cuda_tensor(const torch::Tensor &t, const char *name) {
   TORCH_CHECK(t.is_cuda(), name, " must be a CUDA tensor");
@@ -233,7 +244,7 @@ torch::Tensor mx_block_rearrange_2d_K_groups(
   const int32_t* offsets_ptr = input_group_end_offsets.data_ptr<int32_t>();
   uint8_t* output_ptr = output.data_ptr<uint8_t>();

-  // Launch kernel
+  // Launch parallel kernel (optimized)
   launch_mx_block_rearrange_2d_K_groups(
       scales_ptr,
       scales_tensor.stride(0),
@@ -248,6 +259,66 @@ torch::Tensor mx_block_rearrange_2d_K_groups(

   return output;
 }

+// Python wrapper for mx_block_rearrange_2d_K_groups_naive
+torch::Tensor mx_block_rearrange_2d_K_groups_naive(
+    torch::Tensor scales_tensor,
+    torch::Tensor input_group_end_offsets) {
+
+  // Validate inputs
+  check_cuda_tensor(scales_tensor, "scales_tensor");
+  check_cuda_tensor(input_group_end_offsets, "input_group_end_offsets");
+
+  TORCH_CHECK(scales_tensor.dim() == 2, "scales_tensor must be 2D");
+  TORCH_CHECK(scales_tensor.scalar_type() == torch::kUInt8 ||
+              scales_tensor.scalar_type() == torch::kFloat8_e8m0fnu,
+              "scales_tensor must be uint8 or e8m0");
+  TORCH_CHECK(input_group_end_offsets.scalar_type() == torch::kInt32,
+              "input_group_end_offsets must be int32");
+  TORCH_CHECK(input_group_end_offsets.dim() == 1,
+              "input_group_end_offsets must be 1D");
+
+  c10::cuda::CUDAGuard device_guard(scales_tensor.device());
+
+  const int rows = scales_tensor.size(0);
+  const int cols = scales_tensor.size(1);
+  const int num_groups = input_group_end_offsets.size(0);
+  TORCH_CHECK(num_groups <= 32, "num_groups must be <= 32");
+
+  // Calculate blocks needed
+  const int BLOCK_ROWS = 128;
+  const int BLOCK_COLS = 4;
+  const int num_row_blocks = (rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
+  const int padded_rows = num_row_blocks * BLOCK_ROWS;
+
+  // Padding per group is variable/data dependent, so pad each group by upper bound
+  const int padded_cols = cols + num_groups * BLOCK_COLS;
+
+  // Create output tensor
+  auto output = torch::zeros({padded_rows, padded_cols},
+                             torch::TensorOptions()
+                                 .dtype(scales_tensor.scalar_type())
+                                 .device(scales_tensor.device()));
+
+  // Get raw pointers
+  const uint8_t* scales_ptr = scales_tensor.data_ptr<uint8_t>();
+  const int32_t* offsets_ptr = input_group_end_offsets.data_ptr<int32_t>();
+  uint8_t* output_ptr = output.data_ptr<uint8_t>();
+
+  // Launch naive kernel (original with while loop)
+  launch_mx_block_rearrange_2d_K_groups_naive(
+      scales_ptr,
+      scales_tensor.stride(0),
+      rows,
+      cols,
+      padded_rows,
+      offsets_ptr,
+      output_ptr,
+      num_groups,
+      at::cuda::getCurrentCUDAStream());
+
+  return output;
+}
+
 } // namespace mxfp8

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
@@ -266,7 +337,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

   m.def("mx_block_rearrange_2d_K_groups",
         &mxfp8::mx_block_rearrange_2d_K_groups,
-        "Rearrange E8M0 scales to block-scaled swizzle format for cuBLAS Tmem",
+        "Rearrange E8M0 scales to block-scaled swizzle format (parallelized)",
+        py::arg("scales_tensor"),
+        py::arg("input_group_end_offsets"));
+
+  m.def("mx_block_rearrange_2d_K_groups_naive",
+        &mxfp8::mx_block_rearrange_2d_K_groups_naive,
+        "Rearrange E8M0 scales to block-scaled swizzle format (naive version)",
         py::arg("scales_tensor"),
         py::arg("input_group_end_offsets"));
 }
diff --git a/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
index 9b779c655c..764d9953da 100644
--- a/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
+++ b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
@@ -28,7 +28,7 @@
         "-O3",
         "--use_fast_math",
         "-std=c++17",
-        "-gencode=arch=compute_90,code=sm_90",
+        "-gencode=arch=compute_100,code=sm_100",
     ],
     extra_cflags=["-O3", "-std=c++17"],
     verbose=True,
@@ -68,9 +68,6 @@ def test_kernel():
         ao_root = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "..", "..", ".."))
         sys.path.insert(0, ao_root)

-        from torchao.prototype.moe_training.kernels.mxfp8 import (
-            triton_mx_block_rearrange_2d_K_groups,
-        )
         from torchao.prototype.moe_training.kernels.mxfp8.quant import (
             triton_mx_block_rearrange_2d_K_groups_naive,
         )
@@ -118,31 +115,55 @@ def test_kernel():

     # Test CUDA kernel
     print("\n" + "-" * 80)
-    print("Running CUDA kernel...")
-    cuda_out_scales = mx_block_rearrange.mx_block_rearrange_2d_K_groups(
+    print("Running CUDA parallel kernel (optimized)...")
+    cuda_parallel_out_scales = mx_block_rearrange.mx_block_rearrange_2d_K_groups(
+        e8m0_scales.view(torch.uint8),
+        scale_group_offsets,
+    )
+    print("✓ CUDA parallel kernel completed successfully")
+
+    # Test CUDA naive kernel
+    print("\n" + "-" * 80)
+    print("Running CUDA naive kernel...")
+ cuda_naive_out_scales = mx_block_rearrange.mx_block_rearrange_2d_K_groups_naive( e8m0_scales.view(torch.uint8), scale_group_offsets, ) - print("✓ CUDA kernel completed successfully") + print("✓ CUDA naive kernel completed successfully") - output_bytes = cuda_out_scales.numel() * bytes_per_element + output_bytes = cuda_parallel_out_scales.numel() * bytes_per_element total_bytes = input_bytes + output_bytes - # Compare with Triton reference + # Compare with Triton naive reference print("\n" + "-" * 80) - print("Running Triton reference kernels...") + print("Running Triton naive reference kernel...") triton_naive_out = triton_mx_block_rearrange_2d_K_groups_naive( e8m0_scales, scale_group_offsets, ) - print("✓ Triton kernel completed successfully") + print("✓ Triton naive kernel completed successfully") # Verify correctness - cuda_out_e8m0 = cuda_out_scales.view(torch.float8_e8m0fnu) - if not torch.equal(triton_naive_out, cuda_out_e8m0): - print("✗ CUDA and Triton naive outputs differ!") + cuda_parallel_out_e8m0 = cuda_parallel_out_scales.view(torch.float8_e8m0fnu) + cuda_naive_out_e8m0 = cuda_naive_out_scales.view(torch.float8_e8m0fnu) + + print("\nVerifying correctness...") + if not torch.equal(triton_naive_out, cuda_naive_out_e8m0): + print("✗ CUDA naive and Triton naive outputs differ!") + return False + print("✓ CUDA naive matches Triton naive") + + if not torch.equal(triton_naive_out, cuda_parallel_out_e8m0): + print("✗ CUDA parallel and Triton naive outputs differ!") + return False + print("✓ CUDA parallel matches Triton naive") + + if not torch.equal(cuda_naive_out_e8m0, cuda_parallel_out_e8m0): + print("✗ CUDA naive and CUDA parallel outputs differ!") return False - print("✓ All outputs are IDENTICAL!") + print("✓ CUDA naive and CUDA parallel match each other") + + print("\n✓ All outputs are IDENTICAL!") # Benchmark section print("\n" + "=" * 80) @@ -159,21 +180,21 @@ def test_kernel(): ) triton_naive_bw_gbps = (total_bytes / 1e9) / (triton_naive_time_us / 1e6) - # Benchmark Triton parallel - triton_parallel_time_us = benchmark_kernel( - triton_mx_block_rearrange_2d_K_groups, - e8m0_scales, + # Benchmark CUDA parallel (optimized) + cuda_parallel_time_us = benchmark_kernel( + mx_block_rearrange.mx_block_rearrange_2d_K_groups, + e8m0_scales.view(torch.uint8), scale_group_offsets, ) - triton_parallel_bw_gbps = (total_bytes / 1e9) / (triton_parallel_time_us / 1e6) + cuda_parallel_bw_gbps = (total_bytes / 1e9) / (cuda_parallel_time_us / 1e6) - # Benchmark CUDA - cuda_time_us = benchmark_kernel( - mx_block_rearrange.mx_block_rearrange_2d_K_groups, + # Benchmark CUDA naive + cuda_naive_time_us = benchmark_kernel( + mx_block_rearrange.mx_block_rearrange_2d_K_groups_naive, e8m0_scales.view(torch.uint8), scale_group_offsets, ) - cuda_bw_gbps = (total_bytes / 1e9) / (cuda_time_us / 1e6) + cuda_naive_bw_gbps = (total_bytes / 1e9) / (cuda_naive_time_us / 1e6) # Print results print("\nResults:") @@ -186,19 +207,19 @@ def test_kernel(): f"{'Triton Naive':<25} {triton_naive_time_us:<15.2f} {triton_naive_bw_gbps:<20.2f} {'1.00x':<10}" ) print( - f"{'Triton Parallel':<25} {triton_parallel_time_us:<15.2f} {triton_parallel_bw_gbps:<20.2f} {triton_naive_time_us / triton_parallel_time_us:<10.2f}x" + f"{'CUDA Naive':<25} {cuda_naive_time_us:<15.2f} {cuda_naive_bw_gbps:<20.2f} {triton_naive_time_us / cuda_naive_time_us:<10.2f}x" ) print( - f"{'CUDA (Optimized)':<25} {cuda_time_us:<15.2f} {cuda_bw_gbps:<20.2f} {triton_naive_time_us / cuda_time_us:<10.2f}x" + f"{'CUDA Parallel':<25} 
{cuda_parallel_time_us:<15.2f} {cuda_parallel_bw_gbps:<20.2f} {triton_naive_time_us / cuda_parallel_time_us:<10.2f}x" ) print() # Highlight best performer - best_bw = max(triton_naive_bw_gbps, triton_parallel_bw_gbps, cuda_bw_gbps) - if cuda_bw_gbps == best_bw: - print("🏆 CUDA kernel achieves highest memory bandwidth!") - elif triton_parallel_bw_gbps == best_bw: - print("🏆 Triton parallel kernel achieves highest memory bandwidth!") + best_bw = max(triton_naive_bw_gbps, cuda_naive_bw_gbps, cuda_parallel_bw_gbps) + if cuda_parallel_bw_gbps == best_bw: + print("🏆 CUDA parallel kernel achieves highest memory bandwidth!") + elif cuda_naive_bw_gbps == best_bw: + print("🏆 CUDA naive kernel achieves highest memory bandwidth!") else: print("🏆 Triton naive kernel achieves highest memory bandwidth!") From f838d7bac7450bed8449aa91d310c650ed4b968d Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre Date: Sun, 7 Dec 2025 19:30:30 -0800 Subject: [PATCH 4/4] remove triton parallel and cuda naive impls --- ...h_triton_mx_block_rearrange_2d_K_groups.py | 29 +-- .../mx_block_rearrange_2d_K_groups.cu | 134 +---------- .../csrc/cuda/mx_kernels/mxfp8_extension.cpp | 77 ------- .../test_mx_block_rearrange_standalone.py | 65 ++---- .../moe_training/kernels/mxfp8/quant.py | 216 ++---------------- 5 files changed, 53 insertions(+), 468 deletions(-) diff --git a/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py index f473166e4b..d782b01f4d 100644 --- a/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py +++ b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py @@ -15,11 +15,8 @@ from tqdm import tqdm from benchmarks.utils import benchmark_cuda_function_in_microseconds -from torchao.prototype.moe_training.kernels.mxfp8 import ( - triton_mx_block_rearrange_2d_K_groups, -) from torchao.prototype.moe_training.kernels.mxfp8.quant import ( - triton_mx_block_rearrange_2d_K_groups_naive, + triton_mx_block_rearrange_2d_K_groups, ) from torchao.prototype.moe_training.utils import generate_jagged_offs @@ -101,7 +98,7 @@ def get_configs() -> List[ExperimentConfig]: (2048, 131072 // block_size), ] num_groups = [8] - versions = ["triton_naive", "triton_parallel", "cuda_parallel", "cuda_naive"] + versions = ["triton", "cuda_parallel"] configs = [] for shape, groups, version in itertools.product( @@ -138,18 +135,12 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult: input_group_offsets = generate_jagged_offs(num_groups, Kg, multiple_of=block_size) # Select which kernel to benchmark based on version - if version == "triton_naive": - kernel_fn = triton_mx_block_rearrange_2d_K_groups_naive - elif version == "triton_parallel": + if version == "triton": kernel_fn = triton_mx_block_rearrange_2d_K_groups elif version == "cuda_parallel": if mxfp8_cuda is None: raise RuntimeError("CUDA kernel not available") kernel_fn = mxfp8_cuda.mx_block_rearrange_2d_K_groups - elif version == "cuda_naive": - if mxfp8_cuda is None: - raise RuntimeError("CUDA kernel not available") - kernel_fn = mxfp8_cuda.mx_block_rearrange_2d_K_groups_naive else: raise ValueError(f"Unknown version: {version}") @@ -197,7 +188,7 @@ def print_results(experiments: List[Experiment]): "time_us", "mem_bw_gbps", "fastest_version", - "speedup_vs_triton_naive", + "speedup_vs_triton", ] rows = [] @@ -205,17 +196,17 @@ def print_results(experiments: List[Experiment]): # Find 
fastest version for this shape
         fastest_version = min(versions.items(), key=lambda x: x[1].time_us)[0]
 
-        # Get naive baseline time for speedup calculation
-        naive_time_us = (
-            versions.get("triton_naive").time_us if "triton_naive" in versions else None
+        # Get triton baseline time for speedup calculation
+        triton_time_us = (
+            versions.get("triton").time_us if "triton" in versions else None
         )
 
         # Add rows for each version
         for version, result in versions.items():
-            # Calculate speedup vs naive
+            # Calculate speedup vs triton
             speedup_str = ""
-            if naive_time_us and naive_time_us > 0:
-                speedup = naive_time_us / result.time_us
+            if version != "triton" and triton_time_us is not None and triton_time_us > 0:
+                speedup = triton_time_us / result.time_us
                 speedup_str = f"{speedup:.2f}x"
 
             rows.append(
diff --git a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
index a904e472ed..6d36455528 100644
--- a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
+++ b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
@@ -66,95 +66,7 @@ __device__ __forceinline__ int compute_swizzled_index(int row, int col) {
   int r_mod_32 = row % 32;
   return r_mod_32 * 16 + r_div_32 * 4 + col;
 }
-__global__ void mx_block_rearrange_2d_K_groups_naive_kernel(
-    const uint8_t* __restrict__ scales_ptr,
-    int scales_stride_dim0,
-    int scale_rows,
-    int scale_cols,
-    int padded_rows,
-    const int32_t* __restrict__ input_group_end_offsets,
-    uint8_t* __restrict__ output_scales_ptr,
-    int output_stride_per_block,
-    int num_groups
-) {
-  const int group_id = blockIdx.x;
-  const int block_row_id = blockIdx.y;
-  const int tid = threadIdx.x;
-
-  __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS];
-
-  int input_group_start_col = (group_id > 0) ? input_group_end_offsets[group_id - 1] : 0;
-  int input_group_end_col = input_group_end_offsets[group_id];
-  int num_cols_in_group = input_group_end_col - input_group_start_col;
-
-  int output_group_start_col = compute_output_group_start_col(
-      group_id, input_group_end_offsets, num_groups, 4);
-
-  int out_group_base_offset = output_group_start_col * padded_rows;
-
-  int num_col_blocks_in_group = ceil_div(num_cols_in_group, BLOCK_COLS);
-  int stride_per_row_of_blocks_in_group = num_col_blocks_in_group * output_stride_per_block;
-
-  int input_row = block_row_id * BLOCK_ROWS + tid;
-
-  int curr_input_start_col = input_group_start_col;
-  int curr_out_col_block = 0;
-
-  while (curr_input_start_col < input_group_end_col) {
-    int cols_remaining = input_group_end_col - curr_input_start_col;
-    int cols_to_load = min(BLOCK_COLS, cols_remaining);
-
-    uint32_t row_data = 0;
-    if (input_row < scale_rows && curr_input_start_col < input_group_end_col) {
-      int input_offset = input_row * scales_stride_dim0 + curr_input_start_col;
-      const uint8_t* input_ptr = scales_ptr + input_offset;
-
-      uintptr_t ptr_addr = reinterpret_cast<uintptr_t>(input_ptr);
-      if (cols_to_load >= 4 && ptr_addr % 4 == 0 && curr_input_start_col + 4 <= input_group_end_col) {
-        row_data = __ldg(reinterpret_cast<const uint32_t*>(input_ptr));
-      } else {
-        uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
-        for (int i = 0; i < cols_to_load && (curr_input_start_col + i) < input_group_end_col; i++) {
-          row_bytes[i] = __ldg(input_ptr + i);
-        }
-      }
-    }
-
-    uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
-    #pragma unroll
-    for (int col = 0; col < BLOCK_COLS; col++) {
-      int swizzled_idx = compute_swizzled_index(tid, col);
-      smem_block[swizzled_idx] = row_bytes[col];
-    }
-
-    __syncthreads();
-
-    int offset_in_group = block_row_id * stride_per_row_of_blocks_in_group +
-                          curr_out_col_block * output_stride_per_block;
-    int final_offset = out_group_base_offset + offset_in_group;
-
-    uint8_t* output_ptr = output_scales_ptr + final_offset + tid * BLOCK_COLS;
-    uintptr_t out_ptr_addr = reinterpret_cast<uintptr_t>(output_ptr);
-
-    if (out_ptr_addr % 4 == 0 && cols_to_load >= 4) {
-      *reinterpret_cast<uint32_t*>(output_ptr) =
-          *reinterpret_cast<const uint32_t*>(&smem_block[tid * BLOCK_COLS]);
-    } else {
-      const uint8_t* smem_ptr = &smem_block[tid * BLOCK_COLS];
-      #pragma unroll
-      for (int i = 0; i < cols_to_load; i++) {
-        output_ptr[i] = smem_ptr[i];
-      }
-    }
-    curr_input_start_col += BLOCK_COLS;
-    curr_out_col_block += 1;
-
-    if (curr_input_start_col < input_group_end_col) {
-      __syncthreads();
-    }
-  }
-}
 __global__ void mx_block_rearrange_2d_K_groups_parallel_kernel(
     const uint8_t* __restrict__ scales_ptr,
     int scales_stride_dim0,
@@ -172,6 +84,7 @@ __global__ void mx_block_rearrange_2d_K_groups_parallel_kernel(
 
   __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS];
   __shared__ int smem_cumsum[32];
+  __shared__ int output_group_start_col;
 
   int group_id, local_col_block;
   find_group_and_local_offset(
@@ -191,6 +104,12 @@
     return;
   }
 
+  if (tid == 0) {
+    output_group_start_col = compute_output_group_start_col(
+        group_id, input_group_end_offsets, num_groups, 4
+    );
+  }
+
   int input_row = row_block_pid * BLOCK_ROWS + tid;
   int cols_remaining = input_group_end_col - curr_input_start_col;
   int cols_to_load = min(BLOCK_COLS, cols_remaining);
@@ -220,9 +139,6 @@
 
   __syncthreads();
 
-  int output_group_start_col = compute_output_group_start_col(
-      group_id, input_group_end_offsets, num_groups, 4
-  );
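// [Editor's note, illustrative only] Worked example of the output addressing
// below, assuming BLOCK_ROWS = 128 and BLOCK_COLS = 4, so each swizzled block
// occupies output_stride_per_block = 128 * 4 = 512 bytes: with
// padded_rows = 256 and a group whose swizzled region starts at output
// column 8, out_group_base_offset = 8 * 256 = 2048. A block at
// row_block_pid = 1, local_col_block = 2 in a group spanning 3 column blocks
// then starts at byte 2048 + 1 * (3 * 512) + 2 * 512 = 4608.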
   int out_group_base_offset = output_group_start_col * padded_rows;
 
   int num_cols_in_group = input_group_end_col - input_group_start_col;
@@ -247,42 +163,8 @@
     }
   }
 }
-namespace mxfp8 {
-void launch_mx_block_rearrange_2d_K_groups_naive(
-    const uint8_t* scales_ptr,
-    int scales_stride_dim0,
-    int scale_rows,
-    int scale_cols,
-    int padded_rows,
-    const int32_t* input_group_end_offsets,
-    uint8_t* output_scales_ptr,
-    int num_groups,
-    cudaStream_t stream
-) {
-  int num_row_blocks = (scale_rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
-  int output_stride_per_block = BLOCK_ROWS * BLOCK_COLS;
-
-  dim3 grid(num_groups, num_row_blocks);
-  dim3 block(128);
-
-  mx_block_rearrange_2d_K_groups_naive_kernel<<<grid, block, 0, stream>>>(
-      scales_ptr,
-      scales_stride_dim0,
-      scale_rows,
-      scale_cols,
-      padded_rows,
-      input_group_end_offsets,
-      output_scales_ptr,
-      output_stride_per_block,
-      num_groups
-  );
-
-  cudaError_t err = cudaGetLastError();
-  if (err != cudaSuccess) {
-    printf("CUDA Error (naive): %s\n", cudaGetErrorString(err));
-  }
-}
+namespace mxfp8 {
 void launch_mx_block_rearrange_2d_K_groups(
     const uint8_t* scales_ptr,
     int scales_stride_dim0,
diff --git a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
index 2690ba1449..7fba7da2d9 100644
--- a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
+++ b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
@@ -36,17 +36,6 @@ void launch_mx_block_rearrange_2d_K_groups(
     int num_groups,
     cudaStream_t stream);
 
-void launch_mx_block_rearrange_2d_K_groups_naive(
-    const uint8_t* scales_ptr,
-    int scales_stride_dim0,
-    int scale_rows,
-    int scale_cols,
-    int padded_rows,
-    const int32_t* input_group_end_offsets,
-    uint8_t* output_scales_ptr,
-    int num_groups,
-    cudaStream_t stream);
-
 // Helper for tensor validation
 void check_cuda_tensor(const torch::Tensor &t, const char *name) {
   TORCH_CHECK(t.is_cuda(), name, " must be a CUDA tensor");
@@ -259,66 +248,6 @@ torch::Tensor mx_block_rearrange_2d_K_groups(
   return output;
 }
 
-// Python wrapper for mx_block_rearrange_2d_K_groups_naive
-torch::Tensor mx_block_rearrange_2d_K_groups_naive(
-    torch::Tensor scales_tensor,
-    torch::Tensor input_group_end_offsets) {
-
-  // Validate inputs
-  check_cuda_tensor(scales_tensor, "scales_tensor");
-  check_cuda_tensor(input_group_end_offsets, "input_group_end_offsets");
-
-  TORCH_CHECK(scales_tensor.dim() == 2, "scales_tensor must be 2D");
-  TORCH_CHECK(scales_tensor.scalar_type() == torch::kUInt8 ||
-                  scales_tensor.scalar_type() == torch::kFloat8_e8m0fnu,
-              "scales_tensor must be uint8 or e8m0");
-  TORCH_CHECK(input_group_end_offsets.scalar_type() == torch::kInt32,
-              "input_group_end_offsets must be int32");
-  TORCH_CHECK(input_group_end_offsets.dim() == 1,
-              "input_group_end_offsets must be 1D");
-
-  c10::cuda::CUDAGuard device_guard(scales_tensor.device());
-
-  const int rows = scales_tensor.size(0);
-  const int cols = scales_tensor.size(1);
-  const int num_groups = input_group_end_offsets.size(0);
-  TORCH_CHECK(num_groups <= 32, "num_groups must be <= 32");
-
-  // Calculate blocks needed
-  const int BLOCK_ROWS = 128;
-  const int BLOCK_COLS = 4;
-  const int num_row_blocks = (rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
-  const int padded_rows = num_row_blocks * BLOCK_ROWS;
-
-  // Padding per group is variable/data dependent, so pad each group by upper bound
-  const int padded_cols = cols + num_groups * BLOCK_COLS;
-
-  // Create output tensor
-  auto output = torch::zeros({padded_rows, padded_cols},
-                             torch::TensorOptions()
-                                 .dtype(scales_tensor.scalar_type())
-                                 .device(scales_tensor.device()));
-
-  // Get raw pointers
-  const uint8_t* scales_ptr = scales_tensor.data_ptr<uint8_t>();
-  const int32_t* offsets_ptr = input_group_end_offsets.data_ptr<int32_t>();
-  uint8_t* output_ptr = output.data_ptr<uint8_t>();
-
-  // Launch naive kernel (original with while loop)
-  launch_mx_block_rearrange_2d_K_groups_naive(
-      scales_ptr,
-      scales_tensor.stride(0),
-      rows,
-      cols,
-      padded_rows,
-      offsets_ptr,
-      output_ptr,
-      num_groups,
-      at::cuda::getCurrentCUDAStream());
-
-  return output;
-}
-
 } // namespace mxfp8
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
@@ -340,10 +269,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         "Rearrange E8M0 scales to block-scaled swizzle format (parallelized)",
         py::arg("scales_tensor"),
         py::arg("input_group_end_offsets"));
-
-  m.def("mx_block_rearrange_2d_K_groups_naive",
-        &mxfp8::mx_block_rearrange_2d_K_groups_naive,
-        "Rearrange E8M0 scales to block-scaled swizzle format (naive version)",
-        py::arg("scales_tensor"),
-        py::arg("input_group_end_offsets"));
 }
diff --git a/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
index 764d9953da..00733d60a7 100644
--- a/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
+++ b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
@@ -69,7 +69,7 @@ def test_kernel():
     sys.path.insert(0, ao_root)
 
     from torchao.prototype.moe_training.kernels.mxfp8.quant import (
-        triton_mx_block_rearrange_2d_K_groups_naive,
+        triton_mx_block_rearrange_2d_K_groups,
     )
     from torchao.prototype.moe_training.utils import generate_jagged_offs
     from torchao.prototype.mx_formats.mx_tensor import to_mx
@@ -122,46 +122,26 @@ def test_kernel():
     )
     print("✓ CUDA parallel kernel completed successfully")
 
-    # Test CUDA naive kernel
-    print("\n" + "-" * 80)
-    print("Running CUDA naive kernel...")
-    cuda_naive_out_scales = mx_block_rearrange.mx_block_rearrange_2d_K_groups_naive(
-        e8m0_scales.view(torch.uint8),
-        scale_group_offsets,
-    )
-    print("✓ CUDA naive kernel completed successfully")
-
     output_bytes = cuda_parallel_out_scales.numel() * bytes_per_element
     total_bytes = input_bytes + output_bytes
 
-    # Compare with Triton naive reference
+    # Compare with Triton reference
     print("\n" + "-" * 80)
-    print("Running Triton naive reference kernel...")
-    triton_naive_out = triton_mx_block_rearrange_2d_K_groups_naive(
+    print("Running Triton reference kernel...")
+    triton_out = triton_mx_block_rearrange_2d_K_groups(
         e8m0_scales,
         scale_group_offsets,
     )
-    print("✓ Triton naive kernel completed successfully")
+    print("✓ Triton kernel completed successfully")
 
     # Verify correctness
     cuda_parallel_out_e8m0 = cuda_parallel_out_scales.view(torch.float8_e8m0fnu)
-    cuda_naive_out_e8m0 = cuda_naive_out_scales.view(torch.float8_e8m0fnu)
 
     print("\nVerifying correctness...")
-    if not torch.equal(triton_naive_out, cuda_naive_out_e8m0):
-        print("✗ CUDA naive and Triton naive outputs differ!")
-        return False
-    print("✓ CUDA naive matches Triton naive")
-
-    if not torch.equal(triton_naive_out, cuda_parallel_out_e8m0):
-        print("✗ CUDA parallel and Triton naive outputs differ!")
+    if not torch.equal(triton_out, cuda_parallel_out_e8m0):
+        print("✗ CUDA parallel and Triton outputs differ!")
         return False
-    print("✓ CUDA parallel matches Triton naive")
-
-    if not torch.equal(cuda_naive_out_e8m0, cuda_parallel_out_e8m0):
-        print("✗ CUDA naive and CUDA parallel outputs differ!")
         return False
-    print("✓ CUDA naive and 
CUDA parallel match each other") + print("✓ CUDA parallel matches Triton") print("\n✓ All outputs are IDENTICAL!") @@ -172,13 +152,13 @@ def test_kernel(): print("\nBenchmarking kernels (100 iterations each)...") - # Benchmark Triton naive - triton_naive_time_us = benchmark_kernel( - triton_mx_block_rearrange_2d_K_groups_naive, + # Benchmark Triton + triton_time_us = benchmark_kernel( + triton_mx_block_rearrange_2d_K_groups, e8m0_scales, scale_group_offsets, ) - triton_naive_bw_gbps = (total_bytes / 1e9) / (triton_naive_time_us / 1e6) + triton_bw_gbps = (total_bytes / 1e9) / (triton_time_us / 1e6) # Benchmark CUDA parallel (optimized) cuda_parallel_time_us = benchmark_kernel( @@ -188,14 +168,6 @@ def test_kernel(): ) cuda_parallel_bw_gbps = (total_bytes / 1e9) / (cuda_parallel_time_us / 1e6) - # Benchmark CUDA naive - cuda_naive_time_us = benchmark_kernel( - mx_block_rearrange.mx_block_rearrange_2d_K_groups_naive, - e8m0_scales.view(torch.uint8), - scale_group_offsets, - ) - cuda_naive_bw_gbps = (total_bytes / 1e9) / (cuda_naive_time_us / 1e6) - # Print results print("\nResults:") print(f" Input size: {input_bytes / 1e6:.2f} MB") @@ -204,24 +176,19 @@ def test_kernel(): print(f"{'Kernel':<25} {'Time (μs)':<15} {'Bandwidth (GB/s)':<20} {'Speedup':<10}") print("-" * 70) print( - f"{'Triton Naive':<25} {triton_naive_time_us:<15.2f} {triton_naive_bw_gbps:<20.2f} {'1.00x':<10}" - ) - print( - f"{'CUDA Naive':<25} {cuda_naive_time_us:<15.2f} {cuda_naive_bw_gbps:<20.2f} {triton_naive_time_us / cuda_naive_time_us:<10.2f}x" + f"{'Triton':<25} {triton_time_us:<15.2f} {triton_bw_gbps:<20.2f} {'1.00x':<10}" ) print( - f"{'CUDA Parallel':<25} {cuda_parallel_time_us:<15.2f} {cuda_parallel_bw_gbps:<20.2f} {triton_naive_time_us / cuda_parallel_time_us:<10.2f}x" + f"{'CUDA Parallel':<25} {cuda_parallel_time_us:<15.2f} {cuda_parallel_bw_gbps:<20.2f} {triton_time_us / cuda_parallel_time_us:<10.2f}x" ) print() # Highlight best performer - best_bw = max(triton_naive_bw_gbps, cuda_naive_bw_gbps, cuda_parallel_bw_gbps) + best_bw = max(triton_bw_gbps, cuda_parallel_bw_gbps) if cuda_parallel_bw_gbps == best_bw: print("🏆 CUDA parallel kernel achieves highest memory bandwidth!") - elif cuda_naive_bw_gbps == best_bw: - print("🏆 CUDA naive kernel achieves highest memory bandwidth!") else: - print("🏆 Triton naive kernel achieves highest memory bandwidth!") + print("🏆 Triton kernel achieves highest memory bandwidth!") return True diff --git a/torchao/prototype/moe_training/kernels/mxfp8/quant.py b/torchao/prototype/moe_training/kernels/mxfp8/quant.py index f6a9a045d1..1678d6e711 100644 --- a/torchao/prototype/moe_training/kernels/mxfp8/quant.py +++ b/torchao/prototype/moe_training/kernels/mxfp8/quant.py @@ -238,9 +238,9 @@ def triton_mx_block_rearrange_2d_M_groups( - Rearranged tensor in block-scaled swizzle format """ assert scales_tensor.ndim == 2, "scales tensor must be 2d" - assert scales_tensor.element_size() == 1, ( - "Expected element size to be 1 byte (8 bits)" - ) + assert ( + scales_tensor.element_size() == 1 + ), "Expected element size to be 1 byte (8 bits)" rows, cols = scales_tensor.shape num_groups = input_group_end_offsets.shape[0] @@ -380,9 +380,9 @@ def triton_mx_block_rearrange_per_group_3d(scale_tensor: torch.Tensor) -> torch. 
Tensor:
     Rearranged tensor in block-scaled swizzle format
     """
     assert scale_tensor.ndim == 3, "scales tensor must be 3d"
-    assert scale_tensor.element_size() == 1, (
-        "Expected element size to be 1 byte (8 bits)"
-    )
+    assert (
+        scale_tensor.element_size() == 1
+    ), "Expected element size to be 1 byte (8 bits)"
 
     num_groups, rows, cols = scale_tensor.shape
     input_stride_dim0 = scale_tensor.stride(0)
@@ -484,12 +484,12 @@ def triton_scale_swizzle_per_group_3d(
     )
 
 
-def triton_mx_block_rearrange_2d_K_groups_naive(
+@triton_op("torchao::triton_mx_block_rearrange_2d_K_groups", mutates_args={})
+def triton_mx_block_rearrange_2d_K_groups(
     scales_tensor: torch.Tensor,
     input_group_end_offsets: torch.Tensor,
 ) -> torch.Tensor:
     """
-    Naive version with while loop (before optimization).
     Rearranges an E8M0 tensor scale to block-scaled swizzle format on a per group basis,
     where the groups are along the contraction dimension of the GEMM.
@@ -503,9 +503,9 @@ def triton_mx_block_rearrange_2d_K_groups_naive(
     - Rearranged tensor in block-scaled swizzle format
     """
     assert scales_tensor.ndim == 2, "scales tensor must be 2d"
-    assert scales_tensor.element_size() == 1, (
-        "Expected element size to be 1 byte (8 bits)"
-    )
+    assert (
+        scales_tensor.element_size() == 1
+    ), "Expected element size to be 1 byte (8 bits)"
     rows, cols = scales_tensor.shape
     # Calculate blocks needed
     num_groups = input_group_end_offsets.shape[0]
@@ -545,186 +545,6 @@
     return output
 
 
-@triton_op("torchao::triton_mx_block_rearrange_2d_K_groups", mutates_args={})
-def triton_mx_block_rearrange_2d_K_groups(
-    scales_tensor: torch.Tensor,
-    input_group_end_offsets: torch.Tensor,
-) -> torch.Tensor:
-    """
-    Parallel version (parallelized over column blocks).
-    Rearranges an E8M0 tensor scale to block-scaled swizzle format on a per group basis,
-    where the groups are along the contraction dimension of the GEMM.
-
-    This format is suitable for Tmem as described in NVIDIA documentation:
-    https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
-
-    Args:
-        scales_tensor: Input tensor containing e8m0 scales for each logical group of a target tensor.
-        input_group_end_offsets: tensor of int32 values representing group end indexes for the input scales
-    Returns:
-        - Rearranged tensor in block-scaled swizzle format
-    """
-    assert scales_tensor.ndim == 2, "scales tensor must be 2d"
-    assert scales_tensor.element_size() == 1, (
-        "Expected element size to be 1 byte (8 bits)"
-    )
-    rows, cols = scales_tensor.shape
-    # Calculate blocks needed
-    num_groups = input_group_end_offsets.shape[0]
-    num_row_blocks = ceil_div(rows, 128)
-    padded_rows = num_row_blocks * 128
-
-    # Padding needed per group is variable/data dependent, so we just pad each group by
-    # the upper bound of 4 cols to avoid a d2h sync caused by iterating over each group.
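# [Editor's note, illustrative only] Why cols + num_groups * 4 is a safe bound:
# rounding each group up to a multiple of BLOCK_COLS = 4 adds at most 3 columns
# per group. E.g. cols = 70 split across num_groups = 2 groups as (35, 35)
# rounds each group up to 36 columns, 72 total <= 70 + 2 * 4 = 78; the unused
# zero-initialized tail of the output is left untouched.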
- padded_cols = cols + num_groups * 4 - output = scales_tensor.new_zeros((padded_rows, padded_cols)) - - # Output block stride for the rearranged format - BLOCK_ROWS, BLOCK_COLS = 128, 4 - output_stride_per_block = BLOCK_ROWS * BLOCK_COLS - - # Calculate column blocks for the ORIGINAL input tensor (before padding) - # Simply divide the number of columns by BLOCK_COLS - total_col_blocks = (cols + BLOCK_COLS - 1) // BLOCK_COLS - - # Compute per-group column block counts on GPU for the kernel to use - zero = torch.zeros( - 1, dtype=input_group_end_offsets.dtype, device=scales_tensor.device - ) - group_sizes = torch.diff(input_group_end_offsets, prepend=zero) - group_col_block_counts = (group_sizes + BLOCK_COLS - 1) // BLOCK_COLS - - # We parallelize over all column blocks across all groups and row blocks - grid = lambda META: ( - total_col_blocks, - num_row_blocks, - ) - wrap_triton(triton_scale_swizzle_2d_K_groups_parallel)[grid]( - scales_tensor.view(torch.uint8), - scales_tensor.stride(0), - scales_tensor.stride(1), - rows, - cols, - padded_rows, - input_group_end_offsets, - group_col_block_counts, - output.view(torch.uint8), - output_stride_per_block, - num_groups=num_groups, - BLOCK_ROWS=BLOCK_ROWS, - BLOCK_COLS=BLOCK_COLS, - ) - return output - - -@triton.jit -def triton_scale_swizzle_2d_K_groups_parallel( - scales_ptr, # (M, total_K//block_size) - scales_stride_dim0, - scales_stride_dim1, - scale_rows, - scale_cols, - padded_rows, - orig_offsets, # (num_groups,) - group_col_block_counts, # (num_groups,) - number of column blocks per group - output_scales_ptr, - output_stride_per_block, - num_groups: tl.constexpr, - BLOCK_ROWS: tl.constexpr, - BLOCK_COLS: tl.constexpr, -): - """ - Parallel version that parallelizes over column blocks. - Each thread block processes exactly one (row_block, col_block) pair. - Uses simple linear search to find which group a column block belongs to. 
- """ - col_block_pid = tl.program_id(0) - row_block_pid = tl.program_id(1) - - # Vectorized search to find which group this column block belongs to - # Load all group block counts at once - group_indices = tl.arange(0, num_groups) - all_block_counts = tl.load(group_col_block_counts + group_indices) - - # Compute cumulative sums to get start/end positions of each group - # cumsum_inclusive[i] = total blocks from group 0 to i (inclusive) - cumsum_inclusive = tl.cumsum(all_block_counts, axis=0) - - # cumsum_exclusive[i] = total blocks before group i (exclusive) - # For i > 0: cumsum_exclusive[i] = cumsum_inclusive[i] - all_block_counts[i] - # For i == 0: cumsum_exclusive[i] = 0 - cumsum_exclusive = tl.where( - group_indices > 0, cumsum_inclusive - all_block_counts, 0 - ) - - # Find which group col_block_pid belongs to - # A block belongs to group i if: cumsum_exclusive[i] <= block_id < cumsum_inclusive[i] - is_in_group = (col_block_pid >= cumsum_exclusive) & ( - col_block_pid < cumsum_inclusive - ) - - # Extract the group_pid (sum of indices where condition is true) - group_pid = tl.sum(tl.where(is_in_group, group_indices, 0)) - - # Extract the local column block offset within the group - local_col_block = tl.sum(tl.where(is_in_group, col_block_pid - cumsum_exclusive, 0)) - - # Load group offset boundaries - input_group_start_col = tl.load( - orig_offsets + group_pid - 1, mask=group_pid > 0, other=0 - ) - input_group_end_col = tl.load(orig_offsets + group_pid) - - # Compute input column offset for this specific column block - curr_input_start_col = input_group_start_col + local_col_block * BLOCK_COLS - - # Early exit if beyond group boundary - if curr_input_start_col >= input_group_end_col: - return - - # Calculate this group's start col after blocked format padding - output_group_start_col = _blocked_group_start_idx( - group_pid, orig_offsets, num_groups, 4 - ) - - row_offs = tl.arange(0, BLOCK_ROWS)[:, None] - col_offs = tl.arange(0, BLOCK_COLS)[None, :] - - # Read block of input scales - block_row_offs = row_block_pid * BLOCK_ROWS + row_offs - block_col_offs = curr_input_start_col + col_offs - block_offs = ( - block_row_offs * scales_stride_dim0 + block_col_offs * scales_stride_dim1 - ) - mask = (block_row_offs < scale_rows) & (block_col_offs < input_group_end_col) - input_scales = tl.load(scales_ptr + block_offs, mask=mask, other=0.0) - scales_flat = tl.reshape(input_scales, (BLOCK_ROWS * BLOCK_COLS)) - - # Compute output offset - out_group_base_offset = output_group_start_col * padded_rows - - num_cols_in_group = input_group_end_col - input_group_start_col - num_col_blocks_in_group = tl.cdiv(num_cols_in_group, BLOCK_COLS) - stride_per_row_of_blocks_in_group = ( - num_col_blocks_in_group * output_stride_per_block - ) - - offset_in_group = ( - row_block_pid * stride_per_row_of_blocks_in_group - + local_col_block * output_stride_per_block - ) - final_offset = out_group_base_offset + offset_in_group - - # Apply swizzling and write - dest_indices_flat = _dest_indices_for_block( - row_offs, col_offs, BLOCK_ROWS, BLOCK_COLS - ) - tl.store( - output_scales_ptr + final_offset + dest_indices_flat, - scales_flat, - ) - - @triton.jit def triton_scale_swizzle_2d_K_groups( scales_ptr, # (M, total_K//block_size) @@ -887,9 +707,10 @@ def mxfp8_quantize_cuda_3d( torch.Tensor: scales tensor """ assert x.ndim == 3, "Input tensor must be 3D" - assert x.dtype in (torch.float32, torch.bfloat16), ( - "Input tensor must be float32 or bfloat16" - ) + assert x.dtype in ( + torch.float32, + torch.bfloat16, + ), 
"Input tensor must be float32 or bfloat16" q_data, scales = mxfp8_cuda.quantize_3d( x, scale_dim_n=block_size, scaling_mode=scaling_mode ) @@ -902,9 +723,10 @@ def _fake_mxfp8_quantize_cuda_3d( scaling_mode: str = "floor", ) -> Tuple[torch.Tensor, torch.Tensor]: assert x.ndim == 3, "Input tensor must be 3D" - assert x.dtype in (torch.float32, torch.bfloat16), ( - "Input tensor must be float32 or bfloat16" - ) + assert x.dtype in ( + torch.float32, + torch.bfloat16, + ), "Input tensor must be float32 or bfloat16" E, N, K = x.shape # Quantized tensor is in column major layouts q_data = x.new_empty(x.shape, dtype=torch.float8_e4m3fn).as_strided(