From 825340e6384703b077c8dc19939409bab6cc4e6e Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre
Date: Tue, 2 Dec 2025 14:29:32 -0800
Subject: [PATCH 1/4] [mxfp8 moe training] parallelize along col blocks in
 scale blocked format kernel for groups along K

stack-info: PR: https://github.com/pytorch/ao/pull/3416, branch: danielvegamyhre/stack/85
---
 ...h_triton_mx_block_rearrange_2d_K_groups.py |  229 ++
 log.txt                                       | 1952 +++++++++++++++++
 setup.py                                      |    1 +
 test/prototype/moe_training/test_kernels.py   |   59 +
 .../mx_block_rearrange_2d_K_groups.cu         |  199 ++
 .../csrc/cuda/mx_kernels/mxfp8_extension.cpp  |   77 +
 .../test_mx_block_rearrange_standalone.py     |  217 ++
 .../moe_training/kernels/mxfp8/quant.py       |  188 +-
 .../moe_training/scaled_grouped_mm.py         |    8 +-
 torchao/prototype/mx_formats/kernels.py       |    9 +-
 10 files changed, 2927 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
 create mode 100644 log.txt
 create mode 100644 torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
 create mode 100644 torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py

diff --git a/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
new file mode 100644
index 0000000000..35245f13bf
--- /dev/null
+++ b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
@@ -0,0 +1,229 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+import os
+from dataclasses import dataclass
+from typing import List
+
+import torch
+from tabulate import tabulate
+from torch.utils.cpp_extension import load
+from tqdm import tqdm
+
+from benchmarks.utils import benchmark_cuda_function_in_microseconds
+from torchao.prototype.moe_training.kernels.mxfp8 import (
+    triton_mx_block_rearrange_2d_K_groups,
+)
+from torchao.prototype.moe_training.kernels.mxfp8.quant import (
+    triton_mx_block_rearrange_2d_K_groups_naive,
+)
+from torchao.prototype.moe_training.utils import generate_jagged_offs
+
+# Build CUDA kernel directly using torch.utils.cpp_extension.load
+mxfp8_cuda = None
+try:
+    # Get the kernel source directory
+    KERNEL_DIR = os.path.join(
+        os.path.dirname(os.path.abspath(__file__)),
+        "..",
+        "..",
+        "..",
+        "..",
+        "torchao",
+        "csrc",
+        "cuda",
+        "mx_kernels",
+    )
+    KERNEL_DIR = os.path.normpath(KERNEL_DIR)
+
+    print("Compiling CUDA kernel...")
+    mxfp8_cuda = load(
+        name="mxfp8_cuda",
+        sources=[
+            os.path.join(KERNEL_DIR, "mxfp8_extension.cpp"),
+            os.path.join(KERNEL_DIR, "mxfp8_cuda.cu"),
+            os.path.join(KERNEL_DIR, "mx_block_rearrange_2d_K_groups.cu"),
+        ],
+        extra_cuda_cflags=[
+            "-O3",
+            "--use_fast_math",
+            "-std=c++17",
+            "-gencode=arch=compute_90,code=sm_90",
+        ],
+        extra_cflags=["-O3", "-std=c++17"],
+        verbose=True,
+    )
+    print("✓ CUDA kernel compilation successful!")
+except (ImportError, RuntimeError) as e:
+    print(f"⚠ CUDA kernel not available: {e}")
+    print("The benchmark will only run 'naive' and 'parallel' Triton versions.\n")
+
+device = torch.device("cuda")
+
+# Needed since changing args to function causes recompiles
+torch._dynamo.config.cache_size_limit = 1000
+
+
+@dataclass(frozen=True)
+class ExperimentConfig:
+    input_shape: tuple[int, int]
+    num_groups: int
+    version: str  # "naive", "parallel", or "cuda"
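+
+
+# For reference, a rough pure-PyTorch sketch of what the benchmarked kernels
+# compute (a minimal, unoptimized illustration, not one of the benchmarked
+# implementations): each contiguous K-group of scale columns is swizzled into
+# the blocked scale layout independently, and the per-group results are
+# concatenated. This assumes `to_blocked` from
+# torchao.prototype.mx_formats.utils implements the target blocked layout.
+def reference_rearrange_2d_K_groups(
+    scales: torch.Tensor, group_end_offsets: torch.Tensor
+) -> torch.Tensor:
+    from torchao.prototype.mx_formats.utils import to_blocked
+
+    outs, start = [], 0
+    for end in group_end_offsets.tolist():
+        # Rearrange this group's slice of scale columns on its own.
+        outs.append(to_blocked(scales[:, start:end]).reshape(-1))
+        start = end
+    return torch.cat(outs)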
+
+
+@dataclass(frozen=True)
+class ExperimentResult:
+    time_us: float
+    mem_bw_gbps: float
+
+
+@dataclass(frozen=True)
+class Experiment:
+    config: ExperimentConfig
+    result: ExperimentResult
+
+
+def get_configs() -> List[ExperimentConfig]:
+    # Llama4 and DSV3 671b shapes. Inputs are the e8m0 scales of activations
+    # quantized along the contraction dim K, which for these grouped GEMMs is
+    # the token dim (total_M) containing all the token groups, hence
+    # K // block_size scale columns.
+    block_size = 32
+    input_shapes = [
+        (5120, 16384 // block_size),
+        (5120, 131072 // block_size),
+        (8192, 16384 // block_size),
+        (8192, 131072 // block_size),
+        (7168, 16384 // block_size),
+        (7168, 131072 // block_size),
+        (2048, 16384 // block_size),
+        (2048, 131072 // block_size),
+    ]
+    num_groups = [8]
+    # Only include the CUDA version if the extension compiled successfully.
+    versions = ["naive", "parallel"]
+    if mxfp8_cuda is not None:
+        versions.append("cuda")
+
+    configs = []
+    for shape, groups, version in itertools.product(
+        input_shapes,
+        num_groups,
+        versions,
+    ):
+        configs.append(
+            ExperimentConfig(
+                input_shape=shape,
+                num_groups=groups,
+                version=version,
+            )
+        )
+    return configs
+
+
+def run_experiment(config: ExperimentConfig) -> ExperimentResult:
+    input_shape, num_groups, version = (
+        config.input_shape,
+        config.num_groups,
+        config.version,
+    )
+    input_tensor = torch.randint(
+        low=0,
+        high=256,
+        size=input_shape,
+        dtype=torch.uint8,
+        device=device,
+    )
+
+    M, Kg = input_shape
+    block_size = 32
+    # Jagged group-end offsets along the scale-column dim; each offset is a
+    # cumulative group boundary that is a multiple of block_size.
+    input_group_offsets = generate_jagged_offs(num_groups, Kg, multiple_of=block_size)
+
+    # Select which kernel to benchmark based on version
+    if version == "naive":
+        kernel_fn = triton_mx_block_rearrange_2d_K_groups_naive
+    elif version == "parallel":
+        kernel_fn = triton_mx_block_rearrange_2d_K_groups
+    elif version == "cuda":
+        kernel_fn = mxfp8_cuda.mx_block_rearrange_2d_K_groups
+    else:
+        raise ValueError(f"Unknown version: {version}")
+
+    # Run kernel once to get output shape
+    out_scales = kernel_fn(
+        input_tensor,
+        input_group_offsets,
+    )
+
+    # Benchmark the kernel
+    assert input_tensor.is_contiguous()
+    time_us = benchmark_cuda_function_in_microseconds(
+        kernel_fn,
+        input_tensor,
+        input_group_offsets,
+    )
+
+    # Calculate memory bandwidth. Both input and output hold e8m0 scales,
+    # i.e. 1 byte per element.
+    bytes_per_input_el = torch.finfo(torch.float8_e8m0fnu).bits / 8
+    bytes_per_output_el = torch.finfo(torch.float8_e8m0fnu).bits / 8
+
+    read_bytes = input_tensor.numel() * bytes_per_input_el
+    write_bytes = out_scales.numel() * bytes_per_output_el
+
+    mem_bw_gbps = ((read_bytes + write_bytes) / 1e9) / (time_us / 1e6)
+
+    return ExperimentResult(
+        time_us=time_us,
+        mem_bw_gbps=mem_bw_gbps,
+    )
+
+
+def print_results(experiments: List[Experiment]):
+    # Group experiments by input shape
+    shapes_dict = {}
+    for exp in experiments:
+        shape_key = exp.config.input_shape
+        if shape_key not in shapes_dict:
+            shapes_dict[shape_key] = {}
+        shapes_dict[shape_key][exp.config.version] = exp.result
+
+    headers = [
+        "kernel_version",
+        "input_shape",
+        "time_us",
+        "mem_bw_gbps",
+        "fastest_version",
+    ]
+
+    rows = []
+    for shape, versions in shapes_dict.items():
+        # Find fastest version for this shape
+        fastest_version = min(versions.items(), key=lambda x: x[1].time_us)[0]
+
+        # Add rows for each version
+        for version, result in versions.items():
+            rows.append(
+                [
+                    version,
+                    f"({shape[0]}, {shape[1]})",
+                    f"{result.time_us:.2f}",
+                    round(result.mem_bw_gbps, 3),
+                    fastest_version,
+                ]
+            )
+
+    print(tabulate(rows, headers=headers))
+
+
+def main():
+    torch.random.manual_seed(123)
+    configs = get_configs()
+    results = []
+    for config in tqdm(configs):
+        result = run_experiment(config)
+        results.append(Experiment(config=config, result=result))
+
+    # Use tabulate to print results
+    print_results(results)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/log.txt b/log.txt
new file mode 100644
index 0000000000..e891bce92e
--- /dev/null
+++ b/log.txt
@@ -0,0 +1,1952 @@
+============================= test session starts ==============================
+platform linux -- Python 3.12.12, pytest-8.4.2, pluggy-1.6.0 -- /home/danvm/.conda/envs/release/bin/python3.12
+cachedir: .pytest_cache
+hypothesis profile 'default'
+rootdir: /home/danvm/ao
+configfile: pyproject.toml
+plugins: hypothesis-6.148.6, anyio-4.12.0
+collecting ... TMA benchmarks will be running without grid constant TMA descriptor.
+NOTE: Using slow Hadamard transform for SpinQuant. For better performance on GPU, install `fast_hadamard_transform`: `pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git`
+collected 10930 items / 9 skipped
+
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config0] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config1] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config2] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config3] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config4] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config5] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config6] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config7] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config8] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config9] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config10] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config11] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config12] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config13] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config14] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config15] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config16] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config17] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config18] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config19] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config20] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config21] PASSED
+test/core/test_config.py::test_reconstructable_dict_file_round_trip[config22] PASSED
+test/core/test_config.py::test_granularity_serialization[granularity0] TMA benchmarks will be running without grid constant TMA descriptor.
+PASSED
+test/core/test_config.py::test_granularity_serialization[granularity1] TMA benchmarks will be running without grid constant TMA descriptor.
+PASSED
+test/core/test_config.py::test_granularity_serialization[granularity2] TMA benchmarks will be running without grid constant TMA descriptor.
+PASSED +test/core/test_config.py::test_disallowed_modules PASSED +test/core/test_config.py::test_version_mismatch PASSED +test/core/test_config.py::test_default_version PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant0 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant1 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant2 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant3 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant4 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_copy__mismatch_metadata_apply_quant5 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_print_quantized_module FAILED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_register_new_dispatch PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_tensor_core_layout_transpose PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant0 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant1 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant2 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant3 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant4 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_test_copy__apply_apply_quant5 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_affine_quantized_intx_static PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_device_apply_quant0 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_device_apply_quant1 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_device_apply_quant2 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_device_apply_quant3 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_to_device_apply_quant4 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantized::test_weights_only PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_alias_device_cpu_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_alias_device_cuda_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_flatten_unflatten_device_cpu_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_flatten_unflatten_device_cuda_bfloat16 FAILED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_matmul_cuda_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_mm_int4wo_cuda_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_slice_and_copy_int4wo_cuda_bfloat16 PASSED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_slice_gemlite_cuda_bfloat16 SKIPPED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_slice_gemlite_cuda_float16 SKIPPED +test/dtypes/test_affine_quantized.py::TestAffineQuantizedBasic::test_slice_int4wo_cuda_bfloat16 PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_choose_scale_float8_bounds_float8_e4m3fn_bfloat16 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_choose_scale_float8_bounds_float8_e4m3fn_float32 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_choose_scale_float8_bounds_float8_e5m2_bfloat16 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_choose_scale_float8_bounds_float8_e5m2_float32 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_bfloat16_block_size0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_bfloat16_block_size1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_bfloat16_block_size2 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_bfloat16_block_size3 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_float32_block_size0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_float32_block_size1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_float32_block_size2 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e4m3fn_float32_block_size3 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_bfloat16_block_size0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_bfloat16_block_size1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_bfloat16_block_size2 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_bfloat16_block_size3 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_float32_block_size0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_float32_block_size1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_float32_block_size2 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_float8_e5m2_float32_block_size3 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_dequantize_affine_float8_scale_broadcasting PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_expected_kernels_on_gpu_granularity0_float8_config_version_1 frames [('total', 1), ('ok', 1)] +stats [('calls_captured', 1), ('unique_graphs', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 34), ('benchmarking.InductorBenchmarker.benchmark_gpu', 34), ('coordesc_tuning_bench', 34), ('triton_bundler_save_kernel', 24), ('async_compile_cache_miss', 3), 
('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('triton_bundler_save_static_autotuner', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_512_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_expected_kernels_on_gpu_granularity0_float8_config_version_2 frames [('total', 1), ('ok', 1)] +stats [('calls_captured', 1), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 34), ('benchmarking.InductorBenchmarker.benchmark_gpu', 34), ('coordesc_tuning_bench', 34), ('triton_bundler_save_kernel', 24), ('async_compile_cache_miss', 6), ('async_compile_cache_hit', 3), ('pattern_matcher_count', 1), ('pattern_matcher_nodes', 1), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_512_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_expected_kernels_on_gpu_granularity1_float8_config_version_1 frames [('total', 1), ('ok', 1)] +stats [('calls_captured', 1), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 8), ('benchmarking.InductorBenchmarker.benchmark', 5), ('benchmarking.InductorBenchmarker.benchmark_gpu', 5), ('coordesc_tuning_bench', 5), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('async_compile_cache_miss', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_hit', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_512_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_expected_kernels_on_gpu_granularity1_float8_config_version_2 frames [('total', 1), ('ok', 1)] +stats [('calls_captured', 1), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 8), ('benchmarking.InductorBenchmarker.benchmark', 6), ('benchmarking.InductorBenchmarker.benchmark_gpu', 6), ('coordesc_tuning_bench', 6), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('async_compile_cache_miss', 2), ('fxgraph_cache_miss', 1), ('extern_calls', 1), ('async_compile_cache_hit', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_512_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_basic_granularity0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_basic_granularity1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_edge_cases PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_functional_correctness_granularity0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_functional_correctness_granularity1 PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_per_row PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_float8_tensor_slicing_per_tensor PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_False_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 79), ('benchmarking.InductorBenchmarker.benchmark_gpu', 79), ('coordesc_tuning_bench', 79), ('triton_bundler_save_kernel', 48), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 6), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_256_128', 1), ('aten._scaled_mm.default_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 64), ('benchmarking.InductorBenchmarker.benchmark', 61), ('benchmarking.InductorBenchmarker.benchmark_gpu', 61), ('coordesc_tuning_bench', 57), ('async_compile_cache_miss', 12), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('async_compile_cache_hit', 6), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_4096_64_256', 1), ('aten._scaled_mm.default_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_True_granularity1_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 32), ('benchmarking.InductorBenchmarker.benchmark', 24), ('benchmarking.InductorBenchmarker.benchmark_gpu', 24), ('coordesc_tuning_bench', 21), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_256_128', 1), ('aten._scaled_mm.default_128_128_256', 1)] +PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_dynamic_compile_True_granularity1_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 32), ('benchmarking.InductorBenchmarker.benchmark', 18), ('benchmarking.InductorBenchmarker.benchmark_gpu', 18), ('coordesc_tuning_bench', 15), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_4096_64_256', 1), ('aten._scaled_mm.default_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_False_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 24), ('benchmarking.InductorBenchmarker.benchmark_gpu', 24), ('coordesc_tuning_bench', 24), ('triton_bundler_save_kernel', 16), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_256_128', 1), ('aten._scaled_mm.default_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 32), ('benchmarking.InductorBenchmarker.benchmark', 19), ('benchmarking.InductorBenchmarker.benchmark_gpu', 19), ('coordesc_tuning_bench', 15), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_4096_64_256', 1), ('aten._scaled_mm.default_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_True_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_static_compile_True_granularity1_sizes1 PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_False_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 56), ('benchmarking.InductorBenchmarker.benchmark', 45), ('benchmarking.InductorBenchmarker.benchmark_gpu', 45), ('coordesc_tuning_bench', 39), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_128_256_128', 1), ('aten.mm_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 48), ('benchmarking.InductorBenchmarker.benchmark', 39), ('benchmarking.InductorBenchmarker.benchmark_gpu', 39), ('coordesc_tuning_bench', 34), ('pattern_matcher_nodes', 5), ('pattern_matcher_count', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('removed_pointless_view_pair', 1), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_4096_64_256', 1), ('aten.mm_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_True_granularity1_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 57), ('benchmarking.InductorBenchmarker.benchmark_gpu', 57), ('triton_bundler_save_kernel', 56), ('coordesc_tuning_bench', 51), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_128_256_128', 1), ('aten.mm_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_bfloat16_mode_weight-only_compile_True_granularity1_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 48), ('benchmarking.InductorBenchmarker.benchmark', 32), ('benchmarking.InductorBenchmarker.benchmark_gpu', 32), ('coordesc_tuning_bench', 27), 
('pattern_matcher_nodes', 5), ('pattern_matcher_count', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('removed_pointless_view_pair', 1), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_4096_64_256', 1), ('aten.mm_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_False_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 69), ('benchmarking.InductorBenchmarker.benchmark_gpu', 69), ('coordesc_tuning_bench', 69), ('triton_bundler_save_kernel', 48), ('async_compile_cache_miss', 12), ('async_compile_cache_hit', 6), ('pattern_matcher_count', 2), ('pattern_matcher_nodes', 2), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_256_128', 1), ('aten._scaled_mm.default_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 64), ('benchmarking.InductorBenchmarker.benchmark', 64), ('benchmarking.InductorBenchmarker.benchmark_gpu', 64), ('coordesc_tuning_bench', 60), ('async_compile_cache_miss', 12), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('async_compile_cache_hit', 6), ('extern_calls', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_4096_64_256', 1), ('aten._scaled_mm.default_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_True_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_dynamic_compile_True_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_False_granularity1_sizes0 PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 18), ('benchmarking.InductorBenchmarker.benchmark_gpu', 18), ('coordesc_tuning_bench', 18), ('triton_bundler_save_kernel', 16), ('pattern_matcher_count', 4), ('pattern_matcher_nodes', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_128_256_128', 1), ('aten._scaled_mm.default_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 32), ('benchmarking.InductorBenchmarker.benchmark', 25), ('benchmarking.InductorBenchmarker.benchmark_gpu', 25), ('coordesc_tuning_bench', 21), ('pattern_matcher_count', 6), ('pattern_matcher_nodes', 6), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten._scaled_mm.default_4096_64_256', 1), ('aten._scaled_mm.default_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_True_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_static_compile_True_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_False_granularity0_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_False_granularity0_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_False_granularity1_sizes0 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_False_granularity1_sizes1 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_True_granularity0_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 20), ('benchmarking.InductorBenchmarker.benchmark_gpu', 20), ('coordesc_tuning_bench', 20), ('triton_bundler_save_kernel', 16), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info 
[('aten.mm_128_256_128', 1), ('aten.mm_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_True_granularity0_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 48), ('benchmarking.InductorBenchmarker.benchmark', 41), ('benchmarking.InductorBenchmarker.benchmark_gpu', 41), ('coordesc_tuning_bench', 36), ('pattern_matcher_nodes', 5), ('pattern_matcher_count', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('removed_pointless_view_pair', 1), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_4096_64_256', 1), ('aten.mm_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_True_granularity1_sizes0 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('benchmarking.InductorBenchmarker.benchmark', 26), ('benchmarking.InductorBenchmarker.benchmark_gpu', 26), ('coordesc_tuning_bench', 26), ('triton_bundler_save_kernel', 16), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_128_256_128', 1), ('aten.mm_128_128_256', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_linear_variants_float32_mode_weight-only_compile_True_granularity1_sizes1 stats [('calls_captured', 2), ('unique_graphs', 1)] +aot_autograd [('total', 1), ('autograd_cache_miss', 1), ('autograd_cache_saved', 1), ('ok', 1)] +inductor [('triton_bundler_save_kernel', 48), ('benchmarking.InductorBenchmarker.benchmark', 39), ('benchmarking.InductorBenchmarker.benchmark_gpu', 39), ('coordesc_tuning_bench', 34), ('pattern_matcher_nodes', 5), ('pattern_matcher_count', 4), ('async_compile_cache_miss', 4), ('extern_calls', 2), ('async_compile_cache_hit', 2), ('removed_pointless_view_pair', 1), ('fxgraph_cache_miss', 1), ('triton_bundler_save_static_autotuner', 1)] +graph_break [] +aten_mm_info [('aten.mm_4096_64_256', 1), ('aten.mm_4096_256_64', 1)] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_fp8_weight_dimension_warning ToyLinearModel( + (linear1): Linear(in_features=10, out_features=25, bias=False, weight=Tensor: ) + (linear2): Linear(in_features=25, out_features=10, bias=False, weight=Tensor: ) +) +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_invalid_granularity PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mismatched_granularity PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape0_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape0_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape1_bias_False PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape1_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape2_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape2_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape3_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape3_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape4_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_1024_out_features_512_leading_shape4_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape0_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape0_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape1_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape1_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape2_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape2_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape3_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape3_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape4_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_256_out_features_768_leading_shape4_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape0_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape0_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape1_bias_False PASSED 
+test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape1_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape2_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape2_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape3_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape3_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape4_bias_False PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_mm_float8dq_per_row_in_features_512_out_features_1024_leading_shape4_bias_True PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_per_row_with_float32 PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_preprocess_scale_3d_reshape PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_quantize_dequantize_fp8_inductor_float8_e4m3fn_bfloat16 frames [('total', 2), ('ok', 2)] +stats [('calls_captured', 2), ('unique_graphs', 2)] +aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] +inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2)] +graph_break [] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_quantize_dequantize_fp8_inductor_float8_e4m3fn_float32 frames [('total', 2), ('ok', 2)] +stats [('calls_captured', 2), ('unique_graphs', 2)] +aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] +inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2)] +graph_break [] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_quantize_dequantize_fp8_inductor_float8_e5m2_bfloat16 frames [('total', 2), ('ok', 2)] +stats [('calls_captured', 2), ('unique_graphs', 2)] +aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] +inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2)] +graph_break [] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_quantize_dequantize_fp8_inductor_float8_e5m2_float32 frames [('total', 2), ('ok', 2)] +stats [('calls_captured', 2), ('unique_graphs', 2)] +aot_autograd [('total', 2), ('autograd_cache_miss', 2), ('autograd_cache_saved', 2), ('ok', 2)] +inductor [('extern_calls', 4), ('fxgraph_cache_miss', 2)] +graph_break [] +PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_serialization_mode_dynamic PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_serialization_mode_static PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_serialization_mode_weight-only PASSED +test/dtypes/test_affine_quantized_float.py::TestAffineQuantizedFloat8Compile::test_unsupported_granularity PASSED 
+test/dtypes/test_affine_quantized_tensor_parallel.py::TestInt8woAffineQuantizedTensorParallel::test_tp_bfloat16 I1204 14:49:48.793000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 3994849 +I1204 14:49:48.795000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 3994853 +I1204 14:49:48.797000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 3994857 +I1204 14:49:48.799000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 3994860 +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +NCCL version 2.27.5+cuda12.9 +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +PASSED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestInt8woAffineQuantizedTensorParallel::test_tp_float16 I1204 14:50:02.048000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4001302 +I1204 14:50:02.050000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4001308 +I1204 14:50:02.058000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4001314 +I1204 14:50:02.060000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4001329 +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. 
+/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +NCCL version 2.27.5+cuda12.9 +PASSED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestInt8woAffineQuantizedTensorParallel::test_tp_float32 I1204 14:50:18.788000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4019773 +I1204 14:50:18.790000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4019776 +I1204 14:50:18.792000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4019780 +I1204 14:50:18.795000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4019786 +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. 
+/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1352: UserWarning: Config Deprecation: version 1 of Int8WeightOnlyConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +NCCL version 2.27.5+cuda12.9 +PASSED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestInt4woAffineQuantizedTensorParallel::test_tp_bfloat16 SKIPPED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestGemliteLayoutTensorParallel::test_tp_gemlite_float16 SKIPPED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestInt8dqAffineQuantizedTensorParallel::test_tp_bfloat16 I1204 14:50:35.226000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4034897 +I1204 14:50:35.228000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4034904 +I1204 14:50:35.230000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4034908 +I1204 14:50:35.232000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4034913 +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. 
+/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1566: UserWarning: Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1566: UserWarning: Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1566: UserWarning: Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quant_api.py:1566: UserWarning: Config Deprecation: version 1 of Int8DynamicActivationInt8WeightConfig is deprecated and will no longer be supported in a future release, please use version 2, see https://github.com/pytorch/ao/issues/2752 for more details + warnings.warn( +NCCL version 2.27.5+cuda12.9 +PASSED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8woAffineQuantizedTensorParallel::test_tp_bfloat16 I1204 14:50:53.577000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4050466 +I1204 14:50:53.579000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4050467 +I1204 14:50:53.581000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4050469 +I1204 14:50:53.583000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4050471 +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. +TMA benchmarks will be running without grid constant TMA descriptor. 
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception:
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last):
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     getattr(self, test_name)()
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     fn()
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     method(*args, **kwargs)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     test(self, **param_kwargs)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     raise e
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     func(self, *args, **kwargs)  # type: ignore[misc]
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 210, in test_tp
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return self._test_tp(dtype)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     up_dist = self.colwise_shard(up_quant, mesh)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)])
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return _FromTorchTensor.apply(  # pyre-ignore[16]: autograd func
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return super().apply(*args, **kwargs)  # type: ignore[misc]
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     input.view_as(input),
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return func(*args, **kwargs)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     return _func(f, types, args, kwargs)
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]     scale_shape.append(qdata.shape[i] // self.block_size[i])
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]                        ~~~~~~~~~~~^^^
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir:
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8woAffineQuantizedTensorParallel.test_tp_bfloat16
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+E1204 14:50:58.642000 4050467 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 1 with exit code: 10
+[the identical traceback was raised on ranks 3, 2, and 0 (pids 4050471, 4050469, 4050466), each exiting with exit code: 10]
+Process 1 terminated with exit code 10, terminating remaining processes.
+FAILED
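Aside on the failure above (not part of the log): the traceback suggests float8_tensor.py line 866 indexes self.block_size with every dimension index of qdata, so the IndexError fires when block_size has fewer entries than the loop expects. A minimal sketch of that failure mode, with illustrative shapes and a hypothetical 1-D block_size — stand-ins, not the actual Float8Tensor internals:

import torch

# Illustrative stand-ins: a 2-D quantized shard paired with a block_size
# tuple that has only one entry.
qdata = torch.zeros(256, 512)
block_size = (512,)

scale_shape = []
for i in range(qdata.ndim):
    # At i == 1 this indexes past the end of the 1-element block_size tuple,
    # raising "IndexError: tuple index out of range" as in the log above.
    scale_shape.append(qdata.shape[i] // block_size[i])

The same traceback repeats below for the float16 and float32 variants of the test.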
+test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8woAffineQuantizedTensorParallel::test_tp_float16 I1204 14:50:59.619000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4052343
+I1204 14:50:59.621000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4052344
+I1204 14:50:59.625000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4052346
+I1204 14:50:59.627000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4052348
+TMA benchmarks will be running without grid constant TMA descriptor.
+[ranks 0, 3, and 1 (pids 4052343, 4052348, 4052344) raised the identical IndexError traceback shown above, here for test_tp_float16 (repro: python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8woAffineQuantizedTensorParallel.test_tp_float16); each exited with exit code: 10]
+Process 0 terminated with exit code 10, terminating remaining processes.
+FAILED
+test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8woAffineQuantizedTensorParallel::test_tp_float32 I1204 14:51:05.970000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4055063
+I1204 14:51:05.972000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4055068
+I1204 14:51:05.974000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4055071
+I1204 14:51:05.976000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4055074
+TMA benchmarks will be running without grid constant TMA descriptor.
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception:
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last):
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]     getattr(self, test_name)()
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]     fn()
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]     method(*args, **kwargs)
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]     test(self, **param_kwargs)
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]     raise e
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File
"/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] func(self, *args, **kwargs) # type: ignore[misc] +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 210, in test_tp +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return self._test_tp(dtype) +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] up_dist = self.colwise_shard(up_quant, mesh) +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)]) +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return _FromTorchTensor.apply( # pyre-ignore[16]: autograd func +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return super().apply(*args, **kwargs) # type: ignore[misc] +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] input.view_as(input), +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__ +E1204 14:51:12.993000 4055068 
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return func(*args, **kwargs)
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs)
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] return _func(f, types, args, kwargs)
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] scale_shape.append(qdata.shape[i] // self.block_size[i])
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] ~~~~~~~~~~~^^^
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir:
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8woAffineQuantizedTensorParallel.test_tp_float32
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+E1204 14:51:12.993000 4055068 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 1 with exit code: 10
+[... identical IndexError tracebacks from processes 2, 3, and 0 (pids 4055071, 4055074, 4055063) omitted ...]
+Process 1 terminated with exit code 10, terminating remaining processes.
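Every rank fails on the same line of float8_tensor.py: scale_shape.append(qdata.shape[i] // self.block_size[i]), reached when DTensor.from_local dispatches view_as on the quantized local shard. Below is a minimal sketch of that failure mode, assuming the loop index runs over the length of block_size while qdata has lower rank; the helper name infer_scale_shape and the shapes are illustrative assumptions, not the torchao implementation or the test's real sizes.

    # Hypothetical sketch (not the torchao code) of the logged IndexError:
    # if block_size has more entries than qdata has dimensions, indexing
    # qdata.shape[i] walks off the end of the shape tuple.
    import torch

    def infer_scale_shape(qdata: torch.Tensor, block_size: tuple[int, ...]) -> list[int]:
        scale_shape = []
        for i in range(len(block_size)):
            # Raises IndexError: tuple index out of range once i >= qdata.ndim
            scale_shape.append(qdata.shape[i] // block_size[i])
        return scale_shape

    qdata = torch.zeros(256, 128, dtype=torch.float8_e4m3fn)
    print(infer_scale_shape(qdata, (1, 128)))   # ranks match: [256, 1]

    try:
        infer_scale_shape(qdata[0], (1, 128))   # 1-D shard vs. 2-D block_size
    except IndexError as e:
        print(f"IndexError: {e}")               # tuple index out of range

The same mismatch would explain why all four ranks and every dtype variant fail identically: the block_size metadata kept by the quantized tensor does not shrink with the tensor rank seen during view_as dispatch.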
+FAILED
+test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8dqTensorAffineQuantizedTensorParallel::test_tp_bfloat16 I1204 14:51:13.816000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4061064
+I1204 14:51:13.818000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4061080
+I1204 14:51:13.820000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4061085
+I1204 14:51:13.821000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4061089
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception:
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last):
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] getattr(self, test_name)()
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] fn()
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] method(*args, **kwargs)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] test(self, **param_kwargs)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] raise e
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] func(self, *args, **kwargs) # type: ignore[misc]
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 223, in test_tp
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return self._test_tp(dtype)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] up_dist = self.colwise_shard(up_quant, mesh)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)])
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return _FromTorchTensor.apply( # pyre-ignore[16]: autograd func
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return super().apply(*args, **kwargs) # type: ignore[misc]
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] input.view_as(input),
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return func(*args, **kwargs)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] return _func(f, types, args, kwargs)
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] scale_shape.append(qdata.shape[i] // self.block_size[i])
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] ~~~~~~~~~~~^^^
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir:
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8dqTensorAffineQuantizedTensorParallel.test_tp_bfloat16
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+E1204 14:51:17.737000 4061080 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 1 with exit code: 10
+[... identical IndexError tracebacks from processes 3 and 2 (pids 4061089, 4061085) omitted ...]
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception:
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last):
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] getattr(self, test_name)()
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] fn()
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] method(*args, **kwargs)
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] test(self, **param_kwargs)
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper
+E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] raise e
"/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] func(self, *args, **kwargs) # type: ignore[misc] +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 223, in test_tp +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] return self._test_tp(dtype) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] up_dist = self.colwise_shard(up_quant, mesh) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)]) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] return _FromTorchTensor.apply( # pyre-ignore[16]: autograd func +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] return super().apply(*args, **kwargs) # type: ignore[misc] +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] input.view_as(input), +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__ +E1204 14:51:18.235000 4061064 
site-packages/torch/testing/_internal/common_distributed.py:935] return func(*args, **kwargs) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] return _func(f, types, args, kwargs) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] scale_shape.append(qdata.shape[i] // self.block_size[i]) +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] ~~~~~~~~~~~^^^ +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir: +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8dqTensorAffineQuantizedTensorParallel.test_tp_bfloat16 +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +E1204 14:51:18.235000 4061064 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 0 with exit code: 10 +Process 1 terminated with exit code 10, terminating remaining processes. +FAILED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8dqTensorAffineQuantizedTensorParallel::test_tp_float16 I1204 14:51:18.662000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4064730 +I1204 14:51:18.664000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4064731 +I1204 14:51:18.666000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4064733 +I1204 14:51:18.668000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4064735 +TMA benchmarks will be running without grid constant TMA descriptor. 
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+[... identical IndexError tracebacks from processes 0-3 (pids 4064730, 4064731, 4064735, 4064733) omitted; each exits with code 10 ...]
+Process 0 terminated with exit code 10, terminating remaining processes.
+FAILED
+test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8dqTensorAffineQuantizedTensorParallel::test_tp_float32 I1204 14:51:26.309000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4070526
+I1204 14:51:26.311000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4070527
+I1204 14:51:26.313000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4070529
+I1204 14:51:26.315000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4070531
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+[... identical IndexError tracebacks from processes 0 (pid 4070526) and 3 (pid 4070531) omitted; each exits with code 10 ...]
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception: +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last): +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] getattr(self, test_name)() +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] fn() +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] method(*args, **kwargs) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] test(self, **param_kwargs) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] raise e +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] func(self, *args, **kwargs) # type: ignore[misc] +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 223, in test_tp +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return self._test_tp(dtype) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] up_dist = self.colwise_shard(up_quant, mesh) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File 
"/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)]) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return _FromTorchTensor.apply( # pyre-ignore[16]: autograd func +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return super().apply(*args, **kwargs) # type: ignore[misc] +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] input.view_as(input), +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return func(*args, **kwargs) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] return _func(f, types, args, kwargs) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] File 
"/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] scale_shape.append(qdata.shape[i] // self.block_size[i]) +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] ~~~~~~~~~~~^^^ +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir: +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8dqTensorAffineQuantizedTensorParallel.test_tp_float32 +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +E1204 14:51:30.839000 4070529 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 2 with exit code: 10 +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception: +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] Traceback (most recent call last): +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 925, in run_test +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] getattr(self, test_name)() +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_distributed.py", line 772, in wrapper +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] fn() +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3329, in wrapper +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] method(*args, **kwargs) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 586, in instantiated_test +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] test(self, **param_kwargs) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 533, in wrapper +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] raise e +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File 
"/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/testing/_internal/distributed/_tensor/common_dtensor.py", line 530, in wrapper +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] func(self, *args, **kwargs) # type: ignore[misc] +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 223, in test_tp +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] return self._test_tp(dtype) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 122, in _test_tp +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] up_dist = self.colwise_shard(up_quant, mesh) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/ao/test/dtypes/test_affine_quantized_tensor_parallel.py", line 61, in colwise_shard +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] dtensor = DTensor.from_local(local_shard, mesh, [Shard(0)]) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 441, in from_local +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] return _FromTorchTensor.apply( # pyre-ignore[16]: autograd func +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/autograd/function.py", line 583, in apply +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] return super().apply(*args, **kwargs) # type: ignore[misc] +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torch/distributed/tensor/_api.py", line 192, in forward +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] input.view_as(input), +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 636, in _dispatch__torch_function__ +E1204 14:51:30.883000 4070527 
site-packages/torch/testing/_internal/common_distributed.py:935] return func(*args, **kwargs) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 652, in _dispatch__torch_dispatch__ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] return cls._ATEN_OP_TABLE[cls][func](func, types, args, kwargs) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/utils.py", line 417, in wrapper +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] return _func(f, types, args, kwargs) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] scale_shape.append(qdata.shape[i] // self.block_size[i]) +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] ~~~~~~~~~~~^^^ +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir: +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8dqTensorAffineQuantizedTensorParallel.test_tp_float32 +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0 +E1204 14:51:30.883000 4070527 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 1 with exit code: 10 +Process 0 terminated with exit code 10, terminating remaining processes. +FAILED +test/dtypes/test_affine_quantized_tensor_parallel.py::TestFloat8dqRowAffineQuantizedTensorParallel::test_tp_bfloat16 I1204 14:51:31.250000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 0 with pid 4074998 +I1204 14:51:31.255000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 1 with pid 4074999 +I1204 14:51:31.258000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 2 with pid 4075002 +I1204 14:51:31.259000 3932578 site-packages/torch/testing/_internal/common_distributed.py:849] Started process 3 with pid 4075003 +TMA benchmarks will be running without grid constant TMA descriptor. 
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+TMA benchmarks will be running without grid constant TMA descriptor.
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935] Caught exception:
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]   File "/home/danvm/.conda/envs/release/lib/python3.12/site-packages/torchao/quantization/quantize_/workflows/float8/float8_tensor.py", line 866, in _
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]     scale_shape.append(qdata.shape[i] // self.block_size[i])
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]                        ~~~~~~~~~~~^^^
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935] IndexError: tuple index out of range
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935] To execute this test, run the following from the base repo dir:
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]     python test/dtypes/test_affine_quantized_tensor_parallel.py TestFloat8dqRowAffineQuantizedTensorParallel.test_tp_bfloat16
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935]
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935] This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
+E1204 14:51:37.349000 4075003 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 3 with exit code: 10
+E1204 14:51:37.436000 4074998 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 0 with exit code: 10
+E1204 14:51:37.475000 4075002 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 2 with exit code: 10
+E1204 14:51:37.530000 4074999 site-packages/torch/testing/_internal/common_distributed.py:935] exiting process 1 with exit code: 10
+Process 0 terminated with exit code 10, terminating remaining processes.
+FAILED +test/dtypes/test_bitpacking.py::test_CPU[0-1] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-2] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-3] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-4] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-5] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-6] PASSED +test/dtypes/test_bitpacking.py::test_CPU[0-7] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-1] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-2] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-3] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-4] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-5] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-6] PASSED +test/dtypes/test_bitpacking.py::test_CPU[-1-7] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-1] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-2] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-3] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-4] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-5] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-6] PASSED +test/dtypes/test_bitpacking.py::test_CPU[1-7] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-1] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-2] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-3] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-4] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-5] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-6] PASSED +test/dtypes/test_bitpacking.py::test_GPU[0-7] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-1] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-2] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-3] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-4] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-5] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-6] PASSED +test/dtypes/test_bitpacking.py::test_GPU[-1-7] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-1] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-2] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-3] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-4] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-5] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-6] PASSED +test/dtypes/test_bitpacking.py::test_GPU[1-7] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-1] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-2] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-3] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-4] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-5] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-6] PASSED +test/dtypes/test_bitpacking.py::test_compile[0-7] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-1] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-2] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-3] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-4] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-5] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-6] PASSED +test/dtypes/test_bitpacking.py::test_compile[-1-7] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-1] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-2] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-3] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-4] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-5] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-6] PASSED +test/dtypes/test_bitpacking.py::test_compile[1-7] PASSED 
diff --git a/setup.py b/setup.py
index 9d2d7bce1c..9136371a4d 100644
--- a/setup.py
+++ b/setup.py
@@ -702,6 +702,7 @@ def get_extensions():
         mxfp8_sources = [
             os.path.join(mxfp8_extension_dir, "mxfp8_extension.cpp"),
             os.path.join(mxfp8_extension_dir, "mxfp8_cuda.cu"),
+            os.path.join(mxfp8_extension_dir, "mx_block_rearrange_2d_K_groups.cu"),
         ]

         # Only add the extension if the source files exist AND we are building for sm100
diff --git a/test/prototype/moe_training/test_kernels.py b/test/prototype/moe_training/test_kernels.py
index ecd4cefe6a..d59eb62906 100644
--- a/test/prototype/moe_training/test_kernels.py
+++ b/test/prototype/moe_training/test_kernels.py
@@ -354,3 +354,62 @@ def test_cuda_mx_dim1_3d_numerics(E, N, K, input_dtype, scaling_mode):
     # Check quantized values
     torch.testing.assert_close(y_d1, y_d1_ref, rtol=0, atol=0)
     assert y_d1.stride() == y_d1_ref.stride(), "quantized tensor strides do not match"
+
+
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="MXFP8 requires CUDA capability 10.0 or greater",
+)
+@pytest.mark.parametrize("m", [256, 512, 1024, 5120])
+@pytest.mark.parametrize("total_k", [512, 1024, 2048, 4096, 8192, 16384])
+@pytest.mark.parametrize("n_groups", [1, 4, 8, 16])
+def test_cuda_mx_block_rearrange_2d_K_groups(
+    m: int,
+    total_k: int,
+    n_groups: int,
+):
+    """
+    Test CUDA kernel for mx_block_rearrange_2d_K_groups against the Triton reference.
+    This kernel rearranges E8M0 scales to block-scaled swizzle format for cuBLAS Tmem.
+    """
+    from torchao.prototype import mxfp8_cuda
+
+    device = "cuda"
+    block_size = 32
+    input_data = torch.randn(m, total_k, device=device)
+
+    e8m0_scales, _ = to_mx(
+        input_data, elem_dtype=torch.float8_e4m3fn, block_size=block_size
+    )
+
+    # Generate group end offsets along total_K, then divide by block_size to get scale group end offsets
+    input_group_offsets = generate_jagged_offs(
+        n_groups, total_k, multiple_of=block_size, device=device
+    )
+    scale_group_offsets = input_group_offsets // block_size
+
+    # Triton reference implementation
+    triton_out_scales = triton_mx_block_rearrange_2d_K_groups(
+        e8m0_scales,
+        scale_group_offsets,
+    )
+
+    # CUDA kernel implementation
+    cuda_out_scales = mxfp8_cuda.mx_block_rearrange_2d_K_groups(
+        e8m0_scales.view(torch.uint8),
+        scale_group_offsets,
+    )
+
+    # Check that outputs match
+    assert torch.equal(triton_out_scales, cuda_out_scales.view(torch.float8_e8m0fnu)), (
+        "CUDA and Triton blocked scales not equal"
+    )
+
+    # Verify output shape
+    expected_rows = ((m + 127) // 128) * 128  # Padded to multiple of 128
+    expected_cols = (
+        e8m0_scales.size(1) + n_groups * 4
+    )  # Original cols + padding per group
+    assert cuda_out_scales.shape == (expected_rows, expected_cols), (
+        f"Output shape mismatch: expected {(expected_rows, expected_cols)}, got {cuda_out_scales.shape}"
+    )
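To make the shape arithmetic the test asserts concrete, here is a minimal Python sketch; the helper name and the example sizes are hypothetical, not part of the patch:

# Illustrative only: mirrors the shape math asserted in the test above.
def blocked_scale_shape(m: int, total_k: int, n_groups: int, block_size: int = 32):
    scale_cols = total_k // block_size       # one e8m0 scale per 32 input cols
    padded_rows = ((m + 127) // 128) * 128   # rows padded to a multiple of 128
    padded_cols = scale_cols + n_groups * 4  # upper-bound padding of 4 cols per group
    return padded_rows, padded_cols

# e.g. m=256, total_k=2048, n_groups=4 -> scales are 256x64, output is 256x80
assert blocked_scale_shape(256, 2048, 4) == (256, 80)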
+ """ + from torchao.prototype import mxfp8_cuda + + device = "cuda" + block_size = 32 + input_data = torch.randn(m, total_k, device=device) + + e8m0_scales, _ = to_mx( + input_data, elem_dtype=torch.float8_e4m3fn, block_size=block_size + ) + + # Generate group end offsets along total_K, then divide by block_size to get scale group end offsets + input_group_offsets = generate_jagged_offs( + n_groups, total_k, multiple_of=block_size, device=device + ) + scale_group_offsets = input_group_offsets // block_size + + # Triton reference implementation + triton_out_scales = triton_mx_block_rearrange_2d_K_groups( + e8m0_scales, + scale_group_offsets, + ) + + # CUDA kernel implementation + cuda_out_scales = mxfp8_cuda.mx_block_rearrange_2d_K_groups( + e8m0_scales.view(torch.uint8), + scale_group_offsets, + ) + + # Check that outputs match + assert torch.equal(triton_out_scales, cuda_out_scales.view(torch.float8_e8m0fnu)), ( + "CUDA and Triton blocked scales not equal" + ) + + # Verify output shape + expected_rows = ((m + 127) // 128) * 128 # Padded to multiple of 128 + expected_cols = ( + e8m0_scales.size(1) + n_groups * 4 + ) # Original cols + padding per group + assert cuda_out_scales.shape == (expected_rows, expected_cols), ( + f"Output shape mismatch: expected {(expected_rows, expected_cols)}, got {cuda_out_scales.shape}" + ) diff --git a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu new file mode 100644 index 0000000000..c97baec10d --- /dev/null +++ b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu @@ -0,0 +1,199 @@ +#include +#include +#include +#include + +#define BLOCK_ROWS 128 +#define BLOCK_COLS 4 + +// Helper function to compute ceil division +__device__ __forceinline__ int ceil_div(int a, int b) { + return (a + b - 1) / b; +} + +// Helper function to compute the start index of a group after padding +__device__ __forceinline__ int compute_output_group_start_col( + int group_id, + const int32_t* input_group_end_offsets, + int num_groups, + int padding_size +) { + int start_idx = 0; + + // Compute prefix sum of padded group sizes + for (int i = 0; i < group_id; i++) { + int prev_offset = (i > 0) ? input_group_end_offsets[i - 1] : 0; + int curr_offset = input_group_end_offsets[i]; + int group_size = curr_offset - prev_offset; + int padded_size = ceil_div(group_size, padding_size) * padding_size; + start_idx += padded_size; + } + + return start_idx; +} + +// Compute destination index for swizzled block layout +// For a 128x4 block: r_div_32 = row / 32, r_mod_32 = row % 32 +// Swizzle: dest = r_mod_32 * 16 + r_div_32 * 4 + col +__device__ __forceinline__ int compute_swizzled_index(int row, int col) { + int r_div_32 = row / 32; + int r_mod_32 = row % 32; + return r_mod_32 * 16 + r_div_32 * 4 + col; +} + +__global__ void mx_block_rearrange_2d_K_groups_naive_kernel( + const uint8_t* __restrict__ scales_ptr, + int scales_stride_dim0, + int scale_rows, + int scale_cols, + int padded_rows, + const int32_t* __restrict__ input_group_end_offsets, + uint8_t* __restrict__ output_scales_ptr, + int output_stride_per_block, + int num_groups +) { + const int group_id = blockIdx.x; + const int block_row_id = blockIdx.y; + const int tid = threadIdx.x; // 128 threads, each handles one row + + // Shared memory for one 128x4 block + __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS]; + + // Get start/end cols of this input group + int input_group_start_col = (group_id > 0) ? 
+  int input_group_start_col = (group_id > 0) ? input_group_end_offsets[group_id - 1] : 0;
+  int input_group_end_col = input_group_end_offsets[group_id];
+  int num_cols_in_group = input_group_end_col - input_group_start_col;
+
+  // Get output group start column
+  int output_group_start_col = compute_output_group_start_col(
+      group_id,
+      input_group_end_offsets,
+      num_groups,
+      4);  // scaling factor column padding size
+
+  // Compute base offset for this group in output
+  int out_group_base_offset = output_group_start_col * padded_rows;
+
+  // Compute stride per row of blocks in this group
+  int num_col_blocks_in_group = ceil_div(num_cols_in_group, BLOCK_COLS);
+  int stride_per_row_of_blocks_in_group = num_col_blocks_in_group * output_stride_per_block;
+
+  // Each thread handles one row
+  int input_row = block_row_id * BLOCK_ROWS + tid;
+
+  // Loop through column blocks in this group
+  int curr_input_start_col = input_group_start_col;
+  int curr_out_col_block = 0;
+
+  while (curr_input_start_col < input_group_end_col) {
+    // Calculate how many columns to load for this block
+    int cols_remaining = input_group_end_col - curr_input_start_col;
+    int cols_to_load = min(BLOCK_COLS, cols_remaining);
+
+    // Load data for this row using vectorized loads when possible
+    uint32_t row_data = 0;
+
+    if (input_row < scale_rows && curr_input_start_col < input_group_end_col) {
+      int input_offset = input_row * scales_stride_dim0 + curr_input_start_col;
+      const uint8_t* input_ptr = scales_ptr + input_offset;
+
+      // Check alignment and available columns within this group
+      uintptr_t ptr_addr = reinterpret_cast<uintptr_t>(input_ptr);
+
+      if (cols_to_load >= 4 && ptr_addr % 4 == 0 && curr_input_start_col + 4 <= input_group_end_col) {
+        // 4-byte aligned and have 4 columns within group: use uint32_t load
+        row_data = *reinterpret_cast<const uint32_t*>(input_ptr);
+      } else {
+        // Byte-by-byte loads for unaligned or partial blocks
+        uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
+        for (int i = 0; i < cols_to_load && (curr_input_start_col + i) < input_group_end_col; i++) {
+          row_bytes[i] = input_ptr[i];
+        }
+      }
+    }
+
+    // Write to swizzled positions in shared memory
+    uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
+
+    #pragma unroll
+    for (int col = 0; col < BLOCK_COLS; col++) {
+      int swizzled_idx = compute_swizzled_index(tid, col);
+      smem_block[swizzled_idx] = row_bytes[col];
+    }
+
+    __syncthreads();
+
+    // Write from shared memory to global memory
+    // Calculate the output offset for this specific block
+    int offset_in_group = block_row_id * stride_per_row_of_blocks_in_group +
+                          curr_out_col_block * output_stride_per_block;
+    int final_offset = out_group_base_offset + offset_in_group;
+
+    // Each thread writes 4 bytes (one row of the 128x4 block)
+    uint8_t* output_ptr = output_scales_ptr + final_offset + tid * BLOCK_COLS;
+
+    // Check output alignment for vectorized write
+    uintptr_t out_ptr_addr = reinterpret_cast<uintptr_t>(output_ptr);
+    if (out_ptr_addr % 4 == 0) {
+      // Aligned: use uint32_t store
+      *reinterpret_cast<uint32_t*>(output_ptr) =
+          *reinterpret_cast<const uint32_t*>(&smem_block[tid * BLOCK_COLS]);
+    } else {
+      // Unaligned: byte by byte
+      const uint8_t* smem_ptr = &smem_block[tid * BLOCK_COLS];
+      #pragma unroll
+      for (int i = 0; i < BLOCK_COLS; i++) {
+        output_ptr[i] = smem_ptr[i];
+      }
+    }
+
+    __syncthreads();
+
+    // Advance to next column block
+    curr_input_start_col += BLOCK_COLS;
+    curr_out_col_block += 1;
+  }
+}
+
+// Host function to launch the kernel
+namespace mxfp8 {
+
+void launch_mx_block_rearrange_2d_K_groups(
+    const uint8_t* scales_ptr,
+    int scales_stride_dim0,
+    int scale_rows,
+    int scale_cols,
+    int padded_rows,
+    const int32_t* input_group_end_offsets,
+    uint8_t* output_scales_ptr,
+    int num_groups,
+    cudaStream_t stream
+) {
+  int num_row_blocks = (scale_rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
+
+  // Grid parallelizes over (num_groups, num_row_blocks)
+  // Each thread block loops through column blocks within its group
+  dim3 grid(num_groups, num_row_blocks);
+  dim3 block(128);  // 128 threads, each handling one row
+
+  int output_stride_per_block = BLOCK_ROWS * BLOCK_COLS;
+
+  mx_block_rearrange_2d_K_groups_naive_kernel<<<grid, block, 0, stream>>>(
+      scales_ptr,
+      scales_stride_dim0,
+      scale_rows,
+      scale_cols,
+      padded_rows,
+      input_group_end_offsets,
+      output_scales_ptr,
+      output_stride_per_block,
+      num_groups
+  );
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("CUDA Error: %s\n", cudaGetErrorString(err));
+  }
+}
+
+} // namespace mxfp8
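The swizzle formula above is easiest to verify in scalar form. A minimal Python sketch of the same 128x4 block permutation (illustrative only, not part of the patch):

# Illustrative reference for compute_swizzled_index: within a 128x4 block,
# rows sharing the same (row % 32) land 16 bytes apart, grouped by row // 32.
def swizzle_block(block):  # block: 128 rows of 4 scale bytes each
    out = [0] * (128 * 4)
    for row in range(128):
        for col in range(4):
            dest = (row % 32) * 16 + (row // 32) * 4 + col
            out[dest] = block[row][col]
    return out  # a bijective permutation of the 512 bytes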
diff --git a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
index d445fcad4d..55db12c40e 100644
--- a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
+++ b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
@@ -25,6 +25,17 @@ void mxfp8_quantize_3d_cuda(const torch::Tensor &input,
                             const std::string &fp8_format,
                             const std::string &scaling_mode);

+void launch_mx_block_rearrange_2d_K_groups(
+    const uint8_t* scales_ptr,
+    int scales_stride_dim0,
+    int scale_rows,
+    int scale_cols,
+    int padded_rows,
+    const int32_t* input_group_end_offsets,
+    uint8_t* output_scales_ptr,
+    int num_groups,
+    cudaStream_t stream);
+
 // Helper for tensor validation
 void check_cuda_tensor(const torch::Tensor &t, const char *name) {
   TORCH_CHECK(t.is_cuda(), name, " must be a CUDA tensor");
@@ -177,6 +188,66 @@ mxfp8_quantize_3d(torch::Tensor input, int64_t scale_dim_n,
   return std::make_tuple(output_colwise, scales_colwise);
 }

+// Python wrapper for mx_block_rearrange_2d_K_groups
+torch::Tensor mx_block_rearrange_2d_K_groups(
+    torch::Tensor scales_tensor,
+    torch::Tensor input_group_end_offsets) {
+
+  // Validate inputs
+  check_cuda_tensor(scales_tensor, "scales_tensor");
+  check_cuda_tensor(input_group_end_offsets, "input_group_end_offsets");
+
+  TORCH_CHECK(scales_tensor.dim() == 2, "scales_tensor must be 2D");
+  TORCH_CHECK(scales_tensor.scalar_type() == torch::kUInt8 ||
+              scales_tensor.scalar_type() == torch::kFloat8_e8m0fnu,
+              "scales_tensor must be uint8 or e8m0");
+  TORCH_CHECK(input_group_end_offsets.scalar_type() == torch::kInt32,
+              "input_group_end_offsets must be int32");
+  TORCH_CHECK(input_group_end_offsets.dim() == 1,
+              "input_group_end_offsets must be 1D");
+
+  c10::cuda::CUDAGuard device_guard(scales_tensor.device());
+
+  const int rows = scales_tensor.size(0);
+  const int cols = scales_tensor.size(1);
+  const int num_groups = input_group_end_offsets.size(0);
+  TORCH_CHECK(num_groups <= 32, "num_groups must be <= 32");
+
+  // Calculate blocks needed
+  const int BLOCK_ROWS = 128;
+  const int BLOCK_COLS = 4;
+  const int num_row_blocks = (rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
+  const int padded_rows = num_row_blocks * BLOCK_ROWS;
+
+  // Padding per group is variable/data dependent, so pad each group by upper bound
+  const int padded_cols = cols + num_groups * BLOCK_COLS;
+
+  // Create output tensor
+  auto output = torch::zeros({padded_rows, padded_cols},
+                             torch::TensorOptions()
+                                 .dtype(scales_tensor.scalar_type())
+                                 .device(scales_tensor.device()));
+
+  // Get raw pointers
+  const uint8_t* scales_ptr = scales_tensor.data_ptr<uint8_t>();
+  const int32_t* offsets_ptr = input_group_end_offsets.data_ptr<int32_t>();
+  uint8_t* output_ptr = output.data_ptr<uint8_t>();
+
+  // Launch kernel
+  launch_mx_block_rearrange_2d_K_groups(
+      scales_ptr,
+      scales_tensor.stride(0),
+      rows,
+      cols,
+      padded_rows,
+      offsets_ptr,
+      output_ptr,
+      num_groups,
+      at::cuda::getCurrentCUDAStream());
+
+  return output;
+}
+
 } // namespace mxfp8

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
@@ -192,4 +263,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         py::arg("input"), py::arg("scale_dim_n") = 32,
         py::arg("fp8_format") = "e4m3", py::arg("scaling_mode") = "floor");
+
+  m.def("mx_block_rearrange_2d_K_groups",
+        &mxfp8::mx_block_rearrange_2d_K_groups,
+        "Rearrange E8M0 scales to block-scaled swizzle format for cuBLAS Tmem",
+        py::arg("scales_tensor"),
+        py::arg("input_group_end_offsets"));
 }
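For reference, the padded group-start computation that compute_output_group_start_col performs on device is a prefix sum of group sizes rounded up to the 4-column block. A small Python sketch of the same arithmetic (names and sizes are hypothetical):

# Illustrative Python equivalent of compute_output_group_start_col.
def output_group_starts(group_end_offsets, pad=4):
    starts, total, prev = [], 0, 0
    for end in group_end_offsets:
        starts.append(total)
        size = end - prev
        total += -(-size // pad) * pad  # ceil_div(size, pad) * pad
        prev = end
    return starts

# e.g. ends [5, 13, 16] -> sizes [5, 8, 3] -> padded [8, 8, 4] -> starts [0, 8, 16]
assert output_group_starts([5, 13, 16]) == [0, 8, 16]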
diff --git a/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
new file mode 100644
index 0000000000..9b779c655c
--- /dev/null
+++ b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
@@ -0,0 +1,217 @@
+"""
+Standalone test for mx_block_rearrange_2d_K_groups CUDA kernel.
+Uses torch.utils.cpp_extension.load for quick compilation and iteration.
+
+Usage:
+    python test_mx_block_rearrange_standalone.py
+"""
+
+import os
+import sys
+
+import torch
+from torch.utils.cpp_extension import load
+
+# Get the directory where this script is located
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Load the CUDA extension
+print("Compiling CUDA kernel...")
+mx_block_rearrange = load(
+    name="mx_block_rearrange_2d_K_groups",
+    sources=[
+        os.path.join(SCRIPT_DIR, "mxfp8_extension.cpp"),
+        os.path.join(SCRIPT_DIR, "mxfp8_cuda.cu"),
+        os.path.join(SCRIPT_DIR, "mx_block_rearrange_2d_K_groups.cu"),
+    ],
+    extra_cuda_cflags=[
+        "-O3",
+        "--use_fast_math",
+        "-std=c++17",
+        "-gencode=arch=compute_90,code=sm_90",
+    ],
+    extra_cflags=["-O3", "-std=c++17"],
+    verbose=True,
+)
+
+print("✓ Compilation successful!")
+
+
+def benchmark_kernel(kernel_fn, *args, warmup=10, iterations=100):
+    """Benchmark a kernel function and return average time in microseconds."""
+    # Warmup
+    for _ in range(warmup):
+        kernel_fn(*args)
+    torch.cuda.synchronize()
+
+    # Benchmark
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+    for _ in range(iterations):
+        kernel_fn(*args)
+    end_event.record()
+
+    torch.cuda.synchronize()
+    elapsed_ms = start_event.elapsed_time(end_event)
+    return (elapsed_ms / iterations) * 1000  # Convert to microseconds
+
+
+def test_kernel():
+    print("\n" + "=" * 80)
+    print("Testing mx_block_rearrange_2d_K_groups kernel")
+    print("=" * 80)
+
+    # Try importing the Triton reference implementation
+    try:
+        ao_root = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "..", "..", ".."))
+        sys.path.insert(0, ao_root)
+
+        from torchao.prototype.moe_training.kernels.mxfp8 import (
+            triton_mx_block_rearrange_2d_K_groups,
+        )
+        from torchao.prototype.moe_training.kernels.mxfp8.quant import (
+            triton_mx_block_rearrange_2d_K_groups_naive,
+        )
+        from torchao.prototype.moe_training.utils import generate_jagged_offs
+        from torchao.prototype.mx_formats.mx_tensor import to_mx
+
+        has_triton = True
+        print("✓ Triton reference implementation available")
+    except ImportError as e:
+        print(f"⚠ Triton reference not available: {e}")
+        has_triton = False
+
+    # Test parameters - use larger size for meaningful benchmarks
+    device = "cuda"
+    m, total_k = 5120, 16384
+    n_groups = 8
+    block_size = 32
+
+    print("\nTest configuration:")
+    print(f"  Matrix size: {m} x {total_k}")
+    print(f"  Number of groups: {n_groups}")
+
+    # Generate test data
+    print("\nGenerating test data...")
+    torch.manual_seed(42)
+    input_data = torch.randn(m, total_k, device=device)
+
+    if has_triton:
+        e8m0_scales, _ = to_mx(
+            input_data, elem_dtype=torch.float8_e4m3fn, block_size=block_size
+        )
+
+        input_group_offsets = generate_jagged_offs(
+            n_groups, total_k, multiple_of=block_size, device=device
+        )
+        scale_group_offsets = input_group_offsets // block_size
+
+        print(f"  Scales shape: {e8m0_scales.shape}")
+    else:
+        return False
+
+    # Calculate memory bandwidth metrics
+    bytes_per_element = 1
+    input_bytes = e8m0_scales.numel() * bytes_per_element
+
+    # Test CUDA kernel
+    print("\n" + "-" * 80)
+    print("Running CUDA kernel...")
+    cuda_out_scales = mx_block_rearrange.mx_block_rearrange_2d_K_groups(
+        e8m0_scales.view(torch.uint8),
+        scale_group_offsets,
+    )
+    print("✓ CUDA kernel completed successfully")
+
+    output_bytes = cuda_out_scales.numel() * bytes_per_element
+    total_bytes = input_bytes + output_bytes
+
+    # Compare with Triton reference
+    print("\n" + "-" * 80)
+    print("Running Triton reference kernels...")
+    triton_naive_out = triton_mx_block_rearrange_2d_K_groups_naive(
+        e8m0_scales,
+        scale_group_offsets,
+    )
+    print("✓ Triton kernel completed successfully")
+
+    # Verify correctness
+    cuda_out_e8m0 = cuda_out_scales.view(torch.float8_e8m0fnu)
+    if not torch.equal(triton_naive_out, cuda_out_e8m0):
+        print("✗ CUDA and Triton naive outputs differ!")
+        return False
+    print("✓ All outputs are IDENTICAL!")
+
+    # Benchmark section
+    print("\n" + "=" * 80)
+    print("BENCHMARKING MEMORY BANDWIDTH")
+    print("=" * 80)
+
+    print("\nBenchmarking kernels (100 iterations each)...")
+
+    # Benchmark Triton naive
+    triton_naive_time_us = benchmark_kernel(
+        triton_mx_block_rearrange_2d_K_groups_naive,
+        e8m0_scales,
+        scale_group_offsets,
+    )
+    triton_naive_bw_gbps = (total_bytes / 1e9) / (triton_naive_time_us / 1e6)
+
+    # Benchmark Triton parallel
+    triton_parallel_time_us = benchmark_kernel(
+        triton_mx_block_rearrange_2d_K_groups,
+        e8m0_scales,
+        scale_group_offsets,
+    )
+    triton_parallel_bw_gbps = (total_bytes / 1e9) / (triton_parallel_time_us / 1e6)
+
+    # Benchmark CUDA
+    cuda_time_us = benchmark_kernel(
+        mx_block_rearrange.mx_block_rearrange_2d_K_groups,
+        e8m0_scales.view(torch.uint8),
+        scale_group_offsets,
+    )
+    cuda_bw_gbps = (total_bytes / 1e9) / (cuda_time_us / 1e6)
+
+    # Print results
+    print("\nResults:")
+    print(f"  Input size: {input_bytes / 1e6:.2f} MB")
+    print(f"  Output size: {output_bytes / 1e6:.2f} MB")
+    print(f"  Total I/O: {total_bytes / 1e6:.2f} MB\n")
+    print(f"{'Kernel':<25} {'Time (μs)':<15} {'Bandwidth (GB/s)':<20} {'Speedup':<10}")
+    print("-" * 70)
+    print(
+        f"{'Triton Naive':<25} {triton_naive_time_us:<15.2f} {triton_naive_bw_gbps:<20.2f} {'1.00x':<10}"
+    )
+    print(
+        f"{'Triton Parallel':<25} {triton_parallel_time_us:<15.2f} {triton_parallel_bw_gbps:<20.2f} {triton_naive_time_us / triton_parallel_time_us:<10.2f}x"
+    )
+    print(
+        f"{'CUDA (Optimized)':<25} {cuda_time_us:<15.2f} {cuda_bw_gbps:<20.2f} {triton_naive_time_us / cuda_time_us:<10.2f}x"
+    )
+    print()
+
+    # Highlight best performer
+    best_bw = max(triton_naive_bw_gbps, triton_parallel_bw_gbps, cuda_bw_gbps)
+    if cuda_bw_gbps == best_bw:
+        print("🏆 CUDA kernel achieves highest memory bandwidth!")
+    elif triton_parallel_bw_gbps == best_bw:
+        print("🏆 Triton parallel kernel achieves highest memory bandwidth!")
bandwidth!") + else: + print("🏆 Triton naive kernel achieves highest memory bandwidth!") + + return True + + +if __name__ == "__main__": + success = test_kernel() + + print("\n" + "=" * 80) + if success: + print("🎉 ALL TESTS PASSED!") + sys.exit(0) + else: + print("❌ TESTS FAILED") + sys.exit(1) diff --git a/torchao/prototype/moe_training/kernels/mxfp8/quant.py b/torchao/prototype/moe_training/kernels/mxfp8/quant.py index 24915d6359..f6a9a045d1 100644 --- a/torchao/prototype/moe_training/kernels/mxfp8/quant.py +++ b/torchao/prototype/moe_training/kernels/mxfp8/quant.py @@ -484,12 +484,12 @@ def triton_scale_swizzle_per_group_3d( ) -@triton_op("torchao::triton_mx_block_rearrange_2d_K_groups", mutates_args={}) -def triton_mx_block_rearrange_2d_K_groups( +def triton_mx_block_rearrange_2d_K_groups_naive( scales_tensor: torch.Tensor, input_group_end_offsets: torch.Tensor, ) -> torch.Tensor: """ + Naive version with while loop (before optimization). Rearranges an E8M0 tensor scale to block-scaled swizzle format on a per group basis, where the groups are along the contraction dimension of the GEMM. @@ -499,7 +499,6 @@ def triton_mx_block_rearrange_2d_K_groups( Args: scales_tensor: Input tensor containing e8m0 scales for each logical group of a target tensor. input_group_end_offsets: tensor of int32 values representing group end indexes for the input scales - output_group_start_offsets: tensor of int32 values representing pre-computed group start indexes after blocked format padding Returns: - Rearranged tensor in block-scaled swizzle format """ @@ -522,8 +521,7 @@ def triton_mx_block_rearrange_2d_K_groups( BLOCK_ROWS, BLOCK_COLS = 128, 4 output_stride_per_block = BLOCK_ROWS * BLOCK_COLS - # We parallelize per group and per row block. - # Cols per group is variable, so we just loop through col blocks for each group. + # Naive grid - only parallelize by group and row grid = lambda META: ( num_groups, num_row_blocks, @@ -547,6 +545,186 @@ def triton_mx_block_rearrange_2d_K_groups( return output +@triton_op("torchao::triton_mx_block_rearrange_2d_K_groups", mutates_args={}) +def triton_mx_block_rearrange_2d_K_groups( + scales_tensor: torch.Tensor, + input_group_end_offsets: torch.Tensor, +) -> torch.Tensor: + """ + Parallel version (parallelized over column blocks). + Rearranges an E8M0 tensor scale to block-scaled swizzle format on a per group basis, + where the groups are along the contraction dimension of the GEMM. + + This format is suitable for Tmem as described in NVIDIA documentation: + https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout + + Args: + scales_tensor: Input tensor containing e8m0 scales for each logical group of a target tensor. + input_group_end_offsets: tensor of int32 values representing group end indexes for the input scales + Returns: + - Rearranged tensor in block-scaled swizzle format + """ + assert scales_tensor.ndim == 2, "scales tensor must be 2d" + assert scales_tensor.element_size() == 1, ( + "Expected element size to be 1 byte (8 bits)" + ) + rows, cols = scales_tensor.shape + # Calculate blocks needed + num_groups = input_group_end_offsets.shape[0] + num_row_blocks = ceil_div(rows, 128) + padded_rows = num_row_blocks * 128 + + # Padding needing per group is variable/data dependent, so we just pad each group by + # the upper bound of 4 cols to avoid a d2h sync caused by iterating over each group. 
+    padded_cols = cols + num_groups * 4
+    output = scales_tensor.new_zeros((padded_rows, padded_cols))
+
+    # Output block stride for the rearranged format
+    BLOCK_ROWS, BLOCK_COLS = 128, 4
+    output_stride_per_block = BLOCK_ROWS * BLOCK_COLS
+
+    # Calculate column blocks for the ORIGINAL input tensor (before padding)
+    # by ceil-dividing the number of columns by BLOCK_COLS
+    total_col_blocks = (cols + BLOCK_COLS - 1) // BLOCK_COLS
+
+    # Compute per-group column block counts on GPU for the kernel to use
+    zero = torch.zeros(
+        1, dtype=input_group_end_offsets.dtype, device=scales_tensor.device
+    )
+    group_sizes = torch.diff(input_group_end_offsets, prepend=zero)
+    group_col_block_counts = (group_sizes + BLOCK_COLS - 1) // BLOCK_COLS
+
+    # We parallelize over all column blocks across all groups and row blocks
+    grid = lambda META: (
+        total_col_blocks,
+        num_row_blocks,
+    )
+    wrap_triton(triton_scale_swizzle_2d_K_groups_parallel)[grid](
+        scales_tensor.view(torch.uint8),
+        scales_tensor.stride(0),
+        scales_tensor.stride(1),
+        rows,
+        cols,
+        padded_rows,
+        input_group_end_offsets,
+        group_col_block_counts,
+        output.view(torch.uint8),
+        output_stride_per_block,
+        num_groups=num_groups,
+        BLOCK_ROWS=BLOCK_ROWS,
+        BLOCK_COLS=BLOCK_COLS,
+    )
+    return output
+
+
+@triton.jit
+def triton_scale_swizzle_2d_K_groups_parallel(
+    scales_ptr,  # (M, total_K//block_size)
+    scales_stride_dim0,
+    scales_stride_dim1,
+    scale_rows,
+    scale_cols,
+    padded_rows,
+    orig_offsets,  # (num_groups,)
+    group_col_block_counts,  # (num_groups,) - number of column blocks per group
+    output_scales_ptr,
+    output_stride_per_block,
+    num_groups: tl.constexpr,
+    BLOCK_ROWS: tl.constexpr,
+    BLOCK_COLS: tl.constexpr,
+):
+    """
+    Parallel version that parallelizes over column blocks.
+    Each thread block processes exactly one (row_block, col_block) pair.
+    Uses a vectorized cumulative-sum search to find which group a column block belongs to.
+ """ + col_block_pid = tl.program_id(0) + row_block_pid = tl.program_id(1) + + # Vectorized search to find which group this column block belongs to + # Load all group block counts at once + group_indices = tl.arange(0, num_groups) + all_block_counts = tl.load(group_col_block_counts + group_indices) + + # Compute cumulative sums to get start/end positions of each group + # cumsum_inclusive[i] = total blocks from group 0 to i (inclusive) + cumsum_inclusive = tl.cumsum(all_block_counts, axis=0) + + # cumsum_exclusive[i] = total blocks before group i (exclusive) + # For i > 0: cumsum_exclusive[i] = cumsum_inclusive[i] - all_block_counts[i] + # For i == 0: cumsum_exclusive[i] = 0 + cumsum_exclusive = tl.where( + group_indices > 0, cumsum_inclusive - all_block_counts, 0 + ) + + # Find which group col_block_pid belongs to + # A block belongs to group i if: cumsum_exclusive[i] <= block_id < cumsum_inclusive[i] + is_in_group = (col_block_pid >= cumsum_exclusive) & ( + col_block_pid < cumsum_inclusive + ) + + # Extract the group_pid (sum of indices where condition is true) + group_pid = tl.sum(tl.where(is_in_group, group_indices, 0)) + + # Extract the local column block offset within the group + local_col_block = tl.sum(tl.where(is_in_group, col_block_pid - cumsum_exclusive, 0)) + + # Load group offset boundaries + input_group_start_col = tl.load( + orig_offsets + group_pid - 1, mask=group_pid > 0, other=0 + ) + input_group_end_col = tl.load(orig_offsets + group_pid) + + # Compute input column offset for this specific column block + curr_input_start_col = input_group_start_col + local_col_block * BLOCK_COLS + + # Early exit if beyond group boundary + if curr_input_start_col >= input_group_end_col: + return + + # Calculate this group's start col after blocked format padding + output_group_start_col = _blocked_group_start_idx( + group_pid, orig_offsets, num_groups, 4 + ) + + row_offs = tl.arange(0, BLOCK_ROWS)[:, None] + col_offs = tl.arange(0, BLOCK_COLS)[None, :] + + # Read block of input scales + block_row_offs = row_block_pid * BLOCK_ROWS + row_offs + block_col_offs = curr_input_start_col + col_offs + block_offs = ( + block_row_offs * scales_stride_dim0 + block_col_offs * scales_stride_dim1 + ) + mask = (block_row_offs < scale_rows) & (block_col_offs < input_group_end_col) + input_scales = tl.load(scales_ptr + block_offs, mask=mask, other=0.0) + scales_flat = tl.reshape(input_scales, (BLOCK_ROWS * BLOCK_COLS)) + + # Compute output offset + out_group_base_offset = output_group_start_col * padded_rows + + num_cols_in_group = input_group_end_col - input_group_start_col + num_col_blocks_in_group = tl.cdiv(num_cols_in_group, BLOCK_COLS) + stride_per_row_of_blocks_in_group = ( + num_col_blocks_in_group * output_stride_per_block + ) + + offset_in_group = ( + row_block_pid * stride_per_row_of_blocks_in_group + + local_col_block * output_stride_per_block + ) + final_offset = out_group_base_offset + offset_in_group + + # Apply swizzling and write + dest_indices_flat = _dest_indices_for_block( + row_offs, col_offs, BLOCK_ROWS, BLOCK_COLS + ) + tl.store( + output_scales_ptr + final_offset + dest_indices_flat, + scales_flat, + ) + + @triton.jit def triton_scale_swizzle_2d_K_groups( scales_ptr, # (M, total_K//block_size) diff --git a/torchao/prototype/moe_training/scaled_grouped_mm.py b/torchao/prototype/moe_training/scaled_grouped_mm.py index 3a4ad43b4f..4cb6525c16 100644 --- a/torchao/prototype/moe_training/scaled_grouped_mm.py +++ b/torchao/prototype/moe_training/scaled_grouped_mm.py @@ -26,13 +26,13 @@ 
     _is_column_major,
 )
 from torchao.prototype.mx_formats.config import (
+    KernelPreference,
     MXFP8Dim1CastKernelChoice,
     ScaleCalculationMode,
 )
 from torchao.prototype.mx_formats.kernels import triton_to_mxfp8_dim0
 from torchao.prototype.mx_formats.mx_tensor import to_mx
 from torchao.prototype.mx_formats.utils import _to_mxfp8_dim1_kernel_wrapper
-from torchao.quantization.quantize_.common import KernelPreference

 logger: logging.Logger = logging.getLogger(__name__)

@@ -412,7 +412,7 @@ def backward(ctx, grad_out: torch.Tensor):
             block_size,
             elem_dtype=torch.float8_e4m3fn,
             hp_dtype=grad_out.dtype,
-            kernel_preference=KernelPreference.AUTO,  # Not used
+            kernel_preference=KernelPreference.AUTO,
             cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
             scale_calculation_mode=scale_calculation_mode,
         )
@@ -428,7 +428,7 @@ def backward(ctx, grad_out: torch.Tensor):
             block_size,
             elem_dtype=torch.float8_e4m3fn,
             hp_dtype=A.dtype,
-            kernel_preference=KernelPreference.AUTO,  # Not used
+            kernel_preference=KernelPreference.AUTO,
             cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
             scale_calculation_mode=scale_calculation_mode,
         )
@@ -475,7 +475,7 @@ def _to_mxfp8_dim1_3d(
         block_size,
         elem_dtype=torch.float8_e4m3fn,
         hp_dtype=B_reshaped.dtype,
-        kernel_preference=KernelPreference.AUTO,  # Not used
+        kernel_preference=KernelPreference.AUTO,
         cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
         scale_calculation_mode=scaling_mode,
     )
diff --git a/torchao/prototype/mx_formats/kernels.py b/torchao/prototype/mx_formats/kernels.py
index b4cd192244..72a19e2c86 100644
--- a/torchao/prototype/mx_formats/kernels.py
+++ b/torchao/prototype/mx_formats/kernels.py
@@ -626,9 +626,10 @@ def triton_mxfp8_dequant_dim0(
     scale_block_size: int = 32,
 ) -> torch.Tensor:
     assert scale_block_size == 32, "scale_block_size must be 32 for now"
-    assert out_dtype in (torch.bfloat16, torch.float32), (
-        "out_dtype must be bf16 or fp32"
-    )
+    assert out_dtype in (
+        torch.bfloat16,
+        torch.float32,
+    ), "out_dtype must be bf16 or fp32"

     # Input shape must be 2D.
     orig_shape = e4m3_data.shape
@@ -1055,6 +1056,7 @@ def _(scale_tensor):
         padded_cols = n_col_blocks * 4

         return scale_tensor.new_empty((padded_rows, padded_cols))
+
 else:

     def triton_to_mxfp8_dim0(
@@ -1216,6 +1218,7 @@ def custom_mxfp8_quantize_cuda_dim1_sharding(
         rule_for_input_sharded_dim1,
     ]
     return acceptable_shardings
+
 else:

     def mxfp8_quantize_cuda(

From daf9ffdf1c051583134b63beda9c58caeb0d5ff7 Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre
Date: Sat, 6 Dec 2025 23:01:12 -0800
Subject: [PATCH 2/4] stick with loop for now

---
 .../mx_block_rearrange_2d_K_groups.cu | 43 +++-----------
 1 file changed, 6 insertions(+), 37 deletions(-)

diff --git a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
index c97baec10d..38eb4159c5 100644
--- a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
+++ b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
@@ -2,15 +2,12 @@
 #include <cuda_runtime.h>
 #include <cstdint>
 #include <cstdio>
-
 #define BLOCK_ROWS 128
 #define BLOCK_COLS 4
-
 // Helper function to compute ceil division
 __device__ __forceinline__ int ceil_div(int a, int b) {
   return (a + b - 1) / b;
 }
-
 // Helper function to compute the start index of a group after padding
 __device__ __forceinline__ int compute_output_group_start_col(
     int group_id,
@@ -19,7 +16,6 @@ __device__ __forceinline__ int compute_output_group_start_col(
     int padding_size
 ) {
   int start_idx = 0;
-
   // Compute prefix sum of padded group sizes
   for (int i = 0; i < group_id; i++) {
     int prev_offset = (i > 0) ? input_group_end_offsets[i - 1] : 0;
@@ -28,10 +24,8 @@ __device__ __forceinline__ int compute_output_group_start_col(
     int padded_size = ceil_div(group_size, padding_size) * padding_size;
     start_idx += padded_size;
   }
-
   return start_idx;
 }
-
 // Compute destination index for swizzled block layout
 // For a 128x4 block: r_div_32 = row / 32, r_mod_32 = row % 32
 // Swizzle: dest = r_mod_32 * 16 + r_div_32 * 4 + col
@@ -40,7 +34,6 @@ __device__ __forceinline__ int compute_swizzled_index(int row, int col) {
   int r_mod_32 = row % 32;
   return r_mod_32 * 16 + r_div_32 * 4 + col;
 }
-
 __global__ void mx_block_rearrange_2d_K_groups_naive_kernel(
     const uint8_t* __restrict__ scales_ptr,
     int scales_stride_dim0,
@@ -55,83 +48,65 @@ __global__ void mx_block_rearrange_2d_K_groups_naive_kernel(
   const int group_id = blockIdx.x;
   const int block_row_id = blockIdx.y;
   const int tid = threadIdx.x;  // 128 threads, each handles one row
-
   // Shared memory for one 128x4 block
   __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS];
-
   // Get start/end cols of this input group
   int input_group_start_col = (group_id > 0) ? input_group_end_offsets[group_id - 1] : 0;
   int input_group_end_col = input_group_end_offsets[group_id];
   int num_cols_in_group = input_group_end_col - input_group_start_col;
-
   // Get output group start column
   int output_group_start_col = compute_output_group_start_col(
       group_id,
       input_group_end_offsets,
       num_groups,
       4);  // scaling factor column padding size
-
   // Compute base offset for this group in output
   int out_group_base_offset = output_group_start_col * padded_rows;
-
   // Compute stride per row of blocks in this group
   int num_col_blocks_in_group = ceil_div(num_cols_in_group, BLOCK_COLS);
   int stride_per_row_of_blocks_in_group = num_col_blocks_in_group * output_stride_per_block;
-
   // Each thread handles one row
   int input_row = block_row_id * BLOCK_ROWS + tid;
-
   // Loop through column blocks in this group
   int curr_input_start_col = input_group_start_col;
   int curr_out_col_block = 0;
-
   while (curr_input_start_col < input_group_end_col) {
     // Calculate how many columns to load for this block
     int cols_remaining = input_group_end_col - curr_input_start_col;
     int cols_to_load = min(BLOCK_COLS, cols_remaining);
-
     // Load data for this row using vectorized loads when possible
     uint32_t row_data = 0;
-
     if (input_row < scale_rows && curr_input_start_col < input_group_end_col) {
       int input_offset = input_row * scales_stride_dim0 + curr_input_start_col;
       const uint8_t* input_ptr = scales_ptr + input_offset;
-
       // Check alignment and available columns within this group
       uintptr_t ptr_addr = reinterpret_cast<uintptr_t>(input_ptr);
-
       if (cols_to_load >= 4 && ptr_addr % 4 == 0 && curr_input_start_col + 4 <= input_group_end_col) {
         // 4-byte aligned and have 4 columns within group: use uint32_t load
-        row_data = *reinterpret_cast<const uint32_t*>(input_ptr);
+        row_data = __ldg(reinterpret_cast<const uint32_t*>(input_ptr));
       } else {
         // Byte-by-byte loads for unaligned or partial blocks
         uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
         for (int i = 0; i < cols_to_load && (curr_input_start_col + i) < input_group_end_col; i++) {
-          row_bytes[i] = input_ptr[i];
+          row_bytes[i] = __ldg(input_ptr + i);
         }
       }
     }
-
     // Write to swizzled positions in shared memory
     uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
     #pragma unroll
     for (int col = 0; col < BLOCK_COLS; col++) {
       int swizzled_idx = compute_swizzled_index(tid, col);
       smem_block[swizzled_idx] = row_bytes[col];
     }
-
     __syncthreads();
-
     // Write from shared memory to global memory
     // Calculate the output offset for this specific block
     int offset_in_group = block_row_id * stride_per_row_of_blocks_in_group +
                           curr_out_col_block * output_stride_per_block;
     int final_offset = out_group_base_offset + offset_in_group;
-
     // Each thread writes 4 bytes (one row of the 128x4 block)
     uint8_t* output_ptr = output_scales_ptr + final_offset + tid * BLOCK_COLS;
-
     // Check output alignment for vectorized write
     uintptr_t out_ptr_addr = reinterpret_cast<uintptr_t>(output_ptr);
     if (out_ptr_addr % 4 == 0) {
@@ -146,18 +121,17 @@ __global__ void mx_block_rearrange_2d_K_groups_naive_kernel(
       output_ptr[i] = smem_ptr[i];
     }
   }
-
-  __syncthreads();
-
   // Advance to next column block
   curr_input_start_col += BLOCK_COLS;
   curr_out_col_block += 1;
+  // Only sync if there's another iteration
+  if (curr_input_start_col < input_group_end_col) {
+    __syncthreads();
+  }
 }
}
-
 // Host function to launch the kernel
 namespace mxfp8 {
-
 void launch_mx_block_rearrange_2d_K_groups(
     const uint8_t* scales_ptr,
     int scales_stride_dim0,
@@ -170,14 +144,11 @@ void launch_mx_block_rearrange_2d_K_groups(
     cudaStream_t stream
 ) {
   int num_row_blocks = (scale_rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
-
   // Grid parallelizes over (num_groups, num_row_blocks)
   // Each thread block loops through column blocks within its group
   dim3 grid(num_groups, num_row_blocks);
   dim3 block(128);  // 128 threads, each handling one row
-
   int output_stride_per_block = BLOCK_ROWS * BLOCK_COLS;
-
   mx_block_rearrange_2d_K_groups_naive_kernel<<<grid, block, 0, stream>>>(
       scales_ptr,
       scales_stride_dim0,
       scale_rows,
       scale_cols,
       padded_rows,
       input_group_end_offsets,
       output_scales_ptr,
       output_stride_per_block,
       num_groups
   );
-
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     printf("CUDA Error: %s\n", cudaGetErrorString(err));
   }
 }
-
 } // namespace mxfp8
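Before the next patch, it helps to see why a parallel grid over column blocks must be over-allocated: summing ceil(group_size / 4) over groups can exceed ceil(total_cols / 4) by up to one block per group, since each group may end with a partial block. A small Python check of that bound (hypothetical sizes, not part of the patch):

# Illustrative check of the per-group column block count versus the upper bound.
def blocks_needed(group_ends, block_cols=4):
    prev, total = 0, 0
    for end in group_ends:
        total += -(-(end - prev) // block_cols)  # ceil_div(group_size, 4)
        prev = end
    return total

group_ends = [5, 13, 16]  # sizes 5, 8, 3 -> ceil: 2 + 2 + 1 = 5 blocks
assert blocks_needed(group_ends) == 5                      # exceeds ceil(16/4) = 4
assert blocks_needed(group_ends) <= -(-16 // 4) + len(group_ends)  # bound: 4 + 3 = 7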
From a5d83e317d5f26e44bc12da64840b2b300eff8bf Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre
Date: Sun, 7 Dec 2025 11:38:28 -0800
Subject: [PATCH 3/4] cuda parallel idea working now

---
 ...h_triton_mx_block_rearrange_2d_K_groups.py |  29 ++-
 .../mx_block_rearrange_2d_K_groups.cu         | 237 +++++++++++++++---
 .../csrc/cuda/mx_kernels/mxfp8_extension.cpp  |  81 +++++-
 .../test_mx_block_rearrange_standalone.py    |  83 +++---
 4 files changed, 352 insertions(+), 78 deletions(-)

diff --git a/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
index 35245f13bf..f473166e4b 100644
--- a/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
+++ b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py
@@ -52,7 +52,7 @@
             "-O3",
             "--use_fast_math",
             "-std=c++17",
-            "-gencode=arch=compute_90,code=sm_90",
+            "-gencode=arch=compute_100,code=sm_100",
         ],
         extra_cflags=["-O3", "-std=c++17"],
         verbose=True,
@@ -101,7 +101,7 @@ def get_configs() -> List[ExperimentConfig]:
         (2048, 131072 // block_size),
     ]
     num_groups = [8]
-    versions = ["naive", "parallel", "cuda"]
+    versions = ["triton_naive", "triton_parallel", "cuda_parallel", "cuda_naive"]

     configs = []
     for shape, groups, version in itertools.product(
@@ -138,12 +138,18 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult:
     input_group_offsets = generate_jagged_offs(num_groups, Kg, multiple_of=block_size)

     # Select which kernel to benchmark based on version
-    if version == "naive":
+    if version == "triton_naive":
         kernel_fn = triton_mx_block_rearrange_2d_K_groups_naive
-    elif version == "parallel":
+    elif version == "triton_parallel":
         kernel_fn = triton_mx_block_rearrange_2d_K_groups
-    elif version == "cuda":
+    elif version == "cuda_parallel":
+        if mxfp8_cuda is None:
+            raise RuntimeError("CUDA kernel not available")
         kernel_fn = mxfp8_cuda.mx_block_rearrange_2d_K_groups
+    elif version == "cuda_naive":
+        if mxfp8_cuda is None:
+            raise RuntimeError("CUDA kernel not available")
+        kernel_fn = mxfp8_cuda.mx_block_rearrange_2d_K_groups_naive
     else:
         raise ValueError(f"Unknown version: {version}")

@@ -191,6 +197,7 @@ def print_results(experiments: List[Experiment]):
         "time_us",
         "mem_bw_gbps",
         "fastest_version",
+        "speedup_vs_triton_naive",
     ]

     rows = []
@@ -198,8 +205,19 @@ def print_results(experiments: List[Experiment]):
         # Find fastest version for this shape
         fastest_version = min(versions.items(), key=lambda x: x[1].time_us)[0]

+        # Get naive baseline time for speedup calculation
+        naive_time_us = (
+            versions.get("triton_naive").time_us if "triton_naive" in versions else None
+        )
+
         # Add rows for each version
         for version, result in versions.items():
+            # Calculate speedup vs naive
+            speedup_str = ""
+            if naive_time_us and naive_time_us > 0:
+                speedup = naive_time_us / result.time_us
+                speedup_str = f"{speedup:.2f}x"
+
             rows.append(
                 [
                     version,
                     f"{result.time_us:.2f}",
                     round(result.mem_bw_gbps, 3),
                     fastest_version,
+                    speedup_str,
                 ]
             )
diff --git a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
index 38eb4159c5..a904e472ed 100644
--- a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
+++ b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
@@ -2,13 +2,48 @@
 #include <cuda_runtime.h>
 #include <cstdint>
 #include <cstdio>
+
 #define BLOCK_ROWS 128
 #define BLOCK_COLS 4
-// Helper function to compute ceil division
+
 __device__ __forceinline__ int ceil_div(int a, int b) {
   return (a + b - 1) / b;
 }
-// Helper function to compute the start index of a group after padding
+
+__device__ void find_group_and_local_offset(
+    int col_block_pid,
+    const int32_t* __restrict__ input_group_end_offsets,
+    int num_groups,
+    int* __restrict__ smem_cumsum,
+    int& group_id,
+    int& local_col_block
+) {
+  if (threadIdx.x == 0) {
+    int cumsum = 0;
+    for (int g = 0; g < num_groups; g++) {
+      int input_group_start = (g > 0) ? input_group_end_offsets[g - 1] : 0;
+      int input_group_end = input_group_end_offsets[g];
+      int group_size = input_group_end - input_group_start;
+      int num_col_blocks = ceil_div(group_size, BLOCK_COLS);
+      cumsum += num_col_blocks;
+      smem_cumsum[g] = cumsum;
+    }
+  }
+  __syncthreads();
+
+  group_id = 0;
+  int cumsum_before = 0;
+  for (int g = 0; g < num_groups; g++) {
+    int cumsum_at_g = smem_cumsum[g];
+    if (col_block_pid < cumsum_at_g) {
+      group_id = g;
+      local_col_block = col_block_pid - cumsum_before;
+      break;
+    }
+    cumsum_before = cumsum_at_g;
+  }
+}
+
 __device__ __forceinline__ int compute_output_group_start_col(
     int group_id,
     const int32_t* input_group_end_offsets,
@@ -16,7 +51,6 @@ __device__ __forceinline__ int compute_output_group_start_col(
     int padding_size
 ) {
   int start_idx = 0;
-  // Compute prefix sum of padded group sizes
   for (int i = 0; i < group_id; i++) {
     int prev_offset = (i > 0) ? input_group_end_offsets[i - 1] : 0;
     int curr_offset = input_group_end_offsets[i];
@@ -26,9 +60,7 @@ __device__ __forceinline__ int compute_output_group_start_col(
     int padded_size = ceil_div(group_size, padding_size) * padding_size;
     start_idx += padded_size;
   }
   return start_idx;
 }
-// Compute destination index for swizzled block layout
-// For a 128x4 block: r_div_32 = row / 32, r_mod_32 = row % 32
-// Swizzle: dest = r_mod_32 * 16 + r_div_32 * 4 + col
+
 __device__ __forceinline__ int compute_swizzled_index(int row, int col) {
   int r_div_32 = row / 32;
   int r_mod_32 = row % 32;
@@ -47,92 +79,176 @@ __global__ void mx_block_rearrange_2d_K_groups_naive_kernel(
 ) {
   const int group_id = blockIdx.x;
   const int block_row_id = blockIdx.y;
-  const int tid = threadIdx.x;  // 128 threads, each handles one row
-  // Shared memory for one 128x4 block
+  const int tid = threadIdx.x;
+
   __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS];
-  // Get start/end cols of this input group
+
   int input_group_start_col = (group_id > 0) ? input_group_end_offsets[group_id - 1] : 0;
   int input_group_end_col = input_group_end_offsets[group_id];
   int num_cols_in_group = input_group_end_col - input_group_start_col;
-  // Get output group start column
+
   int output_group_start_col = compute_output_group_start_col(
-      group_id,
-      input_group_end_offsets,
-      num_groups,
-      4);  // scaling factor column padding size
-  // Compute base offset for this group in output
+      group_id, input_group_end_offsets, num_groups, 4);
+
   int out_group_base_offset = output_group_start_col * padded_rows;
-  // Compute stride per row of blocks in this group
+
   int num_col_blocks_in_group = ceil_div(num_cols_in_group, BLOCK_COLS);
   int stride_per_row_of_blocks_in_group = num_col_blocks_in_group * output_stride_per_block;
-  // Each thread handles one row
+
   int input_row = block_row_id * BLOCK_ROWS + tid;
-  // Loop through column blocks in this group
+
   int curr_input_start_col = input_group_start_col;
   int curr_out_col_block = 0;
+
   while (curr_input_start_col < input_group_end_col) {
-    // Calculate how many columns to load for this block
     int cols_remaining = input_group_end_col - curr_input_start_col;
     int cols_to_load = min(BLOCK_COLS, cols_remaining);
-    // Load data for this row using vectorized loads when possible
+
     uint32_t row_data = 0;
     if (input_row < scale_rows && curr_input_start_col < input_group_end_col) {
       int input_offset = input_row * scales_stride_dim0 + curr_input_start_col;
       const uint8_t* input_ptr = scales_ptr + input_offset;
-      // Check alignment and available columns within this group
+
       uintptr_t ptr_addr = reinterpret_cast<uintptr_t>(input_ptr);
       if (cols_to_load >= 4 && ptr_addr % 4 == 0 && curr_input_start_col + 4 <= input_group_end_col) {
         row_data = __ldg(reinterpret_cast<const uint32_t*>(input_ptr));
       } else {
         uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
         for (int i = 0; i < cols_to_load && (curr_input_start_col + i) < input_group_end_col; i++) {
           row_bytes[i] = __ldg(input_ptr + i);
         }
       }
     }
-    // Write to swizzled positions in shared memory
+
     uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
     #pragma unroll
     for (int col = 0; col < BLOCK_COLS; col++) {
       int swizzled_idx = compute_swizzled_index(tid, col);
       smem_block[swizzled_idx] = row_bytes[col];
     }
+
     __syncthreads();
-    // Write from shared memory to global memory
-    // Calculate the output offset for this specific block
+
     int offset_in_group = block_row_id * stride_per_row_of_blocks_in_group +
                           curr_out_col_block * output_stride_per_block;
     int final_offset = out_group_base_offset + offset_in_group;
-    // Each thread writes 4 bytes (one row of the 128x4 block)
+
     uint8_t* output_ptr = output_scales_ptr + final_offset + tid * BLOCK_COLS;
-    // Check output alignment for vectorized write
     uintptr_t out_ptr_addr = reinterpret_cast<uintptr_t>(output_ptr);
-    if (out_ptr_addr % 4 == 0) {
-      // Aligned: use uint32_t store
+
+    if (out_ptr_addr % 4 == 0 && cols_to_load >= 4) {
       *reinterpret_cast<uint32_t*>(output_ptr) =
           *reinterpret_cast<const uint32_t*>(&smem_block[tid * BLOCK_COLS]);
     } else {
-      // Unaligned: byte by byte
       const uint8_t* smem_ptr = &smem_block[tid * BLOCK_COLS];
       #pragma unroll
-      for (int i = 0; i < BLOCK_COLS; i++) {
+      for (int i = 0; i < cols_to_load; i++) {
         output_ptr[i] = smem_ptr[i];
       }
     }
-    // Advance to next column block
+
     curr_input_start_col += BLOCK_COLS;
     curr_out_col_block += 1;
-    // Only sync if there's another iteration
+
     if (curr_input_start_col < input_group_end_col) {
       __syncthreads();
     }
   }
 }
-// Host function to launch the kernel
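+// Worked example of the mapping performed by find_group_and_local_offset above
+// (hypothetical sizes): with group end offsets {5, 13} and BLOCK_COLS = 4, the
+// group sizes are {5, 8}, the per-group block counts ceil_div to {2, 2}, and
+// the cumulative counts are {2, 4}; flat col_block_pid 0-1 then maps to group 0
+// (local blocks 0-1) and pid 2-3 maps to group 1 (local blocks 0-1).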
+__global__ void mx_block_rearrange_2d_K_groups_parallel_kernel(
+    const uint8_t* __restrict__ scales_ptr,
+    int scales_stride_dim0,
+    int scale_rows,
+    int scale_cols,
+    int padded_rows,
+    const int32_t* __restrict__ input_group_end_offsets,
+    uint8_t* __restrict__ output_scales_ptr,
+    int output_stride_per_block,
+    int num_groups
+) {
+  const int col_block_pid = blockIdx.x;
+  const int row_block_pid = blockIdx.y;
+  const int tid = threadIdx.x;
+
+  __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS];
+  __shared__ int smem_cumsum[32];
+
+  int group_id, local_col_block;
+  find_group_and_local_offset(
+      col_block_pid,
+      input_group_end_offsets,
+      num_groups,
+      smem_cumsum,
+      group_id,
+      local_col_block
+  );
+
+  int input_group_start_col = (group_id > 0) ? input_group_end_offsets[group_id - 1] : 0;
+  int input_group_end_col = input_group_end_offsets[group_id];
+  int curr_input_start_col = input_group_start_col + local_col_block * BLOCK_COLS;
+
+  if (curr_input_start_col >= input_group_end_col) {
+    return;
+  }
+
+  int input_row = row_block_pid * BLOCK_ROWS + tid;
+  int cols_remaining = input_group_end_col - curr_input_start_col;
+  int cols_to_load = min(BLOCK_COLS, cols_remaining);
+
+  uint32_t row_data = 0;
+  if (input_row < scale_rows && curr_input_start_col < input_group_end_col) {
+    int input_offset = input_row * scales_stride_dim0 + curr_input_start_col;
+    const uint8_t* input_ptr = scales_ptr + input_offset;
+
+    uintptr_t ptr_addr = reinterpret_cast<uintptr_t>(input_ptr);
+    if (cols_to_load >= 4 && ptr_addr % 4 == 0 && curr_input_start_col + 4 <= input_group_end_col) {
+      row_data = __ldg(reinterpret_cast<const uint32_t*>(input_ptr));
+    } else {
+      uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
+      for (int i = 0; i < cols_to_load && (curr_input_start_col + i) < input_group_end_col; i++) {
+        row_bytes[i] = __ldg(input_ptr + i);
+      }
+    }
+  }
+
+  uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
+  #pragma unroll
+  for (int col = 0; col < BLOCK_COLS; col++) {
+    int swizzled_idx = compute_swizzled_index(tid, col);
+    smem_block[swizzled_idx] = row_bytes[col];
+  }
+
+  __syncthreads();
+
+  int output_group_start_col = compute_output_group_start_col(
+      group_id, input_group_end_offsets, num_groups, 4
+  );
+  int out_group_base_offset = output_group_start_col * padded_rows;
+
+  int num_cols_in_group = input_group_end_col - input_group_start_col;
+  int num_col_blocks_in_group = ceil_div(num_cols_in_group, BLOCK_COLS);
+  int stride_per_row_of_blocks_in_group = num_col_blocks_in_group * output_stride_per_block;
+
+  int offset_in_group = row_block_pid * stride_per_row_of_blocks_in_group +
+                        local_col_block * output_stride_per_block;
+  int final_offset = out_group_base_offset + offset_in_group;
+
+  uint8_t* output_ptr = output_scales_ptr + final_offset + tid * BLOCK_COLS;
+  uintptr_t out_ptr_addr = reinterpret_cast<uintptr_t>(output_ptr);
+
+  if (out_ptr_addr % 4 == 0 && cols_to_load >= 4) {
+    *reinterpret_cast<uint32_t*>(output_ptr) =
+        *reinterpret_cast<const uint32_t*>(&smem_block[tid * BLOCK_COLS]);
+  } else {
+    const uint8_t* smem_ptr = &smem_block[tid * BLOCK_COLS];
+    #pragma unroll
+    for (int i = 0; i < cols_to_load; i++) {
+      output_ptr[i] = smem_ptr[i];
+    }
+  }
+}
 namespace mxfp8 {

-void launch_mx_block_rearrange_2d_K_groups(
+void launch_mx_block_rearrange_2d_K_groups_naive(
     const uint8_t* scales_ptr,
     int scales_stride_dim0,
     int scale_rows,
@@ -144,11 +260,11 @@ void launch_mx_block_rearrange_2d_K_groups(
     cudaStream_t stream
 ) {
   int num_row_blocks = (scale_rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
-  // Grid parallelizes over (num_groups, num_row_blocks)
-  // Each thread block loops through column blocks within its group
-  dim3 grid(num_groups, num_row_blocks);
-  dim3 block(128);  // 128 threads, each handling one row
   int output_stride_per_block = BLOCK_ROWS * BLOCK_COLS;
+
+  dim3 grid(num_groups, num_row_blocks);
+  dim3 block(128);
+
   mx_block_rearrange_2d_K_groups_naive_kernel<<<grid, block, 0, stream>>>(
       scales_ptr,
       scales_stride_dim0,
       scale_rows,
       scale_cols,
       padded_rows,
       input_group_end_offsets,
       output_scales_ptr,
       output_stride_per_block,
       num_groups
   );
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("CUDA Error (naive): %s\n", cudaGetErrorString(err));
+  }
+}
+
+void launch_mx_block_rearrange_2d_K_groups(
+    const uint8_t* scales_ptr,
+    int scales_stride_dim0,
+    int scale_rows,
+    int scale_cols,
+    int padded_rows,
+    const int32_t* input_group_end_offsets,
+    uint8_t* output_scales_ptr,
+    int num_groups,
+    cudaStream_t stream
+) {
+  int num_row_blocks = (scale_rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
+  int output_stride_per_block = BLOCK_ROWS * BLOCK_COLS;
+
+  // Over-allocate column blocks, to avoid the d2h sync that computing the exact
+  // block count on device and copying it to the host for the grid would require.
+  // Maximum is: total_col_blocks + num_groups (one extra per group for padding)
+  int total_col_blocks = (scale_cols + BLOCK_COLS - 1) / BLOCK_COLS + num_groups;
+
+  dim3 grid(total_col_blocks, num_row_blocks);
+  dim3 block(128);
+
+  mx_block_rearrange_2d_K_groups_parallel_kernel<<<grid, block, 0, stream>>>(
+      scales_ptr,
+      scales_stride_dim0,
+      scale_rows,
+      scale_cols,
+      padded_rows,
+      input_group_end_offsets,
+      output_scales_ptr,
+      output_stride_per_block,
+      num_groups
+  );
+
   cudaError_t err = cudaGetLastError();
   if (err != cudaSuccess) {
     printf("CUDA Error: %s\n", cudaGetErrorString(err));
diff --git a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
index 55db12c40e..2690ba1449 100644
--- a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
+++ b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
@@ -36,6 +36,17 @@ void launch_mx_block_rearrange_2d_K_groups(
     int num_groups,
     cudaStream_t stream);

+void launch_mx_block_rearrange_2d_K_groups_naive(
+    const uint8_t* scales_ptr,
+    int scales_stride_dim0,
+    int scale_rows,
+    int scale_cols,
+    int padded_rows,
+    const int32_t* input_group_end_offsets,
+    uint8_t* output_scales_ptr,
+    int num_groups,
+    cudaStream_t stream);
+
 // Helper for tensor validation
 void check_cuda_tensor(const torch::Tensor &t, const char *name) {
   TORCH_CHECK(t.is_cuda(), name, " must be a CUDA tensor");
@@ -233,7 +244,7 @@ torch::Tensor mx_block_rearrange_2d_K_groups(
   const int32_t* offsets_ptr = input_group_end_offsets.data_ptr<int32_t>();
   uint8_t* output_ptr = output.data_ptr<uint8_t>();

-  // Launch kernel
+  // Launch parallel kernel (optimized)
   launch_mx_block_rearrange_2d_K_groups(
       scales_ptr,
       scales_tensor.stride(0),
@@ -248,6 +259,66 @@ torch::Tensor mx_block_rearrange_2d_K_groups(

   return output;
 }

+// Python wrapper for mx_block_rearrange_2d_K_groups_naive
+torch::Tensor mx_block_rearrange_2d_K_groups_naive(
+    torch::Tensor scales_tensor,
+    torch::Tensor input_group_end_offsets) {
+
+  // Validate inputs
+  check_cuda_tensor(scales_tensor, "scales_tensor");
+  check_cuda_tensor(input_group_end_offsets, "input_group_end_offsets");
+
+  TORCH_CHECK(scales_tensor.dim() == 2, "scales_tensor must be 2D");
+  TORCH_CHECK(scales_tensor.scalar_type() == torch::kUInt8 ||
+              scales_tensor.scalar_type() == torch::kFloat8_e8m0fnu,
+              "scales_tensor must be uint8 or e8m0");
+  TORCH_CHECK(input_group_end_offsets.scalar_type() == torch::kInt32,
+              "input_group_end_offsets must be int32");
+  TORCH_CHECK(input_group_end_offsets.dim() == 1,
+              "input_group_end_offsets must be 1D");
+
+  c10::cuda::CUDAGuard device_guard(scales_tensor.device());
+
+  const int rows = scales_tensor.size(0);
+  const int cols = scales_tensor.size(1);
+  const int num_groups = input_group_end_offsets.size(0);
+  TORCH_CHECK(num_groups <= 32, "num_groups must be <= 32");
+
+  // Calculate blocks needed
+  const int BLOCK_ROWS = 128;
+  const int BLOCK_COLS = 4;
+  const int num_row_blocks = (rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
+  const int padded_rows = num_row_blocks * BLOCK_ROWS;
+
+  // Padding per group is variable/data dependent, so pad each group by upper bound
+  const int padded_cols = cols + num_groups * BLOCK_COLS;
+
+  // Create output tensor
+  auto output = torch::zeros({padded_rows, padded_cols},
+                             torch::TensorOptions()
+                                 .dtype(scales_tensor.scalar_type())
+                                 .device(scales_tensor.device()));
+
+  // Get raw pointers
+  const uint8_t* scales_ptr = scales_tensor.data_ptr<uint8_t>();
+  const int32_t* offsets_ptr = input_group_end_offsets.data_ptr<int32_t>();
+  uint8_t* output_ptr = output.data_ptr<uint8_t>();
+
+  // Launch naive kernel (original with while loop)
+  launch_mx_block_rearrange_2d_K_groups_naive(
+      scales_ptr,
+      scales_tensor.stride(0),
+      rows,
+      cols,
+      padded_rows,
+      offsets_ptr,
+      output_ptr,
+      num_groups,
+      at::cuda::getCurrentCUDAStream());
+
+  return output;
+}
+
 } // namespace mxfp8

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
@@ -266,7 +337,13 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

   m.def("mx_block_rearrange_2d_K_groups",
         &mxfp8::mx_block_rearrange_2d_K_groups,
-        "Rearrange E8M0 scales to block-scaled swizzle format for cuBLAS Tmem",
+        "Rearrange E8M0 scales to block-scaled swizzle format (parallelized)",
+        py::arg("scales_tensor"),
+        py::arg("input_group_end_offsets"));
+
+  m.def("mx_block_rearrange_2d_K_groups_naive",
+        &mxfp8::mx_block_rearrange_2d_K_groups_naive,
+        "Rearrange E8M0 scales to block-scaled swizzle format (naive version)",
         py::arg("scales_tensor"),
         py::arg("input_group_end_offsets"));
 }
diff --git a/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
index 9b779c655c..764d9953da 100644
--- a/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
+++ b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
@@ -28,7 +28,7 @@
         "-O3",
         "--use_fast_math",
         "-std=c++17",
-        "-gencode=arch=compute_90,code=sm_90",
+        "-gencode=arch=compute_100,code=sm_100",
     ],
     extra_cflags=["-O3", "-std=c++17"],
     verbose=True,
@@ -68,9 +68,6 @@ def test_kernel():
         ao_root = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "..", "..", ".."))
         sys.path.insert(0, ao_root)

-        from torchao.prototype.moe_training.kernels.mxfp8 import (
-            triton_mx_block_rearrange_2d_K_groups,
-        )
         from torchao.prototype.moe_training.kernels.mxfp8.quant import (
             triton_mx_block_rearrange_2d_K_groups_naive,
         )
@@ -118,31 +115,55 @@ def test_kernel():

     # Test CUDA kernel
     print("\n" + "-" * 80)
-    print("Running CUDA kernel...")
-    cuda_out_scales = mx_block_rearrange.mx_block_rearrange_2d_K_groups(
+    print("Running CUDA parallel kernel (optimized)...")
+    cuda_parallel_out_scales = mx_block_rearrange.mx_block_rearrange_2d_K_groups(
+        e8m0_scales.view(torch.uint8),
+        scale_group_offsets,
+    )
+    print("✓ CUDA parallel kernel completed successfully")
+
+    # Test CUDA naive kernel
+    print("\n" + "-" * 80)
+    print("Running CUDA naive kernel...")
+ cuda_naive_out_scales = mx_block_rearrange.mx_block_rearrange_2d_K_groups_naive( e8m0_scales.view(torch.uint8), scale_group_offsets, ) - print("✓ CUDA kernel completed successfully") + print("✓ CUDA naive kernel completed successfully") - output_bytes = cuda_out_scales.numel() * bytes_per_element + output_bytes = cuda_parallel_out_scales.numel() * bytes_per_element total_bytes = input_bytes + output_bytes - # Compare with Triton reference + # Compare with Triton naive reference print("\n" + "-" * 80) - print("Running Triton reference kernels...") + print("Running Triton naive reference kernel...") triton_naive_out = triton_mx_block_rearrange_2d_K_groups_naive( e8m0_scales, scale_group_offsets, ) - print("✓ Triton kernel completed successfully") + print("✓ Triton naive kernel completed successfully") # Verify correctness - cuda_out_e8m0 = cuda_out_scales.view(torch.float8_e8m0fnu) - if not torch.equal(triton_naive_out, cuda_out_e8m0): - print("✗ CUDA and Triton naive outputs differ!") + cuda_parallel_out_e8m0 = cuda_parallel_out_scales.view(torch.float8_e8m0fnu) + cuda_naive_out_e8m0 = cuda_naive_out_scales.view(torch.float8_e8m0fnu) + + print("\nVerifying correctness...") + if not torch.equal(triton_naive_out, cuda_naive_out_e8m0): + print("✗ CUDA naive and Triton naive outputs differ!") + return False + print("✓ CUDA naive matches Triton naive") + + if not torch.equal(triton_naive_out, cuda_parallel_out_e8m0): + print("✗ CUDA parallel and Triton naive outputs differ!") + return False + print("✓ CUDA parallel matches Triton naive") + + if not torch.equal(cuda_naive_out_e8m0, cuda_parallel_out_e8m0): + print("✗ CUDA naive and CUDA parallel outputs differ!") return False - print("✓ All outputs are IDENTICAL!") + print("✓ CUDA naive and CUDA parallel match each other") + + print("\n✓ All outputs are IDENTICAL!") # Benchmark section print("\n" + "=" * 80) @@ -159,21 +180,21 @@ def test_kernel(): ) triton_naive_bw_gbps = (total_bytes / 1e9) / (triton_naive_time_us / 1e6) - # Benchmark Triton parallel - triton_parallel_time_us = benchmark_kernel( - triton_mx_block_rearrange_2d_K_groups, - e8m0_scales, + # Benchmark CUDA parallel (optimized) + cuda_parallel_time_us = benchmark_kernel( + mx_block_rearrange.mx_block_rearrange_2d_K_groups, + e8m0_scales.view(torch.uint8), scale_group_offsets, ) - triton_parallel_bw_gbps = (total_bytes / 1e9) / (triton_parallel_time_us / 1e6) + cuda_parallel_bw_gbps = (total_bytes / 1e9) / (cuda_parallel_time_us / 1e6) - # Benchmark CUDA - cuda_time_us = benchmark_kernel( - mx_block_rearrange.mx_block_rearrange_2d_K_groups, + # Benchmark CUDA naive + cuda_naive_time_us = benchmark_kernel( + mx_block_rearrange.mx_block_rearrange_2d_K_groups_naive, e8m0_scales.view(torch.uint8), scale_group_offsets, ) - cuda_bw_gbps = (total_bytes / 1e9) / (cuda_time_us / 1e6) + cuda_naive_bw_gbps = (total_bytes / 1e9) / (cuda_naive_time_us / 1e6) # Print results print("\nResults:") @@ -186,19 +207,19 @@ def test_kernel(): f"{'Triton Naive':<25} {triton_naive_time_us:<15.2f} {triton_naive_bw_gbps:<20.2f} {'1.00x':<10}" ) print( - f"{'Triton Parallel':<25} {triton_parallel_time_us:<15.2f} {triton_parallel_bw_gbps:<20.2f} {triton_naive_time_us / triton_parallel_time_us:<10.2f}x" + f"{'CUDA Naive':<25} {cuda_naive_time_us:<15.2f} {cuda_naive_bw_gbps:<20.2f} {triton_naive_time_us / cuda_naive_time_us:<10.2f}x" ) print( - f"{'CUDA (Optimized)':<25} {cuda_time_us:<15.2f} {cuda_bw_gbps:<20.2f} {triton_naive_time_us / cuda_time_us:<10.2f}x" + f"{'CUDA Parallel':<25} 
{cuda_parallel_time_us:<15.2f} {cuda_parallel_bw_gbps:<20.2f} {triton_naive_time_us / cuda_parallel_time_us:<10.2f}x" ) print() # Highlight best performer - best_bw = max(triton_naive_bw_gbps, triton_parallel_bw_gbps, cuda_bw_gbps) - if cuda_bw_gbps == best_bw: - print("🏆 CUDA kernel achieves highest memory bandwidth!") - elif triton_parallel_bw_gbps == best_bw: - print("🏆 Triton parallel kernel achieves highest memory bandwidth!") + best_bw = max(triton_naive_bw_gbps, cuda_naive_bw_gbps, cuda_parallel_bw_gbps) + if cuda_parallel_bw_gbps == best_bw: + print("🏆 CUDA parallel kernel achieves highest memory bandwidth!") + elif cuda_naive_bw_gbps == best_bw: + print("🏆 CUDA naive kernel achieves highest memory bandwidth!") else: print("🏆 Triton naive kernel achieves highest memory bandwidth!") From f838d7bac7450bed8449aa91d310c650ed4b968d Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre Date: Sun, 7 Dec 2025 19:30:30 -0800 Subject: [PATCH 4/4] remove triton parallel and cuda naive impls --- ...h_triton_mx_block_rearrange_2d_K_groups.py | 29 +-- .../mx_block_rearrange_2d_K_groups.cu | 134 +---------- .../csrc/cuda/mx_kernels/mxfp8_extension.cpp | 77 ------- .../test_mx_block_rearrange_standalone.py | 65 ++---- .../moe_training/kernels/mxfp8/quant.py | 216 ++---------------- 5 files changed, 53 insertions(+), 468 deletions(-) diff --git a/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py index f473166e4b..d782b01f4d 100644 --- a/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py +++ b/benchmarks/prototype/moe_training/mxfp8/bench_triton_mx_block_rearrange_2d_K_groups.py @@ -15,11 +15,8 @@ from tqdm import tqdm from benchmarks.utils import benchmark_cuda_function_in_microseconds -from torchao.prototype.moe_training.kernels.mxfp8 import ( - triton_mx_block_rearrange_2d_K_groups, -) from torchao.prototype.moe_training.kernels.mxfp8.quant import ( - triton_mx_block_rearrange_2d_K_groups_naive, + triton_mx_block_rearrange_2d_K_groups, ) from torchao.prototype.moe_training.utils import generate_jagged_offs @@ -101,7 +98,7 @@ def get_configs() -> List[ExperimentConfig]: (2048, 131072 // block_size), ] num_groups = [8] - versions = ["triton_naive", "triton_parallel", "cuda_parallel", "cuda_naive"] + versions = ["triton", "cuda_parallel"] configs = [] for shape, groups, version in itertools.product( @@ -138,18 +135,12 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult: input_group_offsets = generate_jagged_offs(num_groups, Kg, multiple_of=block_size) # Select which kernel to benchmark based on version - if version == "triton_naive": - kernel_fn = triton_mx_block_rearrange_2d_K_groups_naive - elif version == "triton_parallel": + if version == "triton": kernel_fn = triton_mx_block_rearrange_2d_K_groups elif version == "cuda_parallel": if mxfp8_cuda is None: raise RuntimeError("CUDA kernel not available") kernel_fn = mxfp8_cuda.mx_block_rearrange_2d_K_groups - elif version == "cuda_naive": - if mxfp8_cuda is None: - raise RuntimeError("CUDA kernel not available") - kernel_fn = mxfp8_cuda.mx_block_rearrange_2d_K_groups_naive else: raise ValueError(f"Unknown version: {version}") @@ -197,7 +188,7 @@ def print_results(experiments: List[Experiment]): "time_us", "mem_bw_gbps", "fastest_version", - "speedup_vs_triton_naive", + "speedup_vs_triton", ] rows = [] @@ -205,17 +196,17 @@ def print_results(experiments: List[Experiment]): # Find 
fastest version for this shape
         fastest_version = min(versions.items(), key=lambda x: x[1].time_us)[0]
 
-        # Get naive baseline time for speedup calculation
-        naive_time_us = (
-            versions.get("triton_naive").time_us if "triton_naive" in versions else None
+        # Get triton baseline time for speedup calculation
+        triton_time_us = (
+            versions.get("triton").time_us if "triton" in versions else None
         )
 
         # Add rows for each version
         for version, result in versions.items():
-            # Calculate speedup vs naive
+            # Calculate speedup vs triton
             speedup_str = ""
-            if naive_time_us and naive_time_us > 0:
-                speedup = naive_time_us / result.time_us
+            if version != "triton" and triton_time_us is not None and triton_time_us > 0:
+                speedup = triton_time_us / result.time_us
                 speedup_str = f"{speedup:.2f}x"
 
             rows.append(
diff --git a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
index a904e472ed..6d36455528 100644
--- a/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
+++ b/torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_K_groups.cu
@@ -66,95 +66,7 @@ __device__ __forceinline__ int compute_swizzled_index(int row, int col) {
   int r_mod_32 = row % 32;
   return r_mod_32 * 16 + r_div_32 * 4 + col;
 }
-__global__ void mx_block_rearrange_2d_K_groups_naive_kernel(
-    const uint8_t* __restrict__ scales_ptr,
-    int scales_stride_dim0,
-    int scale_rows,
-    int scale_cols,
-    int padded_rows,
-    const int32_t* __restrict__ input_group_end_offsets,
-    uint8_t* __restrict__ output_scales_ptr,
-    int output_stride_per_block,
-    int num_groups
-) {
-  const int group_id = blockIdx.x;
-  const int block_row_id = blockIdx.y;
-  const int tid = threadIdx.x;
-
-  __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS];
-
-  int input_group_start_col = (group_id > 0) ? input_group_end_offsets[group_id - 1] : 0;
-  int input_group_end_col = input_group_end_offsets[group_id];
-  int num_cols_in_group = input_group_end_col - input_group_start_col;
-
-  int output_group_start_col = compute_output_group_start_col(
-      group_id, input_group_end_offsets, num_groups, 4);
-
-  int out_group_base_offset = output_group_start_col * padded_rows;
-
-  int num_col_blocks_in_group = ceil_div(num_cols_in_group, BLOCK_COLS);
-  int stride_per_row_of_blocks_in_group = num_col_blocks_in_group * output_stride_per_block;
-
-  int input_row = block_row_id * BLOCK_ROWS + tid;
-
-  int curr_input_start_col = input_group_start_col;
-  int curr_out_col_block = 0;
-
-  while (curr_input_start_col < input_group_end_col) {
-    int cols_remaining = input_group_end_col - curr_input_start_col;
-    int cols_to_load = min(BLOCK_COLS, cols_remaining);
-
-    uint32_t row_data = 0;
-    if (input_row < scale_rows && curr_input_start_col < input_group_end_col) {
-      int input_offset = input_row * scales_stride_dim0 + curr_input_start_col;
-      const uint8_t* input_ptr = scales_ptr + input_offset;
-
-      uintptr_t ptr_addr = reinterpret_cast<uintptr_t>(input_ptr);
-      if (cols_to_load >= 4 && ptr_addr % 4 == 0 && curr_input_start_col + 4 <= input_group_end_col) {
-        row_data = __ldg(reinterpret_cast<const uint32_t*>(input_ptr));
-      } else {
-        uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
-        for (int i = 0; i < cols_to_load && (curr_input_start_col + i) < input_group_end_col; i++) {
-          row_bytes[i] = __ldg(input_ptr + i);
-        }
-      }
-    }
-
-    uint8_t* row_bytes = reinterpret_cast<uint8_t*>(&row_data);
-    #pragma unroll
-    for (int col = 0; col < BLOCK_COLS; col++) {
-      int swizzled_idx = compute_swizzled_index(tid, col);
-      smem_block[swizzled_idx] = row_bytes[col];
-    }
-
-    __syncthreads();
-
-    int offset_in_group = block_row_id * stride_per_row_of_blocks_in_group +
-                          curr_out_col_block * output_stride_per_block;
-    int final_offset = out_group_base_offset + offset_in_group;
-
-    uint8_t* output_ptr = output_scales_ptr + final_offset + tid * BLOCK_COLS;
-    uintptr_t out_ptr_addr = reinterpret_cast<uintptr_t>(output_ptr);
-
-    if (out_ptr_addr % 4 == 0 && cols_to_load >= 4) {
-      *reinterpret_cast<uint32_t*>(output_ptr) =
-          *reinterpret_cast<const uint32_t*>(&smem_block[tid * BLOCK_COLS]);
-    } else {
-      const uint8_t* smem_ptr = &smem_block[tid * BLOCK_COLS];
-      #pragma unroll
-      for (int i = 0; i < cols_to_load; i++) {
-        output_ptr[i] = smem_ptr[i];
-      }
-    }
-    curr_input_start_col += BLOCK_COLS;
-    curr_out_col_block += 1;
-
-    if (curr_input_start_col < input_group_end_col) {
-      __syncthreads();
-    }
-  }
-}
 __global__ void mx_block_rearrange_2d_K_groups_parallel_kernel(
     const uint8_t* __restrict__ scales_ptr,
     int scales_stride_dim0,
@@ -172,6 +84,7 @@ __global__ void mx_block_rearrange_2d_K_groups_parallel_kernel(
 
   __shared__ __align__(16) uint8_t smem_block[BLOCK_ROWS * BLOCK_COLS];
   __shared__ int smem_cumsum[32];
+  __shared__ int output_group_start_col;
 
   int group_id, local_col_block;
   find_group_and_local_offset(
@@ -191,6 +104,12 @@
     return;
   }
 
+  if (tid == 0) {
+    output_group_start_col = compute_output_group_start_col(
+        group_id, input_group_end_offsets, num_groups, 4
+    );
+  }
+
   int input_row = row_block_pid * BLOCK_ROWS + tid;
   int cols_remaining = input_group_end_col - curr_input_start_col;
   int cols_to_load = min(BLOCK_COLS, cols_remaining);
@@ -220,9 +139,6 @@
 
   __syncthreads();
 
-  int output_group_start_col = compute_output_group_start_col(
-      group_id, input_group_end_offsets, num_groups, 4
-  );
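// [Editor's note, illustrative only] Worked example of the output addressing
// below, assuming BLOCK_ROWS = 128 and BLOCK_COLS = 4, so each swizzled block
// occupies output_stride_per_block = 128 * 4 = 512 bytes: with
// padded_rows = 256 and a group whose swizzled region starts at output
// column 8, out_group_base_offset = 8 * 256 = 2048. A block at
// row_block_pid = 1, local_col_block = 2 in a group spanning 3 column blocks
// then starts at byte 2048 + 1 * (3 * 512) + 2 * 512 = 4608.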
   int out_group_base_offset = output_group_start_col * padded_rows;
 
   int num_cols_in_group = input_group_end_col - input_group_start_col;
@@ -247,42 +163,8 @@
     }
   }
 }
-namespace mxfp8 {
-void launch_mx_block_rearrange_2d_K_groups_naive(
-    const uint8_t* scales_ptr,
-    int scales_stride_dim0,
-    int scale_rows,
-    int scale_cols,
-    int padded_rows,
-    const int32_t* input_group_end_offsets,
-    uint8_t* output_scales_ptr,
-    int num_groups,
-    cudaStream_t stream
-) {
-  int num_row_blocks = (scale_rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
-  int output_stride_per_block = BLOCK_ROWS * BLOCK_COLS;
-
-  dim3 grid(num_groups, num_row_blocks);
-  dim3 block(128);
-
-  mx_block_rearrange_2d_K_groups_naive_kernel<<<grid, block, 0, stream>>>(
-      scales_ptr,
-      scales_stride_dim0,
-      scale_rows,
-      scale_cols,
-      padded_rows,
-      input_group_end_offsets,
-      output_scales_ptr,
-      output_stride_per_block,
-      num_groups
-  );
-
-  cudaError_t err = cudaGetLastError();
-  if (err != cudaSuccess) {
-    printf("CUDA Error (naive): %s\n", cudaGetErrorString(err));
-  }
-}
+namespace mxfp8 {
 void launch_mx_block_rearrange_2d_K_groups(
     const uint8_t* scales_ptr,
     int scales_stride_dim0,
diff --git a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
index 2690ba1449..7fba7da2d9 100644
--- a/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
+++ b/torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp
@@ -36,17 +36,6 @@ void launch_mx_block_rearrange_2d_K_groups(
     int num_groups,
     cudaStream_t stream);
 
-void launch_mx_block_rearrange_2d_K_groups_naive(
-    const uint8_t* scales_ptr,
-    int scales_stride_dim0,
-    int scale_rows,
-    int scale_cols,
-    int padded_rows,
-    const int32_t* input_group_end_offsets,
-    uint8_t* output_scales_ptr,
-    int num_groups,
-    cudaStream_t stream);
-
 // Helper for tensor validation
 void check_cuda_tensor(const torch::Tensor &t, const char *name) {
   TORCH_CHECK(t.is_cuda(), name, " must be a CUDA tensor");
@@ -259,66 +248,6 @@ torch::Tensor mx_block_rearrange_2d_K_groups(
   return output;
 }
 
-// Python wrapper for mx_block_rearrange_2d_K_groups_naive
-torch::Tensor mx_block_rearrange_2d_K_groups_naive(
-    torch::Tensor scales_tensor,
-    torch::Tensor input_group_end_offsets) {
-
-  // Validate inputs
-  check_cuda_tensor(scales_tensor, "scales_tensor");
-  check_cuda_tensor(input_group_end_offsets, "input_group_end_offsets");
-
-  TORCH_CHECK(scales_tensor.dim() == 2, "scales_tensor must be 2D");
-  TORCH_CHECK(scales_tensor.scalar_type() == torch::kUInt8 ||
-                  scales_tensor.scalar_type() == torch::kFloat8_e8m0fnu,
-              "scales_tensor must be uint8 or e8m0");
-  TORCH_CHECK(input_group_end_offsets.scalar_type() == torch::kInt32,
-              "input_group_end_offsets must be int32");
-  TORCH_CHECK(input_group_end_offsets.dim() == 1,
-              "input_group_end_offsets must be 1D");
-
-  c10::cuda::CUDAGuard device_guard(scales_tensor.device());
-
-  const int rows = scales_tensor.size(0);
-  const int cols = scales_tensor.size(1);
-  const int num_groups = input_group_end_offsets.size(0);
-  TORCH_CHECK(num_groups <= 32, "num_groups must be <= 32");
-
-  // Calculate blocks needed
-  const int BLOCK_ROWS = 128;
-  const int BLOCK_COLS = 4;
-  const int num_row_blocks = (rows + BLOCK_ROWS - 1) / BLOCK_ROWS;
-  const int padded_rows = num_row_blocks * BLOCK_ROWS;
-
-  // Padding per group is variable/data dependent, so pad each group by upper bound
-  const int padded_cols = cols + num_groups * BLOCK_COLS;
-
-  // Create output tensor
-  auto output = torch::zeros({padded_rows, padded_cols},
-                             torch::TensorOptions()
-                                 .dtype(scales_tensor.scalar_type())
-                                 .device(scales_tensor.device()));
-
-  // Get raw pointers
-  const uint8_t* scales_ptr = scales_tensor.data_ptr<uint8_t>();
-  const int32_t* offsets_ptr = input_group_end_offsets.data_ptr<int32_t>();
-  uint8_t* output_ptr = output.data_ptr<uint8_t>();
-
-  // Launch naive kernel (original with while loop)
-  launch_mx_block_rearrange_2d_K_groups_naive(
-      scales_ptr,
-      scales_tensor.stride(0),
-      rows,
-      cols,
-      padded_rows,
-      offsets_ptr,
-      output_ptr,
-      num_groups,
-      at::cuda::getCurrentCUDAStream());
-
-  return output;
-}
-
 } // namespace mxfp8
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
@@ -340,10 +269,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         "Rearrange E8M0 scales to block-scaled swizzle format (parallelized)",
         py::arg("scales_tensor"),
         py::arg("input_group_end_offsets"));
-
-  m.def("mx_block_rearrange_2d_K_groups_naive",
-        &mxfp8::mx_block_rearrange_2d_K_groups_naive,
-        "Rearrange E8M0 scales to block-scaled swizzle format (naive version)",
-        py::arg("scales_tensor"),
-        py::arg("input_group_end_offsets"));
 }
diff --git a/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
index 764d9953da..00733d60a7 100644
--- a/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
+++ b/torchao/csrc/cuda/mx_kernels/test_mx_block_rearrange_standalone.py
@@ -69,7 +69,7 @@ def test_kernel():
     sys.path.insert(0, ao_root)
 
     from torchao.prototype.moe_training.kernels.mxfp8.quant import (
-        triton_mx_block_rearrange_2d_K_groups_naive,
+        triton_mx_block_rearrange_2d_K_groups,
     )
     from torchao.prototype.moe_training.utils import generate_jagged_offs
     from torchao.prototype.mx_formats.mx_tensor import to_mx
@@ -122,46 +122,26 @@ def test_kernel():
     )
     print("✓ CUDA parallel kernel completed successfully")
 
-    # Test CUDA naive kernel
-    print("\n" + "-" * 80)
-    print("Running CUDA naive kernel...")
-    cuda_naive_out_scales = mx_block_rearrange.mx_block_rearrange_2d_K_groups_naive(
-        e8m0_scales.view(torch.uint8),
-        scale_group_offsets,
-    )
-    print("✓ CUDA naive kernel completed successfully")
-
     output_bytes = cuda_parallel_out_scales.numel() * bytes_per_element
     total_bytes = input_bytes + output_bytes
 
-    # Compare with Triton naive reference
+    # Compare with Triton reference
     print("\n" + "-" * 80)
-    print("Running Triton naive reference kernel...")
-    triton_naive_out = triton_mx_block_rearrange_2d_K_groups_naive(
+    print("Running Triton reference kernel...")
+    triton_out = triton_mx_block_rearrange_2d_K_groups(
         e8m0_scales,
         scale_group_offsets,
     )
-    print("✓ Triton naive kernel completed successfully")
+    print("✓ Triton kernel completed successfully")
 
     # Verify correctness
     cuda_parallel_out_e8m0 = cuda_parallel_out_scales.view(torch.float8_e8m0fnu)
-    cuda_naive_out_e8m0 = cuda_naive_out_scales.view(torch.float8_e8m0fnu)
 
     print("\nVerifying correctness...")
-    if not torch.equal(triton_naive_out, cuda_naive_out_e8m0):
-        print("✗ CUDA naive and Triton naive outputs differ!")
-        return False
-    print("✓ CUDA naive matches Triton naive")
-
-    if not torch.equal(triton_naive_out, cuda_parallel_out_e8m0):
-        print("✗ CUDA parallel and Triton naive outputs differ!")
+    if not torch.equal(triton_out, cuda_parallel_out_e8m0):
+        print("✗ CUDA parallel and Triton outputs differ!")
         return False
-    print("✓ CUDA parallel matches Triton naive")
-
-    if not torch.equal(cuda_naive_out_e8m0, cuda_parallel_out_e8m0):
-        print("✗ CUDA naive and CUDA parallel outputs differ!")
         return False
-    print("✓ CUDA naive and 
CUDA parallel match each other") + print("✓ CUDA parallel matches Triton") print("\n✓ All outputs are IDENTICAL!") @@ -172,13 +152,13 @@ def test_kernel(): print("\nBenchmarking kernels (100 iterations each)...") - # Benchmark Triton naive - triton_naive_time_us = benchmark_kernel( - triton_mx_block_rearrange_2d_K_groups_naive, + # Benchmark Triton + triton_time_us = benchmark_kernel( + triton_mx_block_rearrange_2d_K_groups, e8m0_scales, scale_group_offsets, ) - triton_naive_bw_gbps = (total_bytes / 1e9) / (triton_naive_time_us / 1e6) + triton_bw_gbps = (total_bytes / 1e9) / (triton_time_us / 1e6) # Benchmark CUDA parallel (optimized) cuda_parallel_time_us = benchmark_kernel( @@ -188,14 +168,6 @@ def test_kernel(): ) cuda_parallel_bw_gbps = (total_bytes / 1e9) / (cuda_parallel_time_us / 1e6) - # Benchmark CUDA naive - cuda_naive_time_us = benchmark_kernel( - mx_block_rearrange.mx_block_rearrange_2d_K_groups_naive, - e8m0_scales.view(torch.uint8), - scale_group_offsets, - ) - cuda_naive_bw_gbps = (total_bytes / 1e9) / (cuda_naive_time_us / 1e6) - # Print results print("\nResults:") print(f" Input size: {input_bytes / 1e6:.2f} MB") @@ -204,24 +176,19 @@ def test_kernel(): print(f"{'Kernel':<25} {'Time (μs)':<15} {'Bandwidth (GB/s)':<20} {'Speedup':<10}") print("-" * 70) print( - f"{'Triton Naive':<25} {triton_naive_time_us:<15.2f} {triton_naive_bw_gbps:<20.2f} {'1.00x':<10}" - ) - print( - f"{'CUDA Naive':<25} {cuda_naive_time_us:<15.2f} {cuda_naive_bw_gbps:<20.2f} {triton_naive_time_us / cuda_naive_time_us:<10.2f}x" + f"{'Triton':<25} {triton_time_us:<15.2f} {triton_bw_gbps:<20.2f} {'1.00x':<10}" ) print( - f"{'CUDA Parallel':<25} {cuda_parallel_time_us:<15.2f} {cuda_parallel_bw_gbps:<20.2f} {triton_naive_time_us / cuda_parallel_time_us:<10.2f}x" + f"{'CUDA Parallel':<25} {cuda_parallel_time_us:<15.2f} {cuda_parallel_bw_gbps:<20.2f} {triton_time_us / cuda_parallel_time_us:<10.2f}x" ) print() # Highlight best performer - best_bw = max(triton_naive_bw_gbps, cuda_naive_bw_gbps, cuda_parallel_bw_gbps) + best_bw = max(triton_bw_gbps, cuda_parallel_bw_gbps) if cuda_parallel_bw_gbps == best_bw: print("🏆 CUDA parallel kernel achieves highest memory bandwidth!") - elif cuda_naive_bw_gbps == best_bw: - print("🏆 CUDA naive kernel achieves highest memory bandwidth!") else: - print("🏆 Triton naive kernel achieves highest memory bandwidth!") + print("🏆 Triton kernel achieves highest memory bandwidth!") return True diff --git a/torchao/prototype/moe_training/kernels/mxfp8/quant.py b/torchao/prototype/moe_training/kernels/mxfp8/quant.py index f6a9a045d1..1678d6e711 100644 --- a/torchao/prototype/moe_training/kernels/mxfp8/quant.py +++ b/torchao/prototype/moe_training/kernels/mxfp8/quant.py @@ -238,9 +238,9 @@ def triton_mx_block_rearrange_2d_M_groups( - Rearranged tensor in block-scaled swizzle format """ assert scales_tensor.ndim == 2, "scales tensor must be 2d" - assert scales_tensor.element_size() == 1, ( - "Expected element size to be 1 byte (8 bits)" - ) + assert ( + scales_tensor.element_size() == 1 + ), "Expected element size to be 1 byte (8 bits)" rows, cols = scales_tensor.shape num_groups = input_group_end_offsets.shape[0] @@ -380,9 +380,9 @@ def triton_mx_block_rearrange_per_group_3d(scale_tensor: torch.Tensor) -> torch. 
Tensor:
     Rearranged tensor in block-scaled swizzle format
     """
     assert scale_tensor.ndim == 3, "scales tensor must be 3d"
-    assert scale_tensor.element_size() == 1, (
-        "Expected element size to be 1 byte (8 bits)"
-    )
+    assert (
+        scale_tensor.element_size() == 1
+    ), "Expected element size to be 1 byte (8 bits)"
 
     num_groups, rows, cols = scale_tensor.shape
     input_stride_dim0 = scale_tensor.stride(0)
@@ -484,12 +484,12 @@ def triton_scale_swizzle_per_group_3d(
     )
 
 
-def triton_mx_block_rearrange_2d_K_groups_naive(
+@triton_op("torchao::triton_mx_block_rearrange_2d_K_groups", mutates_args={})
+def triton_mx_block_rearrange_2d_K_groups(
     scales_tensor: torch.Tensor,
     input_group_end_offsets: torch.Tensor,
 ) -> torch.Tensor:
     """
-    Naive version with while loop (before optimization).
     Rearranges an E8M0 tensor scale to block-scaled swizzle format on a per group basis,
     where the groups are along the contraction dimension of the GEMM.
@@ -503,9 +503,9 @@ def triton_mx_block_rearrange_2d_K_groups_naive(
     - Rearranged tensor in block-scaled swizzle format
     """
     assert scales_tensor.ndim == 2, "scales tensor must be 2d"
-    assert scales_tensor.element_size() == 1, (
-        "Expected element size to be 1 byte (8 bits)"
-    )
+    assert (
+        scales_tensor.element_size() == 1
+    ), "Expected element size to be 1 byte (8 bits)"
     rows, cols = scales_tensor.shape
     # Calculate blocks needed
     num_groups = input_group_end_offsets.shape[0]
@@ -545,186 +545,6 @@
     return output
 
 
-@triton_op("torchao::triton_mx_block_rearrange_2d_K_groups", mutates_args={})
-def triton_mx_block_rearrange_2d_K_groups(
-    scales_tensor: torch.Tensor,
-    input_group_end_offsets: torch.Tensor,
-) -> torch.Tensor:
-    """
-    Parallel version (parallelized over column blocks).
-    Rearranges an E8M0 tensor scale to block-scaled swizzle format on a per group basis,
-    where the groups are along the contraction dimension of the GEMM.
-
-    This format is suitable for Tmem as described in NVIDIA documentation:
-    https://docs.nvidia.com/cuda/cublas/index.html#d-block-scaling-factors-layout
-
-    Args:
-        scales_tensor: Input tensor containing e8m0 scales for each logical group of a target tensor.
-        input_group_end_offsets: tensor of int32 values representing group end indexes for the input scales
-    Returns:
-        - Rearranged tensor in block-scaled swizzle format
-    """
-    assert scales_tensor.ndim == 2, "scales tensor must be 2d"
-    assert scales_tensor.element_size() == 1, (
-        "Expected element size to be 1 byte (8 bits)"
-    )
-    rows, cols = scales_tensor.shape
-    # Calculate blocks needed
-    num_groups = input_group_end_offsets.shape[0]
-    num_row_blocks = ceil_div(rows, 128)
-    padded_rows = num_row_blocks * 128
-
-    # Padding needed per group is variable/data dependent, so we just pad each group by
-    # the upper bound of 4 cols to avoid a d2h sync caused by iterating over each group.
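# [Editor's note, illustrative only] Why cols + num_groups * 4 is a safe bound:
# rounding each group up to a multiple of BLOCK_COLS = 4 adds at most 3 columns
# per group. E.g. cols = 70 split across num_groups = 2 groups as (35, 35)
# rounds each group up to 36 columns, 72 total <= 70 + 2 * 4 = 78; the unused
# zero-initialized tail of the output is left untouched.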
- padded_cols = cols + num_groups * 4 - output = scales_tensor.new_zeros((padded_rows, padded_cols)) - - # Output block stride for the rearranged format - BLOCK_ROWS, BLOCK_COLS = 128, 4 - output_stride_per_block = BLOCK_ROWS * BLOCK_COLS - - # Calculate column blocks for the ORIGINAL input tensor (before padding) - # Simply divide the number of columns by BLOCK_COLS - total_col_blocks = (cols + BLOCK_COLS - 1) // BLOCK_COLS - - # Compute per-group column block counts on GPU for the kernel to use - zero = torch.zeros( - 1, dtype=input_group_end_offsets.dtype, device=scales_tensor.device - ) - group_sizes = torch.diff(input_group_end_offsets, prepend=zero) - group_col_block_counts = (group_sizes + BLOCK_COLS - 1) // BLOCK_COLS - - # We parallelize over all column blocks across all groups and row blocks - grid = lambda META: ( - total_col_blocks, - num_row_blocks, - ) - wrap_triton(triton_scale_swizzle_2d_K_groups_parallel)[grid]( - scales_tensor.view(torch.uint8), - scales_tensor.stride(0), - scales_tensor.stride(1), - rows, - cols, - padded_rows, - input_group_end_offsets, - group_col_block_counts, - output.view(torch.uint8), - output_stride_per_block, - num_groups=num_groups, - BLOCK_ROWS=BLOCK_ROWS, - BLOCK_COLS=BLOCK_COLS, - ) - return output - - -@triton.jit -def triton_scale_swizzle_2d_K_groups_parallel( - scales_ptr, # (M, total_K//block_size) - scales_stride_dim0, - scales_stride_dim1, - scale_rows, - scale_cols, - padded_rows, - orig_offsets, # (num_groups,) - group_col_block_counts, # (num_groups,) - number of column blocks per group - output_scales_ptr, - output_stride_per_block, - num_groups: tl.constexpr, - BLOCK_ROWS: tl.constexpr, - BLOCK_COLS: tl.constexpr, -): - """ - Parallel version that parallelizes over column blocks. - Each thread block processes exactly one (row_block, col_block) pair. - Uses simple linear search to find which group a column block belongs to. 
- """ - col_block_pid = tl.program_id(0) - row_block_pid = tl.program_id(1) - - # Vectorized search to find which group this column block belongs to - # Load all group block counts at once - group_indices = tl.arange(0, num_groups) - all_block_counts = tl.load(group_col_block_counts + group_indices) - - # Compute cumulative sums to get start/end positions of each group - # cumsum_inclusive[i] = total blocks from group 0 to i (inclusive) - cumsum_inclusive = tl.cumsum(all_block_counts, axis=0) - - # cumsum_exclusive[i] = total blocks before group i (exclusive) - # For i > 0: cumsum_exclusive[i] = cumsum_inclusive[i] - all_block_counts[i] - # For i == 0: cumsum_exclusive[i] = 0 - cumsum_exclusive = tl.where( - group_indices > 0, cumsum_inclusive - all_block_counts, 0 - ) - - # Find which group col_block_pid belongs to - # A block belongs to group i if: cumsum_exclusive[i] <= block_id < cumsum_inclusive[i] - is_in_group = (col_block_pid >= cumsum_exclusive) & ( - col_block_pid < cumsum_inclusive - ) - - # Extract the group_pid (sum of indices where condition is true) - group_pid = tl.sum(tl.where(is_in_group, group_indices, 0)) - - # Extract the local column block offset within the group - local_col_block = tl.sum(tl.where(is_in_group, col_block_pid - cumsum_exclusive, 0)) - - # Load group offset boundaries - input_group_start_col = tl.load( - orig_offsets + group_pid - 1, mask=group_pid > 0, other=0 - ) - input_group_end_col = tl.load(orig_offsets + group_pid) - - # Compute input column offset for this specific column block - curr_input_start_col = input_group_start_col + local_col_block * BLOCK_COLS - - # Early exit if beyond group boundary - if curr_input_start_col >= input_group_end_col: - return - - # Calculate this group's start col after blocked format padding - output_group_start_col = _blocked_group_start_idx( - group_pid, orig_offsets, num_groups, 4 - ) - - row_offs = tl.arange(0, BLOCK_ROWS)[:, None] - col_offs = tl.arange(0, BLOCK_COLS)[None, :] - - # Read block of input scales - block_row_offs = row_block_pid * BLOCK_ROWS + row_offs - block_col_offs = curr_input_start_col + col_offs - block_offs = ( - block_row_offs * scales_stride_dim0 + block_col_offs * scales_stride_dim1 - ) - mask = (block_row_offs < scale_rows) & (block_col_offs < input_group_end_col) - input_scales = tl.load(scales_ptr + block_offs, mask=mask, other=0.0) - scales_flat = tl.reshape(input_scales, (BLOCK_ROWS * BLOCK_COLS)) - - # Compute output offset - out_group_base_offset = output_group_start_col * padded_rows - - num_cols_in_group = input_group_end_col - input_group_start_col - num_col_blocks_in_group = tl.cdiv(num_cols_in_group, BLOCK_COLS) - stride_per_row_of_blocks_in_group = ( - num_col_blocks_in_group * output_stride_per_block - ) - - offset_in_group = ( - row_block_pid * stride_per_row_of_blocks_in_group - + local_col_block * output_stride_per_block - ) - final_offset = out_group_base_offset + offset_in_group - - # Apply swizzling and write - dest_indices_flat = _dest_indices_for_block( - row_offs, col_offs, BLOCK_ROWS, BLOCK_COLS - ) - tl.store( - output_scales_ptr + final_offset + dest_indices_flat, - scales_flat, - ) - - @triton.jit def triton_scale_swizzle_2d_K_groups( scales_ptr, # (M, total_K//block_size) @@ -887,9 +707,10 @@ def mxfp8_quantize_cuda_3d( torch.Tensor: scales tensor """ assert x.ndim == 3, "Input tensor must be 3D" - assert x.dtype in (torch.float32, torch.bfloat16), ( - "Input tensor must be float32 or bfloat16" - ) + assert x.dtype in ( + torch.float32, + torch.bfloat16, + ), 
"Input tensor must be float32 or bfloat16" q_data, scales = mxfp8_cuda.quantize_3d( x, scale_dim_n=block_size, scaling_mode=scaling_mode ) @@ -902,9 +723,10 @@ def _fake_mxfp8_quantize_cuda_3d( scaling_mode: str = "floor", ) -> Tuple[torch.Tensor, torch.Tensor]: assert x.ndim == 3, "Input tensor must be 3D" - assert x.dtype in (torch.float32, torch.bfloat16), ( - "Input tensor must be float32 or bfloat16" - ) + assert x.dtype in ( + torch.float32, + torch.bfloat16, + ), "Input tensor must be float32 or bfloat16" E, N, K = x.shape # Quantized tensor is in column major layouts q_data = x.new_empty(x.shape, dtype=torch.float8_e4m3fn).as_strided(