From 48cdb61d8fd95526bcec79b86c4d86fbc0db1645 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Mon, 1 Dec 2025 12:55:52 -0800
Subject: [PATCH 01/19] Int8Tensor migration

Summary: This PR creates a new Int8Tensor and updates the configs to use the new Int8Tensor flow.

Test Plan:

To ensure BC:
```
pytest test/quantization/test_quant_api.py
```

To test new Int8Tensor:
```
pytest test/quantization/quantize_/workflows/int8/test_int8_tensor.py
```
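Example usage (a minimal sketch; the toy module and shapes are illustrative assumptions, while `quantize_`, the config, and the `version=2` flag are the APIs this patch series adds):

```
import torch
from torchao.quantization import quantize_, Int8WeightOnlyConfig

# version=2 routes quantization through the new Int8Tensor subclass
m = torch.nn.Sequential(
    torch.nn.Linear(256, 512, dtype=torch.bfloat16, device="cuda")
)
quantize_(m, Int8WeightOnlyConfig(version=2))

# the quantized weight now holds int8 qdata plus a scale tensor
assert m[0].weight.qdata.dtype == torch.int8
x = torch.randn(32, 256, dtype=torch.bfloat16, device="cuda")
y = m(x)
```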
Reviewers:

Subscribers:

Tasks:

Tags:
---
 .../workflows/int8/test_int8_tensor.py      | 217 +++++++++++
 test/quantization/test_quant_api.py         |  15 +-
 torchao/quantization/__init__.py            |   2 +
 torchao/quantization/quant_api.py           | 137 ++++---
 .../quantize_/workflows/__init__.py         |   4 +
 .../quantize_/workflows/int8/int8_tensor.py | 351 ++++++++++++++++++
 6 files changed, 662 insertions(+), 64 deletions(-)
 create mode 100644 test/quantization/quantize_/workflows/int8/test_int8_tensor.py
 create mode 100644 torchao/quantization/quantize_/workflows/int8/int8_tensor.py

diff --git a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
new file mode 100644
index 0000000000..ed150b1ff1
--- /dev/null
+++ b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -0,0 +1,217 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import unittest
+
+import torch
+from torch._inductor.utils import run_and_get_code
+from torch.testing import FileCheck
+from torch.testing._internal import common_utils
+
+from torchao.quantization import (
+    Int8DynamicActivationInt8WeightConfig,
+    Int8WeightOnlyConfig,
+    quantize_,
+)
+from torchao.quantization.granularity import PerRow, PerTensor
+from torchao.quantization.utils import compute_error, get_block_size
+from torchao.testing.model_architectures import ToyTwoLinearModel
+from torchao.testing.utils import TorchAOIntegrationTestCase
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@common_utils.instantiate_parametrized_tests
+class TestInt8Tensor(TorchAOIntegrationTestCase):
+    def setUp(self):
+        super().setUp()
+
+        self.test_shape = (32, 20)
+        self.dtype = torch.bfloat16
+        self.batch_size = 32
+
+        torch.manual_seed(42)
+
+    @common_utils.parametrize(
+        "config",
+        [
+            Int8DynamicActivationInt8WeightConfig(version=2),
+            Int8WeightOnlyConfig(version=2),
+        ],
+    )
+    def test_creation_and_attributes(self, config):
+        """Test tensor creation, dtypes, and ranges"""
+        linear = torch.nn.Linear(
+            self.test_shape[1],
+            self.test_shape[0],
+            bias=False,
+            dtype=self.dtype,
+            device="cuda",
+        )
+        quantize_(linear, config)
+
+        w = linear.weight
+
+        self.assertEqual(w.shape, self.test_shape)
+        self.assertEqual(w.qdata.dtype, torch.int8)
+        self.assertTrue(torch.all(w.qdata >= -128) and torch.all(w.qdata <= 127))
+
+    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32])
+    @common_utils.parametrize("compile", [True, False])
+    @common_utils.parametrize(
+        "config",
+        [
+            Int8DynamicActivationInt8WeightConfig(version=2),
+            Int8WeightOnlyConfig(version=2),
+        ],
+    )
+    @common_utils.parametrize(
+        "sizes",
+        [
+            ((128,), 256, 128),  # 2D
+            ((32, 128), 64, 256),  # 3D
+        ],
+    )
+    def test_int8_linear_variants(
+        self,
+        dtype: torch.dtype,
+        config,
+        compile: bool,
+        sizes: tuple,
+    ):
+        """Test linear operation support, including shapes and compile"""
+        M, N, K = sizes
+        input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda")
+        model = ToyTwoLinearModel(K, N, K, dtype=dtype, device="cuda").eval()
+        model_q = copy.deepcopy(model)
+
+        quantize_(model_q, config)
+
+        self.assertEqual(model_q.linear2.weight.scale.shape, (K,))
+        self.assertEqual(model_q.linear2.weight.scale.ndim, 1)
+
+        if compile:
+            model_q = torch.compile(model_q, fullgraph=True)
+
+        output_fp = model(input_tensor)
+        output_quantized = model_q(input_tensor)
+
+        assert compute_error(output_fp, output_quantized) > 20, (
+            f"Quantization error is too high, got a SQNR of {compute_error(output_fp, output_quantized)}"
+        )
+
+    @common_utils.parametrize(
+        "config",
+        [
+            Int8DynamicActivationInt8WeightConfig(version=2),
+            Int8WeightOnlyConfig(version=2),
+        ],
+    )
+    @common_utils.parametrize("device", ["cpu", "cuda"])
+    @common_utils.parametrize("dtype", [torch.bfloat16, torch.float16])
+    def test_slice(self, config, device, dtype):
+        """Test tensor slicing with per-row quantization"""
+        tensor_size = 256
+        slice_sizes = (64, 128)
+
+        dummy = torch.nn.Linear(
+            tensor_size, tensor_size, bias=False, dtype=dtype, device=device
+        )
+        quantize_(dummy, config)
+
+        weight1 = dummy.weight.clone().narrow(0, 0, slice_sizes[0])
+        weight2 = dummy.weight.clone().narrow(1, 0, slice_sizes[1])
+
+        self.assertEqual(weight1.qdata, dummy.weight.qdata.narrow(0, 0, slice_sizes[0]))
+        self.assertEqual(weight2.qdata, dummy.weight.qdata.narrow(1, 0, slice_sizes[1]))
+        self.assertEqual(weight1.scale, dummy.weight.scale.narrow(0, 0, slice_sizes[0]))
+        self.assertEqual(weight2.scale, dummy.weight.scale)
+        with self.assertRaises(NotImplementedError):
+            _ = dummy.weight[::2]
+
+    @common_utils.parametrize(
+        "config",
+        [
+            Int8DynamicActivationInt8WeightConfig,
+            Int8WeightOnlyConfig,
+        ],
+    )
+    @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
+    def test_index_select(self, config, granularity):
+        """Test that `x_0 = x[0]` works when `x` is a 2D quantized tensor."""
+        N, K = 256, 512
+        x = torch.randn(N, K, device="cuda", dtype=torch.bfloat16)
+        linear = torch.nn.Linear(K, N, bias=False, dtype=torch.bfloat16, device="cuda")
+        linear.weight.data = x
+
+        config = config(version=2, granularity=granularity)
+        quantize_(linear, config)
+
+        x_int8 = linear.weight
+        x_int8_0 = x_int8[0]
+
+        # Test dequantization consistency
+        torch.testing.assert_close(
+            x_int8.dequantize()[0], x_int8_0.dequantize(), atol=0, rtol=0
+        )
+
+        # Test block_size granularity
+        if isinstance(granularity, PerRow):
+            self.assertEqual(
+                list(get_block_size(x_int8.shape, config.granularity)), [1, K]
+            )
+        elif isinstance(granularity, PerTensor):
+            self.assertEqual(
+                list(get_block_size(x_int8.shape, config.granularity)), [N, K]
+            )
+
+    @common_utils.parametrize(
+        "config",
+        [
+            Int8DynamicActivationInt8WeightConfig(version=2),
+            Int8WeightOnlyConfig(version=2),
+        ],
+    )
+    def test_dequantization_accuracy(self, config):
+        """Test dequantization accuracy separately"""
+        linear = torch.nn.Linear(
+            256, 512, bias=False, dtype=torch.bfloat16, device="cuda"
+        )
+        weight_fp = copy.deepcopy(linear.weight)
+        quantize_(linear, config)
+
+        tensor = linear.weight
+        dequantized = tensor.dequantize()
+        self.assertEqual(dequantized.shape, weight_fp.shape)
+        assert compute_error(dequantized, weight_fp) > 20, (
+            f"Dequantization error is too high, got a SQNR of {compute_error(dequantized, weight_fp)}"
+        )
+
+    def test_available_gpu_kernels(self):
+        """Check which GPU kernels are used"""
+        torch.compiler.reset()
+
+        M, K, N = 128, 256, 512
+        m = torch.nn.Sequential(
+            torch.nn.Linear(K, N, device="cuda", dtype=torch.bfloat16)
+        )
+
+        config = Int8DynamicActivationInt8WeightConfig(version=2)
+        quantize_(m, config)
+
+        m = torch.compile(m)
+        x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+
+        out, code = run_and_get_code(m, x)
+
+        # Check expected kernels are present
+        FileCheck().check_count("triton_per_fused", 1).check_count(
+            "extern_kernels._int_mm", 1
+        ).check_count("triton_poi_fused", 1).run(code[0])
+
+
+if __name__ == "__main__":
+    common_utils.run_tests()
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
index 5cd81ece90..14d568d22a 100644
--- a/test/quantization/test_quant_api.py
+++ b/test/quantization/test_quant_api.py
@@ -35,6 +35,7 @@
 )
 from torchao.quantization import (
     Float8Tensor,
+    Int8Tensor,
     Int4TilePackedTo4dTensor,
     IntxUnpackedToInt8Tensor,
     LinearActivationQuantizedTensor,
@@ -626,8 +627,7 @@ def test_module_fqn_to_config_default(self):
         model(*example_inputs)
         assert isinstance(model.linear1.weight, AffineQuantizedTensor)
         assert isinstance(model.linear1.weight._layout, TensorCoreTiledLayout)
-        assert isinstance(model.linear2.weight, AffineQuantizedTensor)
-        assert isinstance(model.linear2.weight._layout, PlainLayout)
+        assert isinstance(model.linear2.weight, Int8Tensor)
 
     @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_module_fqn_to_config_module_name(self):
@@ -640,8 +640,7 @@ def test_module_fqn_to_config_module_name(self):
         model(*example_inputs)
         assert isinstance(model.linear1.weight, AffineQuantizedTensor)
         assert isinstance(model.linear1.weight._layout, TensorCoreTiledLayout)
-        assert isinstance(model.linear2.weight, AffineQuantizedTensor)
-        assert isinstance(model.linear2.weight._layout, PlainLayout)
+        assert isinstance(model.linear2.weight, Int8Tensor)
 
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_module_fqn_to_config_regex_basic(self):
@@ -1209,8 +1208,8 @@ def __init__(self):
         )
 
         quantize_(m, quant_config, filter_fn=None)
-        assert isinstance(m.nested.linear.weight, AffineQuantizedTensor)
-        assert isinstance(m.linear1.weight, AffineQuantizedTensor)
+        assert isinstance(m.nested.linear.weight, Int8Tensor)
+        assert isinstance(m.linear1.weight, Int8Tensor)
 
     @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_fqn_config_quantized_nested_module_param(self):
@@ -1234,8 +1233,8 @@ def __init__(self):
         )
 
         quantize_(m, quant_config, filter_fn=None)
-        assert isinstance(m.nested.linear.weight, AffineQuantizedTensor)
-        assert isinstance(m.linear1.weight, AffineQuantizedTensor)
+        assert isinstance(m.nested.linear.weight, Int8Tensor)
+        assert isinstance(m.linear1.weight, Int8Tensor)
 
     def test_fqn_config_module_config_and_fqn_config_both_specified(self):
         with self.assertRaises(ValueError):
diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
index a1ca6b0b94..18988943a7 100644
--- a/torchao/quantization/__init__.py
+++ b/torchao/quantization/__init__.py
@@ -93,6 +93,7 @@
 )
 from .quantize_.workflows import (
     Float8Tensor,
+    Int8Tensor,
     Int4MarlinSparseTensor,
     Int4PlainInt32Tensor,
     Int4PreshuffledTensor,
@@ -164,6 +165,7 @@
     "FqnToConfig",
     "ModuleFqnToConfig",
     # tensor subclasses
+    "Int8Tensor",
     "Int4Tensor",
     "Int4PlainInt32Tensor",
     "Int4PreshuffledTensor",
diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
index 57beefdab6..96aacec6a3 100644
--- a/torchao/quantization/quant_api.py
+++ b/torchao/quantization/quant_api.py
@@ -15,6 +15,7 @@
 and mixed GEMM kernels
 """
 
+from torchao.quantization.quantize_.workflows.int8.int8_tensor import QuantizeTensorToInt8Kwargs
 import logging
 import re
 import types
@@ -81,6 +82,7 @@
     Int4PlainInt32Tensor,
     Int4PreshuffledTensor,
     Int4Tensor,
+    Int8Tensor,
     Int4TilePackedTo4dTensor,
     IntxChooseQParamsAlgorithm,
     IntxOpaqueTensor,
@@ -1332,7 +1334,9 @@ class Int8WeightOnlyConfig(AOBaseConfig):
     """
 
     group_size: Optional[int] = None
+    granularity: Granularity = PerRow()
     set_inductor_config: bool = True
+    version: int = 2
 
     def __post_init__(self):
         torch._C._log_api_usage_once("torchao.quantization.Int8WeightOnlyConfig")
@@ -1343,22 +1347,27 @@ def __post_init__(self):
 
 
 def _int8_weight_only_quantize_tensor(weight, config):
-    mapping_type = MappingType.SYMMETRIC
-    target_dtype = torch.int8
-    eps = torch.finfo(torch.float32).eps
-    zero_point_dtype = torch.int64
-    group_size = config.group_size
-    if group_size is None:
-        group_size = weight.shape[-1]
-    block_size = tuple([1 for x in range(weight.dim() - 1)] + [group_size])
-    new_weight = to_affine_quantized_intx(
-        weight,
-        mapping_type,
-        block_size,
-        target_dtype,
-        eps=eps,
-        zero_point_dtype=zero_point_dtype,
-    )
+    if config.version == 1:
+        mapping_type = MappingType.SYMMETRIC
+        target_dtype = torch.int8
+        eps = torch.finfo(torch.float32).eps
+        zero_point_dtype = torch.int64
+        group_size = config.group_size
+        if group_size is None:
+            group_size = weight.shape[-1]
+        block_size = tuple([1 for x in range(weight.dim() - 1)] + [group_size])
+        new_weight = to_affine_quantized_intx(
+            weight,
+            mapping_type,
+            block_size,
+            target_dtype,
+            eps=eps,
+            zero_point_dtype=zero_point_dtype,
+        )
+        return new_weight
+    else:
+        assert config.version == 2, f"Unexpected version: {config.version}"
+        new_weight = Int8Tensor.from_hp(weight, granularity=config.granularity)
     return new_weight
 
 
@@ -1509,7 +1518,9 @@ class Int8DynamicActivationInt8WeightConfig(AOBaseConfig):
     layout: Optional[Layout] = PlainLayout()
     act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC
     weight_only_decode: bool = False
+    granularity: Optional[Union[PerRow, PerTensor]] = PerRow()
     set_inductor_config: bool = True
+    version: int = 2
 
     def __post_init__(self):
         torch._C._log_api_usage_once(
@@ -1524,52 +1535,66 @@ def __post_init__(self):
 
 
 def _int8_dynamic_activation_int8_weight_quantize_tensor(weight, config):
-    layout = config.layout
-    act_mapping_type = config.act_mapping_type
-    weight_only_decode = config.weight_only_decode
-
-    in_features = weight.shape[-1]
-    # int8 dynamic quantization only has benefit when in_feature > 16
-    if in_features <= 16:
-        logger.info(
-            f"Skipping applying Int8DynamicActivationInt8WeightConfig to weight of shape {weight.shape}"
-            f" because `in_feature` is <= 16: {in_features}"
-        )
-        return weight
+    if config.version == 1:
+        layout = config.layout
+        act_mapping_type = config.act_mapping_type
+        weight_only_decode = config.weight_only_decode
+
+        in_features = weight.shape[-1]
+        # int8 dynamic quantization only has benefit when in_feature > 16
+        if in_features <= 16:
+            logger.info(
+                f"Skipping applying Int8DynamicActivationInt8WeightConfig to weight of shape {weight.shape}"
+                f" because `in_feature` is <= 16: {in_features}"
+            )
+            return weight
 
-    # weight settings
-    mapping_type = MappingType.SYMMETRIC
-    weight_zero_point_domain = ZeroPointDomain.NONE
+        # weight settings
+        mapping_type = MappingType.SYMMETRIC
+        weight_zero_point_domain = ZeroPointDomain.NONE
 
-    def get_weight_block_size(x):
-        return tuple([1 for _ in range(x.dim() - 1)] + [x.shape[-1]])
+        def get_weight_block_size(x):
+            return tuple([1 for _ in range(x.dim() - 1)] + [x.shape[-1]])
 
-    target_dtype = torch.int8
-    eps = torch.finfo(torch.float32).eps
-    zero_point_dtype = torch.int64
+        target_dtype = torch.int8
+        eps = torch.finfo(torch.float32).eps
+        zero_point_dtype = torch.int64
 
-    if weight_only_decode:
-        input_quant_func = _int8_symm_per_token_reduced_range_quant_noop_decode
-    else:
-        # input settings
-        if act_mapping_type == MappingType.SYMMETRIC:
-            input_quant_func = _int8_symm_per_token_reduced_range_quant
+        if weight_only_decode:
+            input_quant_func = _int8_symm_per_token_reduced_range_quant_noop_decode
         else:
-            input_quant_func = _int8_asymm_per_token_quant
+            # input settings
+            if act_mapping_type == MappingType.SYMMETRIC:
+                input_quant_func = _int8_symm_per_token_reduced_range_quant
+            else:
+                input_quant_func = _int8_asymm_per_token_quant
 
-    block_size = get_weight_block_size(weight)
-    new_weight = to_affine_quantized_intx(
-        weight,
-        mapping_type,
-        block_size,
-        target_dtype,
-        eps=eps,
-        zero_point_dtype=zero_point_dtype,
-        _layout=layout,
-        zero_point_domain=weight_zero_point_domain,
-    )
-    new_weight = to_linear_activation_quantized(new_weight, input_quant_func)
-    return new_weight
+        block_size = get_weight_block_size(weight)
+        new_weight = to_affine_quantized_intx(
+            weight,
+            mapping_type,
+            block_size,
+            target_dtype,
+            eps=eps,
+            zero_point_dtype=zero_point_dtype,
+            _layout=layout,
+            zero_point_domain=weight_zero_point_domain,
+        )
+        new_weight = to_linear_activation_quantized(new_weight, input_quant_func)
+        return new_weight
+    else:
+        activation_granularity, weight_granularity = _normalize_granularity(config.granularity)
+        act_quant_kwargs = QuantizeTensorToInt8Kwargs(
+            activation_granularity,
+            # hp_value_lb=activation_value_lb,
+            # hp_value_ub=activation_value_ub,
+        )
+        new_weight = Int8Tensor.from_hp(
+            weight,
+            granularity=weight_granularity,
+            act_quant_kwargs=act_quant_kwargs
+        )
+        return new_weight
 
 
 @register_quantize_module_handler(Int8DynamicActivationInt8WeightConfig)
diff --git a/torchao/quantization/quantize_/workflows/__init__.py b/torchao/quantization/quantize_/workflows/__init__.py
index c1d1ae3f74..8b854bebf3 100644
--- a/torchao/quantization/quantize_/workflows/__init__.py
+++ b/torchao/quantization/quantize_/workflows/__init__.py
@@ -2,6 +2,10 @@
     Float8Tensor,
     QuantizeTensorToFloat8Kwargs,
 )
+from .int8.int8_tensor import (
+    Int8Tensor,
+    QuantizeTensorToInt8Kwargs,
+)
 from .int4.int4_choose_qparams_algorithm import Int4ChooseQParamsAlgorithm
 from .int4.int4_marlin_sparse_tensor import (
     Int4MarlinSparseTensor,
diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
new file mode 100644
index 0000000000..efc028504e
--- /dev/null
+++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
@@ -0,0 +1,351 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+from dataclasses import dataclass
+from typing import Optional, List
+
+import torch
+from torch.utils._python_dispatch import return_and_correct_aliasing
+
+from torchao.kernel import int_scaled_matmul
+from torchao.quantization.granularity import Granularity, PerRow
+from torchao.quantization.quant_primitives import (
+    MappingType,
+    choose_qparams_affine,
+    dequantize_affine,
+    quantize_affine,
+)
+from torchao.quantization.quantize_.common import QuantizeTensorKwargs
+from torchao.quantization.utils import get_block_size
+from torchao.utils import TorchAOBaseTensor, fill_defaults
+
+__all__ = ["Int8Tensor", "QuantizeTensorToInt8Kwargs"]
+
+aten = torch.ops.aten
+
+
+@dataclass
+class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs):
+    """Tensor kwargs for creating int8 tensor (either activation or weight)
+
+    Args:
+        granularity: the granularity for the Tensor, currently either PerRow() or PerTensor()
+    """
+    granularity: Granularity = PerRow()
+    hp_value_lb: Optional[float] = None
+    hp_value_ub: Optional[float] = None
+
+
+class Int8Tensor(TorchAOBaseTensor):
+    """
+    int8 quantized tensor with plain layout
+
+    Tensor Attributes:
+        qdata: (N, K) or (B, N, K) int8 quantized weight data (2D or 3D)
+        scale: scale factors for dequantization
+        # TODO: Static quantization support using `static_scale`
+
+    Non-Tensor Attributes:
+        block_size: the quantization block size, derived from the granularity (e.g., PerRow(), PerTensor())
+        act_quant_kwargs: flags for dynamic activation quantization
+    """
+
+    # TODO: Static quantization support using `static_scale`
+    tensor_data_names = ["qdata", "scale"]
+    tensor_attribute_names = []
+    optional_tensor_attribute_names = [
+        "block_size",
+        "act_quant_kwargs",
+        "dtype",
+    ]
+
+    def __new__(
+        cls: type,
+        qdata: torch.Tensor,
+        scale: torch.Tensor,
+        block_size: Optional[List[int]] = None,
+        act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        kwargs = {
+            "device": qdata.device,
+            "dtype": dtype,
+            "requires_grad": False,
+        }
+        return torch.Tensor._make_wrapper_subclass(cls, qdata.shape, **kwargs)
+
+    def __init__(
+        self,
+        qdata: torch.Tensor,
+        scale: torch.Tensor,
+        block_size: Optional[List[int]] = None,
+        act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None,
+        dtype: Optional[torch.dtype] = None,
+    ):
+        super().__init__()
+        self.qdata = qdata
+        self.scale = scale
+        self.block_size = block_size
+        self.act_quant_kwargs = act_quant_kwargs
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f"act_quant_kwargs={self.act_quant_kwargs}, "
+            f"qdata={self.qdata}, "
+            f"scale={self.scale}, "
+            f"block_size={self.block_size}, "
+            f"shape={self.shape}, "
+            f"device={self.device}, "
+            f"dtype={self.dtype})"
+        )
+
+    @classmethod
+    def from_hp(
+        cls,
+        hp_tensor: torch.Tensor,
+        granularity: Granularity = PerRow(),
+        hp_value_lb: Optional[float] = None,
+        hp_value_ub: Optional[float] = None,
+        act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None,
+    ):
+        """Create Int8Tensor from high-precision tensor"""
+        block_size = get_block_size(hp_tensor.shape, granularity)
+        block_size = list(block_size)
+
+        scale, zero_point = choose_qparams_affine(
+            input=hp_tensor,
+            mapping_type=MappingType.SYMMETRIC,
+            block_size=block_size,
+            target_dtype=torch.int8,
+            quant_min=hp_value_lb,
+            quant_max=hp_value_ub,
+            scale_dtype=hp_tensor.dtype,
+            zero_point_dtype=torch.int8,
+        )
+
+        int_data = quantize_affine(
+            hp_tensor,
+            block_size=block_size,
+            scale=scale,
+            zero_point=zero_point,
+            output_dtype=torch.int8,
+        )
+
+        return cls(
+            int_data,
+            scale,
+            block_size=block_size,
+            act_quant_kwargs=act_quant_kwargs,
+            dtype=hp_tensor.dtype,
+        )
+
+    def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
+        """Dequantize int8 tensor to floating point"""
+        if output_dtype is None:
+            output_dtype = self.dtype
+
+        return dequantize_affine(
+            input=self.qdata,
+            block_size=self.block_size,
+            scale=self.scale,
+            zero_point=None,
+            input_dtype=torch.int8,
+            quant_min=-128,
+            quant_max=127,
+            output_dtype=output_dtype,
+        )
+
+
+implements = Int8Tensor.implements
+implements_torch_function = Int8Tensor.implements_torch_function
+
+
+def _slice_scale(
+    scale: torch.Tensor,
+    data_shape: list[int],
+    dim: int,
+    start: int,
+    end: int,
+    step: int,
+) -> torch.Tensor:
+    """
+    Slice the scale tensor appropriately based on the data tensor slicing.
+    This function calculates how the scale should be sliced when the data tensor
+    is sliced along a given dimension, taking into account the block structure.
+
+    Example:
+        If data_shape is [256, 128] and scale shape is [1] (indicating per-tensor scaling),
+        slicing along any dimension should return the same scale tensor.
+
+        If data_shape is [256, 128] and scale shape is [256] (indicating per-row scaling),
+        and we slice data along dim=0 from 64 to 192, the corresponding scale is sliced
+        from 64 to 192 as well.
+    """
+    aten = torch.ops.aten
+
+    # Case 1: Per-tensor quantization (scalar scale)
+    if scale.numel() <= 1:
+        return scale
+
+    # Case 2: Per-row quantization (1D scale)
+    # Scale is per-element along this dimension
+    if scale.ndim == 1:
+        if dim == 0:
+            return aten.slice.Tensor(scale, 0, start, end, step)
+        else:
+            return scale
+
+    # Case 3: Per-block quantization (2D scale)
+    block_sizes = tuple(
+        data_shape[i] // scale.shape[i] for i in range(len(scale.shape))
+    )
+
+    block_size_for_dim = block_sizes[dim]
+
+    if step > 1:
+        raise NotImplementedError(
+            "Slicing with step > 1 is not implemented for scale tensors."
+        )
+
+    # There is blocking in this dimension
+    # Calculate which scale elements correspond to the sliced data
+    scale_start = start // block_size_for_dim if start is not None else None
+    scale_end = (
+        (end + block_size_for_dim - 1) // block_size_for_dim
+        if end is not None
+        else None
+    )
+
+    return aten.slice.Tensor(scale, dim, scale_start, scale_end, 1)
+
+
+@implements(aten.linear.default)
+@implements_torch_function(torch.nn.functional.linear)
+def _(func, types, args, kwargs):
+    """INT8 quantization: dynamic activation or weight-only"""
+    activation_tensor, weight_tensor, bias = (
+        args[0],
+        args[1],
+        args[2] if len(args) > 2 else None,
+    )
+
+    if not isinstance(weight_tensor, Int8Tensor):
+        raise TypeError(f"Expected weight to be Int8Tensor, got {type(weight_tensor)}")
+
+    output_dtype = activation_tensor.dtype
+
+    if weight_tensor.act_quant_kwargs is not None:
+        activation_tensor = Int8Tensor.from_hp(
+            activation_tensor, weight_tensor.act_quant_kwargs.granularity
+        )
+        # Dynamic activation quantization path
+
+        # 1. do the matrix form of dot(X_i, W_j)
+        #
+        # 2. rescale the output
+        #
+        # in cases with large matrices, y_dot_int32 can grow sufficiently
+        # large that y_dot_int32 * a FP16 scale is greater than the maximum
+        # value of a FP16, (which results in a value of inf even if multiplying
+        # by the other scale would bring it within the expected range)
+
+        x_vals_int8 = activation_tensor.qdata
+        x_scales = activation_tensor.scale
+        w_vals_int8_t = weight_tensor.qdata.contiguous().t()
+        w_scales = weight_tensor.scale
+
+        tmp = x_vals_int8.reshape(-1, x_vals_int8.shape[-1])
+        # Cast FP16 scale to float to avoid overflow in int_scaled_matmul
+        intermediate_dtype = (
+            torch.float if x_scales.dtype == torch.half else x_scales.dtype
+        )
+        y_dot_scaled = int_scaled_matmul(
+            tmp, w_vals_int8_t, x_scales.reshape(-1, 1).to(intermediate_dtype)
+        ).to(output_dtype)
+        y = (y_dot_scaled * w_scales).reshape(
+            *x_vals_int8.shape[:-1], y_dot_scaled.shape[-1]
+        )
+
+    else:
+        # FP × INT8 (weight-only)
+        w_vals_int8_t = weight_tensor.qdata.t()
+        m = torch.mm(
+            activation_tensor.reshape(-1, activation_tensor.shape[-1]),
+            w_vals_int8_t.to(output_dtype),
+        )
+        y = m * weight_tensor.scale.to(m.dtype)
+        y = y.reshape(*activation_tensor.shape[:-1], weight_tensor.qdata.shape[0])
+
+    if bias is not None:
+        y += bias
+
+    return y.to(output_dtype)
+
+
+@implements(aten.slice.Tensor)
+def _(func, types, args, kwargs):
+    """Slice operation for Int8Tensor"""
+    self, dim, start, end, step = fill_defaults(args, 5, [0, None, None, 1])
+
+    if step != 1:
+        raise NotImplementedError(
+            f"Slicing with step != 1 is not supported, got step={step}"
+        )
+
+    if dim not in [0, 1, 2]:
+        raise ValueError(f"Only dim in [0, 1, 2] supported, got dim={dim}")
+
+    if self.qdata.ndim not in [2, 3]:
+        raise ValueError(f"Expected qdata to be 2D or 3D, got {self.qdata.ndim}D")
+
+    if end is None or end > self.shape[dim]:
+        end = self.shape[dim]
+
+    sliced_qdata = aten.slice.Tensor(self.qdata, dim, start, end, step)
+    sliced_scale = _slice_scale(self.scale, self.qdata.shape, dim, start, end, step)
+
+    return return_and_correct_aliasing(
+        func,
+        args,
+        kwargs,
+        Int8Tensor(
+            sliced_qdata,
+            sliced_scale,
+            block_size=self.block_size[1:],
+            act_quant_kwargs=self.act_quant_kwargs,
+            dtype=self.dtype,
+        ),
+    )
+
+
+@implements(aten.select.int)
+def _(func, types, args, kwargs):
+    """Select operation for Int8Tensor"""
+    self, dim, index = args
+    if dim != 0:
+        raise NotImplementedError(f"Only dim=0 supported, got dim={dim}")
+
+    selected_qdata = self.qdata[index]
+    selected_scale = _slice_scale(
+        self.scale, self.qdata.shape, dim, index, index + 1, step=1
+    ).squeeze(0)
+
+    return return_and_correct_aliasing(
+        func,
+        args,
+        kwargs,
+        Int8Tensor(
+            selected_qdata,
+            selected_scale,
+            block_size=self.block_size[1:],
+            act_quant_kwargs=self.act_quant_kwargs,
+            dtype=self.dtype,
+        ),
+    )
+
+
+Int8Tensor.__module__ = "torchao.quantization"
+torch.serialization.add_safe_globals([Int8Tensor, QuantizeTensorToInt8Kwargs])

From 0b73aed8bea8f26ae60a94f23e885f1a09ed0196 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Mon, 1 Dec 2025 13:06:07 -0800
Subject: [PATCH 02/19] ruff fixes

---
 test/quantization/test_quant_api.py                 |  3 +--
 torchao/quantization/__init__.py                    |  2 +-
 torchao/quantization/quant_api.py                   | 14 ++++++++------
 .../quantization/quantize_/workflows/__init__.py    |  8 ++++----
 .../quantize_/workflows/int8/int8_tensor.py         |  5 +++--
 5 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
index 14d568d22a..7c0d614b4b 100644
--- a/test/quantization/test_quant_api.py
+++ b/test/quantization/test_quant_api.py
@@ -30,13 +30,12 @@
     AffineQuantizedTensor,
     Int4CPULayout,
     Int4XPULayout,
-    PlainLayout,
     TensorCoreTiledLayout,
 )
 from torchao.quantization import (
     Float8Tensor,
-    Int8Tensor,
     Int4TilePackedTo4dTensor,
+    Int8Tensor,
     IntxUnpackedToInt8Tensor,
     LinearActivationQuantizedTensor,
     PerGroup,
diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
index 18988943a7..80e11dda5b 100644
--- a/torchao/quantization/__init__.py
+++ b/torchao/quantization/__init__.py
@@ -93,12 +93,12 @@
 )
 from .quantize_.workflows import (
     Float8Tensor,
-    Int8Tensor,
     Int4MarlinSparseTensor,
     Int4PlainInt32Tensor,
     Int4PreshuffledTensor,
     Int4Tensor,
     Int4TilePackedTo4dTensor,
+    Int8Tensor,
     IntxOpaqueTensor,
     IntxUnpackedToInt8Tensor,
 )
diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
index 96aacec6a3..dcfa3af709 100644
--- a/torchao/quantization/quant_api.py
+++ b/torchao/quantization/quant_api.py
@@ -15,7 +15,6 @@
 and mixed GEMM kernels
 """
 
-from torchao.quantization.quantize_.workflows.int8.int8_tensor import QuantizeTensorToInt8Kwargs
 import logging
 import re
 import types
@@ -82,14 +81,17 @@
     Int4PlainInt32Tensor,
     Int4PreshuffledTensor,
     Int4Tensor,
-    Int8Tensor,
     Int4TilePackedTo4dTensor,
+    Int8Tensor,
     IntxChooseQParamsAlgorithm,
     IntxOpaqueTensor,
     IntxPackingFormat,
     IntxUnpackedToInt8Tensor,
     QuantizeTensorToFloat8Kwargs,
 )
+from torchao.quantization.quantize_.workflows.int8.int8_tensor import (
+    QuantizeTensorToInt8Kwargs,
+)
 from torchao.quantization.transform_module import (
     _QUANTIZE_CONFIG_HANDLER,
     register_quantize_module_handler,
@@ -1583,16 +1585,16 @@ def get_weight_block_size(x):
         new_weight = to_linear_activation_quantized(new_weight, input_quant_func)
         return new_weight
     else:
-        activation_granularity, weight_granularity = _normalize_granularity(config.granularity)
+        activation_granularity, weight_granularity = _normalize_granularity(
+            config.granularity
+        )
         act_quant_kwargs = QuantizeTensorToInt8Kwargs(
             activation_granularity,
             # hp_value_lb=activation_value_lb,
             # hp_value_ub=activation_value_ub,
         )
         new_weight = Int8Tensor.from_hp(
-            weight,
-            granularity=weight_granularity,
-            act_quant_kwargs=act_quant_kwargs
+            weight, granularity=weight_granularity, act_quant_kwargs=act_quant_kwargs
         )
         return new_weight
diff --git a/torchao/quantization/quantize_/workflows/__init__.py b/torchao/quantization/quantize_/workflows/__init__.py
index 8b854bebf3..2097fe0730 100644
--- a/torchao/quantization/quantize_/workflows/__init__.py
+++ b/torchao/quantization/quantize_/workflows/__init__.py
@@ -2,10 +2,6 @@
     Float8Tensor,
     QuantizeTensorToFloat8Kwargs,
 )
-from .int8.int8_tensor import (
-    Int8Tensor,
-    QuantizeTensorToInt8Kwargs,
-)
 from .int4.int4_choose_qparams_algorithm import Int4ChooseQParamsAlgorithm
 from .int4.int4_marlin_sparse_tensor import (
     Int4MarlinSparseTensor,
@@ -21,6 +17,10 @@
     Int4Tensor,
 )
 from .int4.int4_tile_packed_to_4d_tensor import Int4TilePackedTo4dTensor
+from .int8.int8_tensor import (
+    Int8Tensor,
+    QuantizeTensorToInt8Kwargs,
+)
 from .intx.intx_choose_qparams_algorithm import IntxChooseQParamsAlgorithm
 from .intx.intx_opaque_tensor import (
     IntxOpaqueTensor,
diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
index efc028504e..f76a0cb198 100644
--- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
+++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
@@ -5,7 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 from dataclasses import dataclass
-from typing import Optional, List
+from typing import List, Optional
 
 import torch
 from torch.utils._python_dispatch import return_and_correct_aliasing
@@ -34,6 +34,7 @@ class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs):
     Args:
         granularity: the granularity for the Tensor, currently either PerRow() or PerTensor()
     """
+
     granularity: Granularity = PerRow()
     hp_value_lb: Optional[float] = None
     hp_value_ub: Optional[float] = None
@@ -314,7 +315,7 @@ def _(func, types, args, kwargs):
         Int8Tensor(
             sliced_qdata,
             sliced_scale,
-            block_size=self.block_size[1:], 
+            block_size=self.block_size[1:],
             act_quant_kwargs=self.act_quant_kwargs,
             dtype=self.dtype,
         ),

From 1e49945d5f3380f0a7ccd976fb20b64d401b7453 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Mon, 1 Dec 2025 14:18:31 -0800
Subject: [PATCH 03/19] add init

---
 .../workflows/int8/test_int8_tensor.py      |   4 +-
 .../quantize_/workflows/int8/__init__.py    |   0
 .../quantize_/workflows/int8/int8_tensor.py | 125 ++++++------------
 3 files changed, 41 insertions(+), 88 deletions(-)
 create mode 100644 torchao/quantization/quantize_/workflows/int8/__init__.py

diff --git a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
index ed150b1ff1..c536bf2ddc 100644
--- a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
+++ b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -90,8 +90,8 @@ def test_int8_linear_variants(
 
         quantize_(model_q, config)
 
-        self.assertEqual(model_q.linear2.weight.scale.shape, (K,))
-        self.assertEqual(model_q.linear2.weight.scale.ndim, 1)
+        self.assertEqual(model_q.linear2.weight.scale.shape, (K, 1))
+        self.assertEqual(model_q.linear2.weight.scale.ndim, 2)
 
         if compile:
             model_q = torch.compile(model_q, fullgraph=True)
diff --git a/torchao/quantization/quantize_/workflows/int8/__init__.py b/torchao/quantization/quantize_/workflows/int8/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
index f76a0cb198..aedc8d050d 100644
--- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
+++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
@@ -21,6 +21,7 @@
 from torchao.quantization.quantize_.common import QuantizeTensorKwargs
 from torchao.quantization.utils import get_block_size
 from torchao.utils import TorchAOBaseTensor, fill_defaults
+from torchao.float8.inference import _slice_scale_for_dimension
 
 __all__ = ["Int8Tensor", "QuantizeTensorToInt8Kwargs"]
 
@@ -136,6 +137,11 @@ def from_hp(
             output_dtype=torch.int8,
         )
 
+        if isinstance(granularity, PerRow):
+            scale = scale.unsqueeze(1)
+        else:
+            scale = scale.unsqueeze(0).unsqueeze(1)
+
         return cls(
             int_data,
             scale,
@@ -152,7 +158,7 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
         return dequantize_affine(
             input=self.qdata,
             block_size=self.block_size,
-            scale=self.scale,
+            scale=self.scale.squeeze(),
             zero_point=None,
             input_dtype=torch.int8,
             quant_min=-128,
@@ -164,65 +170,6 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
 implements = Int8Tensor.implements
 implements_torch_function = Int8Tensor.implements_torch_function
 
-
-def _slice_scale(
-    scale: torch.Tensor,
-    data_shape: list[int],
-    dim: int,
-    start: int,
-    end: int,
-    step: int,
-) -> torch.Tensor:
-    """
-    Slice the scale tensor appropriately based on the data tensor slicing.
-    This function calculates how the scale should be sliced when the data tensor
-    is sliced along a given dimension, taking into account the block structure.
-
-    Example:
-        If data_shape is [256, 128] and scale shape is [1] (indicating per-tensor scaling),
-        slicing along any dimension should return the same scale tensor.
-
-        If data_shape is [256, 128] and scale shape is [256] (indicating per-row scaling),
-        and we slice data along dim=0 from 64 to 192, the corresponding scale is sliced
-        from 64 to 192 as well.
-    """
-    aten = torch.ops.aten
-
-    # Case 1: Per-tensor quantization (scalar scale)
-    if scale.numel() <= 1:
-        return scale
-
-    # Case 2: Per-row quantization (1D scale)
-    # Scale is per-element along this dimension
-    if scale.ndim == 1:
-        if dim == 0:
-            return aten.slice.Tensor(scale, 0, start, end, step)
-        else:
-            return scale
-
-    # Case 3: Per-block quantization (2D scale)
-    block_sizes = tuple(
-        data_shape[i] // scale.shape[i] for i in range(len(scale.shape))
-    )
-
-    block_size_for_dim = block_sizes[dim]
-
-    if step > 1:
-        raise NotImplementedError(
-            "Slicing with step > 1 is not implemented for scale tensors."
-        )
-
-    # There is blocking in this dimension
-    # Calculate which scale elements correspond to the sliced data
-    scale_start = start // block_size_for_dim if start is not None else None
-    scale_end = (
-        (end + block_size_for_dim - 1) // block_size_for_dim
-        if end is not None
-        else None
-    )
-
-    return aten.slice.Tensor(scale, dim, scale_start, scale_end, 1)
-
-
 @implements(aten.linear.default)
 @implements_torch_function(torch.nn.functional.linear)
 def _(func, types, args, kwargs):
@@ -233,8 +180,7 @@ def _(func, types, args, kwargs):
         args[2] if len(args) > 2 else None,
     )
 
-    if not isinstance(weight_tensor, Int8Tensor):
-        raise TypeError(f"Expected weight to be Int8Tensor, got {type(weight_tensor)}")
+    assert isinstance(weight_tensor, Int8Tensor), f"Expected weight to be Int8Tensor, got {type(weight_tensor)}"
 
     output_dtype = activation_tensor.dtype
 
@@ -266,7 +212,7 @@ def _(func, types, args, kwargs):
         y_dot_scaled = int_scaled_matmul(
             tmp, w_vals_int8_t, x_scales.reshape(-1, 1).to(intermediate_dtype)
         ).to(output_dtype)
-        y = (y_dot_scaled * w_scales).reshape(
+        y = (y_dot_scaled * w_scales.flatten()).reshape(
             *x_vals_int8.shape[:-1], y_dot_scaled.shape[-1]
         )
 
@@ -277,7 +223,7 @@ def _(func, types, args, kwargs):
             activation_tensor.reshape(-1, activation_tensor.shape[-1]),
             w_vals_int8_t.to(output_dtype),
         )
-        y = m * weight_tensor.scale.to(m.dtype)
+        y = m * weight_tensor.scale.to(m.dtype).flatten()
         y = y.reshape(*activation_tensor.shape[:-1], weight_tensor.qdata.shape[0])
 
     if bias is not None:
@@ -306,7 +252,19 @@ def _(func, types, args, kwargs):
         end = self.shape[dim]
 
     sliced_qdata = aten.slice.Tensor(self.qdata, dim, start, end, step)
-    sliced_scale = _slice_scale(self.scale, self.qdata.shape, dim, start, end, step)
+    if self.scale.numel() == 1:
+        # Per-tensor quantization - scale doesn't change
+        sliced_scale = self.scale
+    else:
+        # Block-wise quantization - need to slice the scale appropriately
+        sliced_scale = _slice_scale_for_dimension(
+            self.scale, self.qdata.shape, dim, start, end, step
+        )
+
+    # adjust block_size since the shape has changed, block_size[i] should not be greater than shape[i]
+    block_size = self.block_size.copy()
+    for i in range(len(self.block_size)):
+        block_size[i] = min(block_size[i], sliced_qdata.shape[i])
 
     return return_and_correct_aliasing(
         func,
@@ -315,7 +273,7 @@ def _(func, types, args, kwargs):
         Int8Tensor(
             sliced_qdata,
             sliced_scale,
-            block_size=self.block_size[1:],
+            block_size=block_size,
             act_quant_kwargs=self.act_quant_kwargs,
             dtype=self.dtype,
         ),
@@ -325,27 +283,22 @@ def _(func, types, args, kwargs):
 @implements(aten.select.int)
 def _(func, types, args, kwargs):
     """Select operation for Int8Tensor"""
-    self, dim, index = args
-    if dim != 0:
-        raise NotImplementedError(f"Only dim=0 supported, got dim={dim}")
-
-    selected_qdata = self.qdata[index]
-    selected_scale = _slice_scale(
-        self.scale, self.qdata.shape, dim, index, index + 1, step=1
-    ).squeeze(0)
-
-    return return_and_correct_aliasing(
-        func,
-        args,
-        kwargs,
-        Int8Tensor(
-            selected_qdata,
-            selected_scale,
-            block_size=self.block_size[1:],
-            act_quant_kwargs=self.act_quant_kwargs,
-            dtype=self.dtype,
-        ),
+    old_int8_tensor, dim, index = args
+    assert dim == 0, f"Int8Tensor aten.select.int with {dim=} is not yet supported"
+    assert len(old_int8_tensor.qdata.shape) == len(old_int8_tensor.scale.shape), (
+        "unsupported"
+    )
+    assert len(old_int8_tensor.qdata.shape) == len(old_int8_tensor.block_size), (
+        "unsupported"
+    )
+    new_int8_tensor = old_int8_tensor.__class__(
+        old_int8_tensor.qdata[index],
+        old_int8_tensor.scale[index],
+        old_int8_tensor.block_size[1:],
+        old_int8_tensor.act_quant_kwargs,
+        old_int8_tensor.dtype,
     )
+    return return_and_correct_aliasing(func, args, kwargs, new_int8_tensor)

From 669b6ee938f70ce36eea893d728675f08b5b64d5 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Mon, 1 Dec 2025 14:35:40 -0800
Subject: [PATCH 04/19] fix ruff again

---
 .../quantize_/workflows/int8/int8_tensor.py | 29 +++++++++----------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
index aedc8d050d..c348b5b88b 100644
--- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
+++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
@@ -10,8 +10,9 @@
 import torch
 from torch.utils._python_dispatch import return_and_correct_aliasing
 
+from torchao.float8.inference import _slice_scale_for_dimension
 from torchao.kernel import int_scaled_matmul
-from torchao.quantization.granularity import Granularity, PerRow
+from torchao.quantization.granularity import Granularity, PerRow, PerTensor
 from torchao.quantization.quant_primitives import (
     MappingType,
     choose_qparams_affine,
@@ -21,7 +22,6 @@
 from torchao.quantization.quantize_.common import QuantizeTensorKwargs
 from torchao.quantization.utils import get_block_size
 from torchao.utils import TorchAOBaseTensor, fill_defaults
-from torchao.float8.inference import _slice_scale_for_dimension
 
 __all__ = ["Int8Tensor", "QuantizeTensorToInt8Kwargs"]
 
@@ -30,15 +30,13 @@
 
 @dataclass
 class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs):
-    """Tensor kwargs for creating int8 tensor (either activation or weight)
+    """Tensor kwargs for creating int8 tensor from high precision
 
     Args:
         granularity: the granularity for the Tensor, currently either PerRow() or PerTensor()
     """
 
     granularity: Granularity = PerRow()
-    hp_value_lb: Optional[float] = None
-    hp_value_ub: Optional[float] = None
 
 
 class Int8Tensor(TorchAOBaseTensor):
@@ -110,8 +108,6 @@ def from_hp(
         cls,
         hp_tensor: torch.Tensor,
         granularity: Granularity = PerRow(),
-        hp_value_lb: Optional[float] = None,
-        hp_value_ub: Optional[float] = None,
         act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None,
     ):
         """Create Int8Tensor from high-precision tensor"""
@@ -123,8 +119,8 @@ def from_hp(
             mapping_type=MappingType.SYMMETRIC,
             block_size=block_size,
             target_dtype=torch.int8,
-            quant_min=hp_value_lb,
-            quant_max=hp_value_ub,
+            quant_min=-128,
+            quant_max=127,
             scale_dtype=hp_tensor.dtype,
             zero_point_dtype=torch.int8,
         )
@@ -137,9 +133,10 @@ def from_hp(
             output_dtype=torch.int8,
         )
 
+        # make scale the correct dim
         if isinstance(granularity, PerRow):
             scale = scale.unsqueeze(1)
-        else:
+        elif isinstance(granularity, PerTensor):
             scale = scale.unsqueeze(0).unsqueeze(1)
 
         return cls(
@@ -152,9 +149,6 @@ def from_hp(
 
     def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
         """Dequantize int8 tensor to floating point"""
-        if output_dtype is None:
-            output_dtype = self.dtype
-
         return dequantize_affine(
             input=self.qdata,
             block_size=self.block_size,
@@ -163,13 +157,14 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor:
             input_dtype=torch.int8,
             quant_min=-128,
             quant_max=127,
-            output_dtype=output_dtype,
+            output_dtype=output_dtype or self.dtype,
         )
 
 
 implements = Int8Tensor.implements
 implements_torch_function = Int8Tensor.implements_torch_function
 
+
 @implements(aten.linear.default)
 @implements_torch_function(torch.nn.functional.linear)
 def _(func, types, args, kwargs):
@@ -180,7 +175,9 @@ def _(func, types, args, kwargs):
         args[2] if len(args) > 2 else None,
     )
 
-    assert isinstance(weight_tensor, Int8Tensor), f"Expected weight to be Int8Tensor, got {type(weight_tensor)}"
+    assert isinstance(weight_tensor, Int8Tensor), (
+        f"Expected weight to be Int8Tensor, got {type(weight_tensor)}"
+    )
 
     output_dtype = activation_tensor.dtype
 
@@ -217,7 +214,7 @@ def _(func, types, args, kwargs):
         )
 
     else:
-        # FP × INT8 (weight-only)
+        # FP × INT8 (weight-only)
         w_vals_int8_t = weight_tensor.qdata.t()
         m = torch.mm(
             activation_tensor.reshape(-1, activation_tensor.shape[-1]),

From 9071526ec4cc473baa530fcd3243857c41786cd9 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Mon, 1 Dec 2025 14:41:39 -0800
Subject: [PATCH 05/19] update

---
 torchao/quantization/quantize_/workflows/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torchao/quantization/quantize_/workflows/__init__.py b/torchao/quantization/quantize_/workflows/__init__.py
index 2097fe0730..fd5d572c46 100644
--- a/torchao/quantization/quantize_/workflows/__init__.py
+++ b/torchao/quantization/quantize_/workflows/__init__.py
@@ -40,6 +40,8 @@
     "Int4TilePackedTo4dTensor",
     "Float8Tensor",
     "QuantizeTensorToFloat8Kwargs",
+    "Int8Tensor",
+    "QuantizeTensorToInt8Kwargs",
     "Int4ChooseQParamsAlgorithm",
     "Int4PackingFormat",
     "IntxChooseQParamsAlgorithm",

From 1539e0f2b866c2594fec7855963b48fd3edcd928 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Mon, 1 Dec 2025 17:13:07 -0800
Subject: [PATCH 06/19] wip

---
 test/quantization/test_moe_quant.py         | 19 ++++++++++---------
 torchao/quantization/quant_api.py           |  4 ++--
 .../quantize_/workflows/int8/int8_tensor.py |  9 +++++++++
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/test/quantization/test_moe_quant.py b/test/quantization/test_moe_quant.py
index 61000babc1..593da3b76f 100644
--- a/test/quantization/test_moe_quant.py
+++ b/test/quantization/test_moe_quant.py
@@ -32,6 +32,7 @@
     LinearActivationQuantizedTensor,
     quantize_,
 )
+from torchao.quantization import Int8Tensor
 from torchao.quantization.utils import compute_error
 from torchao.utils import is_sm_at_least_90
 
@@ -50,7 +51,7 @@ def _test_impl_moe_quant(
         self,
         config,
         num_tokens=1,
-        model_params=None, 
+        model_params=None,
         base_class=AffineQuantizedTensor,
         tensor_impl_class=None,
         dtype=torch.bfloat16,
@@ -167,12 +168,12 @@ def test_int8wo_fake_dim(self, name, num_tokens, fullgraph):
         config = MoEQuantConfig(
             Int8WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE
         )
-        tensor_impl_class = PlainAQTTensorImpl
+        base_class = Int8Tensor
 
         self._test_impl_moe_quant(
             config=config,
             num_tokens=num_tokens,
-            tensor_impl_class=tensor_impl_class,
+            base_class=base_class,
             fullgraph=fullgraph,
         )
 
@@ -187,12 +188,12 @@ def test_int8wo_base(self, name, num_tokens, fullgraph):
             self.skipTest("Need CUDA available")
 
         config = MoEQuantConfig(Int8WeightOnlyConfig())
-        tensor_impl_class = PlainAQTTensorImpl
+        base_class = Int8Tensor
 
         self._test_impl_moe_quant(
             config=config,
            num_tokens=num_tokens,
-            tensor_impl_class=tensor_impl_class,
+            base_class=base_class,
             fullgraph=fullgraph,
         )
 
@@ -204,12 +205,12 @@ def test_int8wo_base(self, name, num_tokens, fullgraph):
     )
     def test_int8wo_base_cpu(self, name, num_tokens, fullgraph):
         config = MoEQuantConfig(Int8WeightOnlyConfig())
-        tensor_impl_class = PlainAQTTensorImpl
+        base_class = Int8Tensor
 
         self._test_impl_moe_quant(
             config=config,
             num_tokens=num_tokens,
-            tensor_impl_class=tensor_impl_class,
+            base_class=base_class,
             fullgraph=fullgraph,
             device="cpu",
         )
 
@@ -227,7 +228,7 @@ def test_int8dq_fake_dim(self, name, num_tokens, fullgraph):
             Int8DynamicActivationInt8WeightConfig(),
             use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
         )
-        base_class = LinearActivationQuantizedTensor
+        base_class = Int8Tensor
 
         self._test_impl_moe_quant(
             model_params=(512, 256, 2, 2),
@@ -247,7 +248,7 @@ def test_int8dq_base(self, name, num_tokens, fullgraph):
             self.skipTest("Need CUDA available")
 
         config = MoEQuantConfig(Int8DynamicActivationInt8WeightConfig())
-        base_class = LinearActivationQuantizedTensor
+        base_class = Int8Tensor
 
         self._test_impl_moe_quant(
             model_params=(512, 256, 2, 2),
diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
index dcfa3af709..425061f23c 100644
--- a/torchao/quantization/quant_api.py
+++ b/torchao/quantization/quant_api.py
@@ -1342,6 +1342,8 @@ class Int8WeightOnlyConfig(AOBaseConfig):
 
     def __post_init__(self):
         torch._C._log_api_usage_once("torchao.quantization.Int8WeightOnlyConfig")
+        if self.version == 2:
+            assert self.group_size is None, f"Only support version 2 with group_size=None, got {self.group_size}"
 
 
 # for BC
@@ -1590,8 +1592,6 @@ def get_weight_block_size(x):
         )
         act_quant_kwargs = QuantizeTensorToInt8Kwargs(
             activation_granularity,
-            # hp_value_lb=activation_value_lb,
-            # hp_value_ub=activation_value_ub,
         )
         new_weight = Int8Tensor.from_hp(
             weight, granularity=weight_granularity, act_quant_kwargs=act_quant_kwargs
diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
index c348b5b88b..bc3a7ca469 100644
--- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
+++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
@@ -276,6 +276,15 @@ def _(func, types, args, kwargs):
         ),
     )
 
+@implements(aten.index.Tensor)
+def _(func, types, args, kwargs):
+    return return_and_correct_aliasing(
+        func,
+        args,
+        kwargs,
+        args[0]._apply_fn_to_data(lambda x: func(x, *args[1:], **kwargs)),
+    )
+
 
 @implements(aten.select.int)
 def _(func, types, args, kwargs):

From 673f228204ce3e28917abcfcc0dd9d0599aa52b2 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Tue, 2 Dec 2025 17:55:32 -0800
Subject: [PATCH 07/19] undo update tests

---
 test/quantization/test_moe_quant.py | 19 +++++++++----------
 test/quantization/test_quant_api.py | 16 +++++++++-------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/test/quantization/test_moe_quant.py b/test/quantization/test_moe_quant.py
index 593da3b76f..61000babc1 100644
--- a/test/quantization/test_moe_quant.py
+++ b/test/quantization/test_moe_quant.py
@@ -32,7 +32,6 @@
     LinearActivationQuantizedTensor,
     quantize_,
 )
-from torchao.quantization import Int8Tensor
 from torchao.quantization.utils import compute_error
 from torchao.utils import is_sm_at_least_90
 
@@ -51,7 +50,7 @@ def _test_impl_moe_quant(
         self,
         config,
         num_tokens=1,
-        model_params=None,
+        model_params=None, 
         base_class=AffineQuantizedTensor,
         tensor_impl_class=None,
         dtype=torch.bfloat16,
@@ -168,12 +167,12 @@ def test_int8wo_fake_dim(self, name, num_tokens, fullgraph):
         config = MoEQuantConfig(
             Int8WeightOnlyConfig(), use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE
         )
-        base_class = Int8Tensor
+        tensor_impl_class = PlainAQTTensorImpl
 
         self._test_impl_moe_quant(
             config=config,
             num_tokens=num_tokens,
-            base_class=base_class,
+            tensor_impl_class=tensor_impl_class,
             fullgraph=fullgraph,
         )
 
@@ -188,12 +187,12 @@ def test_int8wo_base(self, name, num_tokens, fullgraph):
             self.skipTest("Need CUDA available")
 
         config = MoEQuantConfig(Int8WeightOnlyConfig())
-        base_class = Int8Tensor
+        tensor_impl_class = PlainAQTTensorImpl
 
         self._test_impl_moe_quant(
             config=config,
             num_tokens=num_tokens,
-            base_class=base_class,
+            tensor_impl_class=tensor_impl_class,
             fullgraph=fullgraph,
         )
 
@@ -205,12 +204,12 @@ def test_int8wo_base(self, name, num_tokens, fullgraph):
     )
     def test_int8wo_base_cpu(self, name, num_tokens, fullgraph):
         config = MoEQuantConfig(Int8WeightOnlyConfig())
-        base_class = Int8Tensor
+        tensor_impl_class = PlainAQTTensorImpl
 
         self._test_impl_moe_quant(
             config=config,
             num_tokens=num_tokens,
-            base_class=base_class,
+            tensor_impl_class=tensor_impl_class,
             fullgraph=fullgraph,
             device="cpu",
         )
 
@@ -228,7 +227,7 @@ def test_int8dq_fake_dim(self, name, num_tokens, fullgraph):
             Int8DynamicActivationInt8WeightConfig(),
             use_fake_extra_dim_tensor=UseFakeExtraDimTensor.TRUE,
         )
-        base_class = Int8Tensor
+        base_class = LinearActivationQuantizedTensor
 
         self._test_impl_moe_quant(
             model_params=(512, 256, 2, 2),
@@ -248,7 +247,7 @@ def test_int8dq_base(self, name, num_tokens, fullgraph):
             self.skipTest("Need CUDA available")
 
         config = MoEQuantConfig(Int8DynamicActivationInt8WeightConfig())
-        base_class = Int8Tensor
+        base_class = LinearActivationQuantizedTensor
 
         self._test_impl_moe_quant(
             model_params=(512, 256, 2, 2),
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
index 7c0d614b4b..5cd81ece90 100644
--- a/test/quantization/test_quant_api.py
+++ b/test/quantization/test_quant_api.py
@@ -30,12 +30,12 @@
     AffineQuantizedTensor,
     Int4CPULayout,
     Int4XPULayout,
+    PlainLayout,
     TensorCoreTiledLayout,
 )
 from torchao.quantization import (
     Float8Tensor,
     Int4TilePackedTo4dTensor,
-    Int8Tensor,
     IntxUnpackedToInt8Tensor,
     LinearActivationQuantizedTensor,
     PerGroup,
@@ -626,7 +626,8 @@ def test_module_fqn_to_config_default(self):
         model(*example_inputs)
         assert isinstance(model.linear1.weight, AffineQuantizedTensor)
         assert isinstance(model.linear1.weight._layout, TensorCoreTiledLayout)
-        assert isinstance(model.linear2.weight, Int8Tensor)
+        assert isinstance(model.linear2.weight, AffineQuantizedTensor)
+        assert isinstance(model.linear2.weight._layout, PlainLayout)
 
     @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_module_fqn_to_config_module_name(self):
@@ -639,7 +640,8 @@ def test_module_fqn_to_config_module_name(self):
         model(*example_inputs)
         assert isinstance(model.linear1.weight, AffineQuantizedTensor)
         assert isinstance(model.linear1.weight._layout, TensorCoreTiledLayout)
-        assert isinstance(model.linear2.weight, Int8Tensor)
+        assert isinstance(model.linear2.weight, AffineQuantizedTensor)
+        assert isinstance(model.linear2.weight._layout, PlainLayout)
 
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     def test_module_fqn_to_config_regex_basic(self):
@@ -1207,8 +1209,8 @@ def __init__(self):
         )
 
         quantize_(m, quant_config, filter_fn=None)
-        assert isinstance(m.nested.linear.weight, Int8Tensor)
-        assert isinstance(m.linear1.weight, Int8Tensor)
+        assert isinstance(m.nested.linear.weight, AffineQuantizedTensor)
+        assert isinstance(m.linear1.weight, AffineQuantizedTensor)
 
     @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_fqn_config_quantized_nested_module_param(self):
@@ -1232,8 +1234,8 @@ def __init__(self):
         )
 
         quantize_(m, quant_config, filter_fn=None)
-        assert isinstance(m.nested.linear.weight, Int8Tensor)
-        assert isinstance(m.linear1.weight, Int8Tensor)
+        assert isinstance(m.nested.linear.weight, AffineQuantizedTensor)
+        assert isinstance(m.linear1.weight, AffineQuantizedTensor)
 
     def test_fqn_config_module_config_and_fqn_config_both_specified(self):
         with self.assertRaises(ValueError):

From 739fd64ca84cbc3711ccd4e89b97ea45d22d2ed9 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Tue, 2 Dec 2025 18:09:50 -0800
Subject: [PATCH 08/19] fix ruff

---
 torchao/quantization/quant_api.py           | 11 ++++-------
 torchao/quantization/quant_primitives.py    |  7 +++++--
 .../quantize_/workflows/int8/int8_tensor.py | 11 ++---------
 3 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
index 541f1f9960..623258ddd2 100644
--- a/torchao/quantization/quant_api.py
+++ b/torchao/quantization/quant_api.py
@@ -89,9 +89,6 @@
     IntxUnpackedToInt8Tensor,
     QuantizeTensorToFloat8Kwargs,
 )
-from torchao.quantization.quantize_.workflows.int8.int8_tensor import (
-    QuantizeTensorToInt8Kwargs,
-)
 from torchao.quantization.transform_module import (
     _QUANTIZE_CONFIG_HANDLER,
     register_quantize_module_handler,
@@ -1345,7 +1342,9 @@ class Int8WeightOnlyConfig(AOBaseConfig):
     def __post_init__(self):
         torch._C._log_api_usage_once("torchao.quantization.Int8WeightOnlyConfig")
         if self.version == 2:
-            assert self.group_size is None, f"Only support version 2 with group_size=None, got {self.group_size}"
+            assert self.group_size is None, (
+                f"Only support version 2 with group_size=None, got {self.group_size}"
+            )
 
 
 # for BC
@@ -1527,9 +1526,7 @@ class Int8DynamicActivationInt8WeightConfig(AOBaseConfig):
     layout: Optional[Layout] = PlainLayout()
     act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC
     weight_only_decode: bool = False
-    # TODO: Revisit for supported granularitys
-    # https://github.com/pytorch/ao/pull/3241#discussion_r2551497849
-    granularity: Optional[Granularity] = PerRow()
+    granularity: Optional[Union[Granularity, List[Granularity, Granularity]]] = PerRow()
     set_inductor_config: bool = True
     version: int = 1
 
diff --git a/torchao/quantization/quant_primitives.py b/torchao/quantization/quant_primitives.py
index ee1da11c50..9bdb3871a2 100644
--- a/torchao/quantization/quant_primitives.py
+++ b/torchao/quantization/quant_primitives.py
@@ -1217,6 +1217,7 @@ def choose_qparams_affine(
     eps: Optional[float] = None,
     scale_dtype: Optional[torch.dtype] = None,
     zero_point_dtype: Optional[torch.dtype] = torch.int32,
+    keepdim: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Args:
@@ -1247,6 +1248,7 @@ def choose_qparams_affine(
         eps,
         scale_dtype,
         zero_point_dtype,
+        keepdim,
     )
 
 
@@ -1521,6 +1523,7 @@ def _choose_qparams_affine(
     eps: Optional[float] = None,
     scale_dtype: Optional[torch.dtype] = None,
     zero_point_dtype: Optional[torch.dtype] = None,
+    keepdim: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     """op definition that has compatible signatures with custom op library
 
@@ -1550,8 +1553,8 @@ def _choose_qparams_affine(
     )
     input = input.view(shape_for_reduction)
 
-    min_val = torch.amin(input, dim=reduction_dims, keepdim=False)
-    max_val = torch.amax(input, dim=reduction_dims, keepdim=False)
+    min_val = torch.amin(input, dim=reduction_dims, keepdim=keepdim)
+    max_val = torch.amax(input, dim=reduction_dims, keepdim=keepdim)
 
     min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
     max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
index 4654799468..534af7ed75 100644
--- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
+++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
@@ -6,15 +6,12 @@
 
 from dataclasses import dataclass
 from typing import List, Optional
-from typing import Optional
 
 import torch
 from torch.utils._python_dispatch import return_and_correct_aliasing
 
 from torchao.float8.inference import _slice_scale_for_dimension
 from torchao.kernel import int_scaled_matmul
-from torchao.quantization.granularity import Granularity, PerRow, PerTensor
-from torchao.kernel import int_scaled_matmul
 from torchao.quantization.granularity import Granularity, PerRow
 from torchao.quantization.quant_primitives import (
     MappingType,
@@ -126,6 +123,7 @@ def from_hp(
             quant_max=127,
             scale_dtype=hp_tensor.dtype,
             zero_point_dtype=torch.int8,
+            keepdim=True,
         )
 
         int_data = quantize_affine(
@@ -136,12 +134,6 @@ def from_hp(
             output_dtype=torch.int8,
         )
 
-        # make scale the correct dim
-        if isinstance(granularity, PerRow):
-            scale = scale.unsqueeze(1)
-        elif isinstance(granularity, PerTensor):
-            scale = scale.unsqueeze(0).unsqueeze(1)
-
         return cls(
             int_data,
             scale,
@@ -279,6 +271,7 @@ def _(func, types, args, kwargs):
         ),
     )
 
+
 @implements(aten.index.Tensor)
 def _(func, types, args, kwargs):
     return return_and_correct_aliasing(

From 750db1af16f9e7b36b0a85bdd580d279f559dc39 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Tue, 2 Dec 2025 18:15:23 -0800
Subject: [PATCH 09/19] fix varname

---
 torchao/quantization/quant_api.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
index 623258ddd2..2bf64c6d2d 100644
--- a/torchao/quantization/quant_api.py
+++ b/torchao/quantization/quant_api.py
@@ -1588,9 +1588,7 @@ def get_weight_block_size(x):
             _layout=layout,
             zero_point_domain=weight_zero_point_domain,
         )
-        quantized_weight = to_linear_activation_quantized(
-            quantized_weight, input_quant_func
-        )
+        quantized_weight = to_linear_activation_quantized(new_weight, input_quant_func)
     else:
         from torchao.quantization.quantize_.workflows.int8.int8_tensor import (
             QuantizeTensorToInt8Kwargs,

From 94104884cfb323f959a542bc41703a385ca8b1b0 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Tue, 2 Dec 2025 18:35:18 -0800
Subject: [PATCH 10/19] fix typing

---
 torchao/quantization/quant_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
index 2bf64c6d2d..25dc43eeb6 100644
--- a/torchao/quantization/quant_api.py
+++ b/torchao/quantization/quant_api.py
@@ -1526,7 +1526,7 @@ class Int8DynamicActivationInt8WeightConfig(AOBaseConfig):
     layout: Optional[Layout] = PlainLayout()
     act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC
     weight_only_decode: bool = False
-    granularity: Optional[Union[Granularity, List[Granularity, Granularity]]] = PerRow()
+    granularity: Optional[Union[Granularity, List[Granularity]]] = PerRow()
     set_inductor_config: bool = True
     version: int = 1

From 45a3a769e668b497ce0e4b351c900ebdbed6c9a4 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Tue, 2 Dec 2025 19:14:47 -0800
Subject: [PATCH 11/19] add tests

---
 .../workflows/int8/test_int8_tensor.py      | 78 ++++++++++---------
 torchao/quantization/quant_api.py           | 12 ++-
 .../quantize_/workflows/int8/int8_tensor.py |  3 +-
 3 files changed, 52 insertions(+), 41 deletions(-)

diff --git a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
index 906eb207fd..41583ad743 100644
--- a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
+++ b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -18,11 +18,29 @@
     quantize_,
 )
 from torchao.quantization.granularity import PerRow, PerTensor
+from torchao.quantization.quant_primitives import MappingType
 from torchao.quantization.utils import compute_error, get_block_size
 from torchao.testing.model_architectures import ToyTwoLinearModel
 from torchao.testing.utils import TorchAOIntegrationTestCase
 from torchao.utils import torch_version_at_least
 
+INT8_TEST_CONFIGS = [
+    Int8WeightOnlyConfig(version=2, granularity=PerTensor()),
+    Int8WeightOnlyConfig(version=2, granularity=PerRow()),
+    Int8DynamicActivationInt8WeightConfig(
+        version=2, granularity=PerTensor(), act_mapping_type=MappingType.ASYMMETRIC
+    ),
+    Int8DynamicActivationInt8WeightConfig(
+        version=2, granularity=PerRow(), act_mapping_type=MappingType.ASYMMETRIC
+    ),
+    Int8DynamicActivationInt8WeightConfig(
+        version=2, granularity=PerTensor(), act_mapping_type=MappingType.SYMMETRIC
+    ),
+    Int8DynamicActivationInt8WeightConfig(
+        version=2, granularity=PerRow(), act_mapping_type=MappingType.SYMMETRIC
+    ),
+]
+
 
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
 @common_utils.instantiate_parametrized_tests
@@ -36,13 +54,7 @@ def setUp(self):
 
         torch.manual_seed(42)
 
-    @common_utils.parametrize(
-        "config",
-        [
-            Int8DynamicActivationInt8WeightConfig(version=2),
-            Int8WeightOnlyConfig(version=2),
-        ],
-    )
+    @common_utils.parametrize("config", INT8_TEST_CONFIGS)
     def test_creation_and_attributes(self, config):
         """Test tensor creation, dtypes, and ranges"""
         linear = torch.nn.Linear(
@@ -60,15 +72,22 @@ def test_creation_and_attributes(self, config):
         self.assertEqual(w.qdata.dtype, torch.int8)
         self.assertTrue(torch.all(w.qdata >= -128) and torch.all(w.qdata <= 127))
 
+        if isinstance(config.granularity, PerRow):
+            self.assertEqual(w.scale.shape, (w.shape[0], 1))
+        elif isinstance(config.granularity, PerTensor):
+            self.assertEqual(w.scale.shape, (1, 1))
+
+        if config.act_mapping_type == MappingType.SYMMETRIC:
+            self.assertEqual(w.zero_point, None)
+        elif config.act_mapping_type == MappingType.ASYMMETRIC:
+            if isinstance(config.granularity, PerRow):
+                self.assertEqual(w.zero_point.shape, (w.shape[0], 1))
+            elif isinstance(config.granularity, PerTensor):
+                self.assertEqual(w.zero_point.shape, (1, 1))
+
     @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32])
     @common_utils.parametrize("compile", [True, False])
-    @common_utils.parametrize(
-        "config",
-        [
-            Int8DynamicActivationInt8WeightConfig(version=2),
-            Int8WeightOnlyConfig(version=2),
-        ],
-    )
+    @common_utils.parametrize("config", INT8_TEST_CONFIGS)
     @common_utils.parametrize(
         "sizes",
         [
@@ -91,10 +110,15 @@ def test_int8_linear_variants(
 
         quantize_(model_q, config)
 
-        self.assertEqual(model_q.linear2.weight.scale.shape, (K, 1))
+        if isinstance(config.granularity, PerRow):
+            self.assertEqual(model_q.linear2.weight.scale.shape, (K, 1))
+        elif isinstance(config.granularity, PerTensor):
+            self.assertEqual(model_q.linear2.weight.scale.shape, (1, 1))
+
         self.assertEqual(model_q.linear2.weight.scale.ndim, 2)
 
         if compile:
+            torch.compiler.reset()
             model_q = torch.compile(model_q, fullgraph=True)
 
         output_fp = model(input_tensor)
         output_quantized = model_q(input_tensor)
@@ -104,13 +128,7 @@ def test_int8_linear_variants(
             f"Quantization error is too high, got a SQNR of {compute_error(output_fp, output_quantized)}"
         )
 
-    @common_utils.parametrize(
-        "config",
-        [
-            Int8DynamicActivationInt8WeightConfig,
- Int8WeightOnlyConfig, - ], - ) + @common_utils.parametrize("config", INT8_TEST_CONFIGS) @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) def test_index_select(self, config, granularity): """test that `x_0 = x[0]` works when `x` is a 2D quantized tensor.""" @@ -169,13 +181,7 @@ def test_index_select(self, config, granularity): list(get_block_size(x_int8.shape, config.granularity)), [N, K] ) - @common_utils.parametrize( - "config", - [ - Int8DynamicActivationInt8WeightConfig(version=2), - Int8WeightOnlyConfig(version=2), - ], - ) + @common_utils.parametrize("config", INT8_TEST_CONFIGS) def test_dequantization_accuracy(self, config): """Test dequantization accuracy separately""" linear = torch.nn.Linear( diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py index 25dc43eeb6..fe5dfd0c74 100644 --- a/torchao/quantization/quant_api.py +++ b/torchao/quantization/quant_api.py @@ -1594,16 +1594,22 @@ def get_weight_block_size(x): QuantizeTensorToInt8Kwargs, ) + assert config.granularity in {PerRow(), PerTensor()}, ( + "Only PerRow and PerTensor are supported" + ) + weight_granularity, act_granularity = _normalize_granularity(config.granularity) + assert config.version == 2, f"Unexpected version: {config.version}" # TODO: Symmentric/Asymmetric choice for weight quantization # https://github.com/pytorch/ao/pull/3241#discussion_r2551515539 - # TODO: Add block_size args to return in from_hp - # https://github.com/pytorch/ao/pull/3241#discussion_r2552016429 quantized_weight = Int8Tensor.from_hp( weight, granularity=config.granularity, - act_quant_kwargs=QuantizeTensorToInt8Kwargs(granularity=config.granularity), + act_quant_kwargs=QuantizeTensorToInt8Kwargs( + granularity=act_granularity, + act_mapping_type=config.act_mapping_type, + ), ) return quantized_weight diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py index 534af7ed75..ebdc1c4689 100644 --- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py +++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py @@ -37,6 +37,7 @@ class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs): """ granularity: Granularity = PerRow() + act_mapping_type: MappingType = MappingType.SYMMETRIC class Int8Tensor(TorchAOBaseTensor): @@ -59,7 +60,6 @@ class Int8Tensor(TorchAOBaseTensor): optional_tensor_attribute_names = [ "block_size", "act_quant_kwargs", - "dtype", ] def __new__( @@ -83,7 +83,6 @@ def __init__( scale: torch.Tensor, block_size: Optional[List[int]] = None, act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None, - dtype: Optional[torch.dtype] = None, ): super().__init__() self.qdata = qdata From 4e2f09c2743c0336ae9d4dacf9acb45adf630130 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Tue, 2 Dec 2025 19:17:03 -0800 Subject: [PATCH 12/19] fix dtype --- torchao/quantization/quantize_/workflows/int8/int8_tensor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py index ebdc1c4689..420e7f5b81 100644 --- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py +++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py @@ -60,6 +60,7 @@ class Int8Tensor(TorchAOBaseTensor): optional_tensor_attribute_names = [ "block_size", "act_quant_kwargs", + "dtype", ] def __new__( @@ -83,6 +84,7 @@ def __init__( scale: torch.Tensor, block_size: Optional[List[int]] = None, 
act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None, + dtype: Optional[torch.dtype] = None, ): super().__init__() self.qdata = qdata From dd80cca3fc7f95b453ebe148b688d15b5e844877 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Wed, 3 Dec 2025 15:53:52 -0800 Subject: [PATCH 13/19] fix ci --- .../workflows/int8/test_int8_tensor.py | 33 ++++++++----------- torchao/quantization/quant_api.py | 2 +- .../quantize_/workflows/int8/int8_tensor.py | 14 +++++--- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py index 41583ad743..795e7d6ed2 100644 --- a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py +++ b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py @@ -27,12 +27,6 @@ INT8_TEST_CONFIGS = [ Int8WeightOnlyConfig(version=2, granularity=PerTensor()), Int8WeightOnlyConfig(version=2, granularity=PerRow()), - Int8DynamicActivationInt8WeightConfig( - version=2, granularity=PerTensor(), act_mapping_type=MappingType.ASYMMETRIC - ), - Int8DynamicActivationInt8WeightConfig( - version=2, granularity=PerRow(), act_mapping_type=MappingType.ASYMMETRIC - ), Int8DynamicActivationInt8WeightConfig( version=2, granularity=PerTensor(), act_mapping_type=MappingType.SYMMETRIC ), @@ -77,13 +71,8 @@ def test_creation_and_attributes(self, config): elif isinstance(config.granularity, PerTensor): self.assertEqual(w.scale.shape, (1, 1)) - if config.act_mapping_type == MappingType.SYMMETRIC: - self.assertEqual(w.zero_point, None) - elif config.act_mapping_type == MappingType.ASYMMETRIC: - if isinstance(config.granularity, PerRow): - self.assertEqual(w.zero_point.shape, (w.shape[0], 1)) - elif isinstance(config.granularity, PerTensor): - self.assertEqual(w.zero_point.shape, (1, 1)) + if hasattr(config, "act_mapping_type"): + self.assertEqual(w.act_quant_kwargs.mapping_type, config.act_mapping_type) @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32]) @common_utils.parametrize("compile", [True, False]) @@ -103,6 +92,8 @@ def test_int8_linear_variants( sizes: tuple, ): """Test linear operation supports including shape and compile""" + torch.compiler.reset() + M, N, K = sizes input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda") model = ToyTwoLinearModel(K, N, K, dtype=dtype, device="cuda").eval() @@ -118,7 +109,6 @@ def test_int8_linear_variants( self.assertEqual(model_q.linear2.weight.scale.ndim, 2) if compile: - torch.compiler.reset() model_q = torch.compile(model_q, fullgraph=True) output_fp = model(input_tensor) @@ -146,21 +136,24 @@ def test_slice(self, config, device, dtype): self.assertEqual(weight1.qdata, dummy.weight.qdata.narrow(0, 0, slice_sizes[0])) self.assertEqual(weight2.qdata, dummy.weight.qdata.narrow(1, 0, slice_sizes[1])) - self.assertEqual(weight1.scale, dummy.weight.scale.narrow(0, 0, slice_sizes[0])) + + if isinstance(config.granularity, PerRow): + self.assertEqual( + weight1.scale, dummy.weight.scale.narrow(0, 0, slice_sizes[0]) + ) + self.assertEqual(weight2.scale, dummy.weight.scale) with self.assertRaises(NotImplementedError): _ = dummy.weight[::2] @common_utils.parametrize("config", INT8_TEST_CONFIGS) - @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) - def test_index_select(self, config, granularity): + def test_index_select(self, config): """test that `x_0 = x[0]` works when `x` is a 2D quantized tensor.""" N, K = 256, 512 x = torch.randn(N, K, device="cuda", dtype=torch.bfloat16) 
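(As an aside on the shape assertions above: a minimal sketch, assuming a CUDA device and the version=2 configs from INT8_TEST_CONFIGS, of how the weight qparams come out under each granularity; the names below otherwise mirror the tests in this file.)

```
import torch
from torchao.quantization import Int8DynamicActivationInt8WeightConfig, quantize_
from torchao.quantization.granularity import PerRow, PerTensor

N, K = 256, 512
for granularity, expected_scale_shape in [(PerRow(), (N, 1)), (PerTensor(), (1, 1))]:
    linear = torch.nn.Linear(K, N, bias=False, dtype=torch.bfloat16, device="cuda")
    quantize_(
        linear,
        Int8DynamicActivationInt8WeightConfig(version=2, granularity=granularity),
    )
    w = linear.weight
    assert w.qdata.shape == (N, K) and w.qdata.dtype == torch.int8
    # scale stays 2D because from_hp now calls choose_qparams_affine with
    # keepdim=True, so no unsqueeze fixup is needed after quantization
    assert w.scale.shape == expected_scale_shape
```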
linear = torch.nn.Linear(K, N, bias=False, dtype=torch.bfloat16, device="cuda") linear.weight.data = x - config = config(version=2, granularity=granularity) quantize_(linear, config) x_int8 = linear.weight @@ -172,11 +165,11 @@ def test_index_select(self, config, granularity): ) # Test block_size granularity - if isinstance(granularity, PerRow): + if isinstance(config.granularity, PerRow): self.assertEqual( list(get_block_size(x_int8.shape, config.granularity)), [1, K] ) - elif isinstance(granularity, PerTensor): + elif isinstance(config.granularity, PerTensor): self.assertEqual( list(get_block_size(x_int8.shape, config.granularity)), [N, K] ) diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py index fe5dfd0c74..5feb808cb6 100644 --- a/torchao/quantization/quant_api.py +++ b/torchao/quantization/quant_api.py @@ -1608,7 +1608,7 @@ def get_weight_block_size(x): granularity=config.granularity, act_quant_kwargs=QuantizeTensorToInt8Kwargs( granularity=act_granularity, - act_mapping_type=config.act_mapping_type, + mapping_type=config.act_mapping_type, ), ) diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py index 420e7f5b81..ea0eeadca4 100644 --- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py +++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py @@ -37,12 +37,14 @@ class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs): """ granularity: Granularity = PerRow() - act_mapping_type: MappingType = MappingType.SYMMETRIC + mapping_type: MappingType = MappingType.SYMMETRIC class Int8Tensor(TorchAOBaseTensor): """ - int8 quantized tensor with plain layout + int8 quantized tensor with plain layout. + + Currently only Symmetric quantization is supported. 
Tensor Attributes: qdata: (N, K) or (B, N, K) int8 quantized weight data (2D or 3D) @@ -73,7 +75,7 @@ def __new__( ): kwargs = { "device": qdata.device, - "dtype": dtype, + "dtype": dtype or scale.dtype, "requires_grad": False, } return torch.Tensor._make_wrapper_subclass(cls, qdata.shape, **kwargs) @@ -110,6 +112,7 @@ def from_hp( hp_tensor: torch.Tensor, granularity: Granularity = PerRow(), act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None, + mapping_type=MappingType.SYMMETRIC, ): """Create Int8Tensor from high-precision tensor""" block_size = get_block_size(hp_tensor.shape, granularity) @@ -117,7 +120,7 @@ def from_hp( scale, zero_point = choose_qparams_affine( input=hp_tensor, - mapping_type=MappingType.SYMMETRIC, + mapping_type=mapping_type, block_size=block_size, target_dtype=torch.int8, quant_min=-128, @@ -179,7 +182,8 @@ def _(func, types, args, kwargs): if weight_tensor.act_quant_kwargs is not None: activation_tensor = Int8Tensor.from_hp( - activation_tensor, weight_tensor.act_quant_kwargs.granularity + activation_tensor, + granularity=weight_tensor.act_quant_kwargs.granularity, ) # Dynamic activation quantization path From 7f730621e0cacef0b59073a4b6cb2d2754aad752 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Wed, 3 Dec 2025 16:15:33 -0800 Subject: [PATCH 14/19] address granularity cr --- torchao/quantization/quant_api.py | 10 +++++++--- .../quantize_/workflows/int8/int8_tensor.py | 7 ++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py index 5feb808cb6..e94b7ba82c 100644 --- a/torchao/quantization/quant_api.py +++ b/torchao/quantization/quant_api.py @@ -1526,7 +1526,7 @@ class Int8DynamicActivationInt8WeightConfig(AOBaseConfig): layout: Optional[Layout] = PlainLayout() act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC weight_only_decode: bool = False - granularity: Optional[Union[Granularity, List[Granularity]]] = PerRow() + granularity: Union[PerRow, PerTensor] = PerRow() set_inductor_config: bool = True version: int = 1 @@ -1597,15 +1597,19 @@ def get_weight_block_size(x): assert config.granularity in {PerRow(), PerTensor()}, ( "Only PerRow and PerTensor are supported" ) - weight_granularity, act_granularity = _normalize_granularity(config.granularity) + weight_granularity = config.granularity + act_granularity = config.granularity + assert config.act_mapping_type == MappingType.SYMMETRIC, ( + "asymmetric dynamic quant not supported currently" + ) assert config.version == 2, f"Unexpected version: {config.version}" # TODO: Symmentric/Asymmetric choice for weight quantization # https://github.com/pytorch/ao/pull/3241#discussion_r2551515539 quantized_weight = Int8Tensor.from_hp( weight, - granularity=config.granularity, + granularity=weight_granularity, act_quant_kwargs=QuantizeTensorToInt8Kwargs( granularity=act_granularity, mapping_type=config.act_mapping_type, diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py index ea0eeadca4..84b82f3c44 100644 --- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py +++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py @@ -30,10 +30,11 @@ @dataclass class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs): - """Tensor kwargs for creating int8 tensor from high precision + """Tensor kwargs for creating int8 tensor for activation. 
Args: granularity: the granularity for the Tensor, currently either PerRow() or PerTensor() + mapping_type: whether to use symmetric or asymmetric quant, only symmetric is supported currently """ granularity: Granularity = PerRow() @@ -151,7 +152,7 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor return dequantize_affine( input=self.qdata, block_size=self.block_size, - scale=self.scale.squeeze(), + scale=self.scale, zero_point=None, input_dtype=torch.int8, quant_min=-128, @@ -298,7 +299,7 @@ def _(func, types, args, kwargs): assert len(old_int8_tensor.qdata.shape) == len(old_int8_tensor.block_size), ( "unsupported" ) - new_int8_tensor = old_int8_tensor.__class__( + new_int8_tensor = Int8Tensor( old_int8_tensor.qdata[index], old_int8_tensor.scale[index], old_int8_tensor.block_size[1:], From ac6a2b6d75183937d9ae4090f9fdf74fa296170f Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 4 Dec 2025 12:12:23 -0800 Subject: [PATCH 15/19] update _choose_quant_func_and_quantize_tensor --- torchao/quantization/quant_api.py | 1 + .../quantize_/common/quantize_tensor_kwargs.py | 9 +++++++++ .../quantize_/workflows/int8/int8_tensor.py | 10 ++++++---- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py index e94b7ba82c..df57273c7a 100644 --- a/torchao/quantization/quant_api.py +++ b/torchao/quantization/quant_api.py @@ -1610,6 +1610,7 @@ def get_weight_block_size(x): quantized_weight = Int8Tensor.from_hp( weight, granularity=weight_granularity, + mapping_type=MappingType.SYMMETRIC, act_quant_kwargs=QuantizeTensorToInt8Kwargs( granularity=act_granularity, mapping_type=config.act_mapping_type, diff --git a/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py b/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py index 0adc8c786d..16ad39abaa 100644 --- a/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py +++ b/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py @@ -39,7 +39,9 @@ def _choose_quant_func_and_quantize_tensor( """ from torchao.quantization.quantize_.workflows import ( Float8Tensor, + Int8Tensor, QuantizeTensorToFloat8Kwargs, + QuantizeTensorToInt8Kwargs, ) if isinstance(quant_kwargs, QuantizeTensorToFloat8Kwargs): @@ -53,4 +55,11 @@ def _choose_quant_func_and_quantize_tensor( quant_kwargs.kernel_preference, ) + if isinstance(quant_kwargs, QuantizeTensorToInt8Kwargs): + return Int8Tensor.from_hp( + tensor, + quant_kwargs.granularity, + quant_kwargs.mapping_type, + ) + raise NotImplementedError(f"Quant kwargs not supported: {quant_kwargs}") diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py index 84b82f3c44..d2a352ceac 100644 --- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py +++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py @@ -19,7 +19,10 @@ dequantize_affine, quantize_affine, ) -from torchao.quantization.quantize_.common import QuantizeTensorKwargs +from torchao.quantization.quantize_.common import ( + QuantizeTensorKwargs, + _choose_quant_func_and_quantize_tensor, +) from torchao.quantization.utils import get_block_size from torchao.utils import TorchAOBaseTensor, fill_defaults @@ -182,9 +185,8 @@ def _(func, types, args, kwargs): output_dtype = activation_tensor.dtype if weight_tensor.act_quant_kwargs is not None: - activation_tensor = Int8Tensor.from_hp( - activation_tensor, - 
granularity=weight_tensor.act_quant_kwargs.granularity, + activation_tensor = _choose_quant_func_and_quantize_tensor( + activation_tensor, weight_tensor.act_quant_kwargs ) # Dynamic activation quantization path From f28df4a8f0a4821ed2b59b9d0d6c6da2bae90f14 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 4 Dec 2025 12:20:03 -0800 Subject: [PATCH 16/19] make block size required attribute --- .../quantize_/workflows/int8/int8_tensor.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py index d2a352ceac..40f6bb377e 100644 --- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py +++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py @@ -62,9 +62,8 @@ class Int8Tensor(TorchAOBaseTensor): # TODO: Static quantization support using `static_scale` tensor_data_names = ["qdata", "scale"] - tensor_attribute_names = [] + tensor_attribute_names = ["block_size"] optional_tensor_attribute_names = [ - "block_size", "act_quant_kwargs", "dtype", ] @@ -73,7 +72,7 @@ def __new__( cls: type, qdata: torch.Tensor, scale: torch.Tensor, - block_size: Optional[List[int]] = None, + block_size: List[int] = None, act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None, dtype: Optional[torch.dtype] = None, ): @@ -88,7 +87,7 @@ def __init__( self, qdata: torch.Tensor, scale: torch.Tensor, - block_size: Optional[List[int]] = None, + block_size: List[int], act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None, dtype: Optional[torch.dtype] = None, ): @@ -145,7 +144,7 @@ def from_hp( return cls( int_data, scale, - block_size=block_size, + block_size, act_quant_kwargs=act_quant_kwargs, dtype=hp_tensor.dtype, ) @@ -273,7 +272,7 @@ def _(func, types, args, kwargs): Int8Tensor( sliced_qdata, sliced_scale, - block_size=block_size, + block_size, act_quant_kwargs=self.act_quant_kwargs, dtype=self.dtype, ), From 328585e4a45d63899dd138f80a943d48414a4a0f Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 4 Dec 2025 12:26:46 -0800 Subject: [PATCH 17/19] made dtype required as well --- torchao/quantization/quant_api.py | 1 - .../quantize_/workflows/int8/int8_tensor.py | 20 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py index df57273c7a..e94b7ba82c 100644 --- a/torchao/quantization/quant_api.py +++ b/torchao/quantization/quant_api.py @@ -1610,7 +1610,6 @@ def get_weight_block_size(x): quantized_weight = Int8Tensor.from_hp( weight, granularity=weight_granularity, - mapping_type=MappingType.SYMMETRIC, act_quant_kwargs=QuantizeTensorToInt8Kwargs( granularity=act_granularity, mapping_type=config.act_mapping_type, diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py index 40f6bb377e..3854c0ec93 100644 --- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py +++ b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py @@ -62,23 +62,22 @@ class Int8Tensor(TorchAOBaseTensor): # TODO: Static quantization support using `static_scale` tensor_data_names = ["qdata", "scale"] - tensor_attribute_names = ["block_size"] + tensor_attribute_names = ["block_size", "dtype"] optional_tensor_attribute_names = [ "act_quant_kwargs", - "dtype", ] def __new__( cls: type, qdata: torch.Tensor, scale: torch.Tensor, - block_size: List[int] = None, + block_size: List[int], + 
dtype: torch.dtype, act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None, - dtype: Optional[torch.dtype] = None, ): kwargs = { "device": qdata.device, - "dtype": dtype or scale.dtype, + "dtype": dtype, "requires_grad": False, } return torch.Tensor._make_wrapper_subclass(cls, qdata.shape, **kwargs) @@ -88,13 +87,14 @@ def __init__( qdata: torch.Tensor, scale: torch.Tensor, block_size: List[int], + dtype: torch.dtype, act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None, - dtype: Optional[torch.dtype] = None, ): super().__init__() self.qdata = qdata self.scale = scale self.block_size = block_size + # don't set dtype because this gets done in __new__ self.act_quant_kwargs = act_quant_kwargs def __repr__(self): @@ -145,8 +145,8 @@ def from_hp( int_data, scale, block_size, + hp_tensor.dtype, act_quant_kwargs=act_quant_kwargs, - dtype=hp_tensor.dtype, ) def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor: @@ -159,7 +159,7 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor input_dtype=torch.int8, quant_min=-128, quant_max=127, - output_dtype=output_dtype or self.dtype, + output_dtype=output_dtype if output_dtype is not None else self.dtype, ) @@ -273,8 +273,8 @@ def _(func, types, args, kwargs): sliced_qdata, sliced_scale, block_size, + self.dtype, act_quant_kwargs=self.act_quant_kwargs, - dtype=self.dtype, ), ) @@ -304,8 +304,8 @@ def _(func, types, args, kwargs): old_int8_tensor.qdata[index], old_int8_tensor.scale[index], old_int8_tensor.block_size[1:], - old_int8_tensor.act_quant_kwargs, old_int8_tensor.dtype, + old_int8_tensor.act_quant_kwargs, ) return return_and_correct_aliasing(func, args, kwargs, new_int8_tensor) From ce4d568fdc4cac0e18d2ebf7b4752d2c91ec6653 Mon Sep 17 00:00:00 2001 From: Jesse Cai Date: Thu, 4 Dec 2025 12:45:12 -0800 Subject: [PATCH 18/19] address nits --- torchao/quantization/quant_api.py | 2 +- .../quantization/quantize_/common/quantize_tensor_kwargs.py | 2 +- .../quantization/quantize_/workflows/int8/int8_tensor.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py index e94b7ba82c..24d6b6676c 100644 --- a/torchao/quantization/quant_api.py +++ b/torchao/quantization/quant_api.py @@ -1526,7 +1526,7 @@ class Int8DynamicActivationInt8WeightConfig(AOBaseConfig): layout: Optional[Layout] = PlainLayout() act_mapping_type: Optional[MappingType] = MappingType.SYMMETRIC weight_only_decode: bool = False - granularity: Union[PerRow, PerTensor] = PerRow() + granularity: Granularity = PerRow() set_inductor_config: bool = True version: int = 1 diff --git a/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py b/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py index 16ad39abaa..e4544a2f0c 100644 --- a/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py +++ b/torchao/quantization/quantize_/common/quantize_tensor_kwargs.py @@ -59,7 +59,7 @@ def _choose_quant_func_and_quantize_tensor( return Int8Tensor.from_hp( tensor, quant_kwargs.granularity, - quant_kwargs.mapping_type, + mapping_type=quant_kwargs.mapping_type, ) raise NotImplementedError(f"Quant kwargs not supported: {quant_kwargs}") diff --git a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py index 3854c0ec93..dd422b90f6 100644 --- a/torchao/quantization/quantize_/workflows/int8/int8_tensor.py +++ 
b/torchao/quantization/quantize_/workflows/int8/int8_tensor.py
@@ -12,7 +12,7 @@

 from torchao.float8.inference import _slice_scale_for_dimension
 from torchao.kernel import int_scaled_matmul
-from torchao.quantization.granularity import Granularity, PerRow
+from torchao.quantization.granularity import Granularity
 from torchao.quantization.quant_primitives import (
     MappingType,
     choose_qparams_affine,
@@ -40,7 +40,7 @@ class QuantizeTensorToInt8Kwargs(QuantizeTensorKwargs):
         mapping_type: whether to use symmetric or asymmetric quant, only symmetric is supported currently
     """

-    granularity: Granularity = PerRow()
+    granularity: Granularity
     mapping_type: MappingType = MappingType.SYMMETRIC


@@ -113,7 +113,7 @@ def __repr__(self):
     def from_hp(
         cls,
         hp_tensor: torch.Tensor,
-        granularity: Granularity = PerRow(),
+        granularity: Granularity,
         act_quant_kwargs: Optional[QuantizeTensorToInt8Kwargs] = None,
         mapping_type=MappingType.SYMMETRIC,
     ):

From a665d451c0cb3baccefd246befd55ba72e730f28 Mon Sep 17 00:00:00 2001
From: Jesse Cai
Date: Thu, 4 Dec 2025 13:42:59 -0800
Subject: [PATCH 19/19] skip per tensor weight only test for now

---
 .../quantize_/workflows/int8/test_int8_tensor.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
index 795e7d6ed2..2819903e69 100644
--- a/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
+++ b/test/quantization/quantize_/workflows/int8/test_int8_tensor.py
@@ -109,6 +109,11 @@ def test_int8_linear_variants(
         self.assertEqual(model_q.linear2.weight.scale.ndim, 2)
 
         if compile:
+            if isinstance(config, Int8WeightOnlyConfig) and isinstance(
+                config.granularity, PerTensor
+            ):
+                # Skipping for now: the inductor lowering in core for weight-only quant does not yet support per-tensor granularity on GPU, so compile errors here; this will be addressed in core
+                return
             model_q = torch.compile(model_q, fullgraph=True)
 
         output_fp = model(input_tensor)
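Taken together, the series leaves the following version=2 user flow. A minimal end-to-end sketch (assuming a CUDA device; the SQNR bound mirrors the tolerance used in the tests above):

```
import torch
from torchao.quantization import Int8DynamicActivationInt8WeightConfig, quantize_
from torchao.quantization.granularity import PerRow
from torchao.quantization.utils import compute_error

linear = torch.nn.Linear(512, 256, bias=False, dtype=torch.bfloat16, device="cuda")
ref_weight = linear.weight.detach().clone()
quantize_(linear, Int8DynamicActivationInt8WeightConfig(version=2, granularity=PerRow()))

w = linear.weight
# the weight carries the activation-quantization recipe; at dispatch time the
# activation is routed through _choose_quant_func_and_quantize_tensor
assert w.act_quant_kwargs is not None
assert w.act_quant_kwargs.granularity == PerRow()
assert list(w.block_size) == [1, 512]  # PerRow: one scale per output row

# dequantize should round-trip close to the original weight
assert compute_error(ref_weight, w.dequantize()) > 20

x = torch.randn(32, 512, dtype=torch.bfloat16, device="cuda")
out = linear(x)  # int8 dynamic-activation x int8-weight matmul path
```

Storing the activation recipe on the weight, rather than wrapping it in LinearActivationQuantizedTensor as the version 1 flow does, is what lets `_choose_quant_func_and_quantize_tensor` serve as the single dispatch point for both the float8 and int8 workflows.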