diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py
index 7dac4049feb..e0256437022 100644
--- a/backends/cadence/aot/quantizer/quantizer.py
+++ b/backends/cadence/aot/quantizer/quantizer.py
@@ -386,3 +386,16 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
         quantizers.append(CadenceAtenQuantizer(Conv1dPattern(), qconfig_A16))
         quantizers.append(CadenceAtenQuantizer(Conv2dPattern(), qconfig_A16))
         super().__init__(quantizers)
+
+
+class CadenceWith16BitMatmulActivationsQuantizer(CadenceQuantizer):
+    """
+    Quantizer including A16 matmul
+    """
+
+    def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
+        if quantizers is None:
+            quantizers = []
+        # Add 16-bit quantizers for MatmulPattern
+        quantizers.append(CadenceAtenQuantizer(MatmulPattern(), qconfig_A16))
+        super().__init__(quantizers)
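A minimal usage sketch (not part of the diff) of how the new CadenceWith16BitMatmulActivationsQuantizer might be driven through the standard PT2E prepare/convert flow. The toy module, input shapes, and the export_for_training call are illustrative assumptions, not the Cadence AoT pipeline itself.

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from executorch.backends.cadence.aot.quantizer.quantizer import (
    CadenceWith16BitMatmulActivationsQuantizer,
)

class MatmulModel(torch.nn.Module):
    def forward(self, x, y):
        return torch.matmul(x, y)

model = MatmulModel().eval()
inputs = (torch.randn(4, 8), torch.randn(8, 16))

# Export, annotate matmuls with the A16 config, calibrate, then convert.
exported = torch.export.export_for_training(model, inputs).module()
quantizer = CadenceWith16BitMatmulActivationsQuantizer()
prepared = prepare_pt2e(exported, quantizer)
prepared(*inputs)  # calibration pass
converted = convert_pt2e(prepared)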
diff --git a/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp b/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp
index 90fe483660b..5b615c41386 100644
--- a/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp
+++ b/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp
@@ -8,6 +8,7 @@
 
 #include
 #include
+#include
 #include
 
 using executorch::aten::ScalarType;
@@ -192,8 +193,20 @@ void quantized_matmul_out(
   size_t leading_dim = X.size(X.dim() - 2);
   size_t out_dim = Y.size(Y.dim() - 1 - transposed);
   size_t in_dim = X.size(X.dim() - 1);
-
-  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+  if (out.scalar_type() == exec_aten::ScalarType::Short) {
+    ::impl::generic::native::quantized_matmul_out(
+        ctx,
+        X,
+        X_zero_point,
+        Y,
+        Y_zero_point,
+        bias,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        transposed,
+        out);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Byte) {
     _typed_quantized_matmul(
         ctx,
         X,
diff --git a/backends/cadence/hifi/operators/op_quantized_matmul_out.h b/backends/cadence/hifi/operators/op_quantized_matmul_out.h
new file mode 100644
index 00000000000..c53a07b58aa
--- /dev/null
+++ b/backends/cadence/hifi/operators/op_quantized_matmul_out.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include "executorch/runtime/core/exec_aten/exec_aten.h"
+#include "executorch/runtime/kernel/kernel_runtime_context.h"
+
+namespace impl {
+namespace HiFi {
+namespace native {
+
+::executorch::aten::Tensor& quantized_matmul_out(
+    ::executorch::runtime::KernelRuntimeContext& ctx,
+    const ::executorch::aten::Tensor& X,
+    int64_t X_zero_point,
+    const ::executorch::aten::Tensor& Y,
+    int64_t Y_zero_point,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    bool transposed,
+    ::executorch::aten::Tensor& out);
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl
diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl
index c993745c4c0..9ff7f060277 100644
--- a/backends/cadence/hifi/operators/targets.bzl
+++ b/backends/cadence/hifi/operators/targets.bzl
@@ -2,7 +2,7 @@ load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX")
 
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
-def define_operator(name: str, deps: list[str] | None = None) -> None:
+def define_operator(name: str, deps: list[str] | None = None, exported_headers: list[str] | None = None) -> None:
     op_name = "op_{}".format(name)
 
     # Deps used by all operators.
@@ -21,6 +21,8 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
     ]
 
     if deps == None:
         deps = []
+    if exported_headers == None:
+        exported_headers = ["operators.h"]
 
     runtime.cxx_library(
@@ -32,7 +34,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         ],
         compatible_with = ["ovr_config//cpu:xtensa"],
         deps = deps + common_deps,
-        exported_headers = ["operators.h"],
+        exported_headers = exported_headers,
     )
 
 OPERATORS = [
@@ -87,7 +89,6 @@ OPERATORS = [
     "quantized_layer_norm",
     "quantized_linear_asym8sxasym8s_asym8s_per_tensor_out",
     "quantized_linear_asym8uxasym8u_asym8u_per_tensor_out",
-    "quantized_matmul_out",
     "quantized_matmul_asym8sxasym8s_asym8s_out",
     "quantized_matmul_asym8uxasym8u_asym8u_out",
     "quantized_relu_out",
@@ -127,3 +128,6 @@ def define_common_targets():
     # quantized_conv2d_nchw_out and quantized_conv2d_nhwc_out need additional dependency for int16 support
     define_operator("quantized_conv2d_nchw_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_conv2d"])
     define_operator("quantized_conv2d_nhwc_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_conv2d"])
+
+    # quantized_matmul_out needs additional dependency for int16 support
+    define_operator("quantized_matmul_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_matmul"], exported_headers=["op_quantized_matmul_out.h"])
diff --git a/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp b/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp
new file mode 100644
index 00000000000..3286913f055
--- /dev/null
+++ b/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace impl {
+namespace HiFi {
+namespace native {
+namespace {
+
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::aten::TensorImpl;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::executorch::runtime::runtime_init;
+using ::executorch::runtime::testing::TensorFactory;
+
+class HiFiQuantizedMatmulTest : public OperatorTest {
+ public:
+ protected:
+  Tensor& quantized_matmul_out(
+      const Tensor& X,
+      int64_t X_zero_point,
+      const Tensor& Y,
+      int64_t Y_zero_point,
+      const std::optional<Tensor>& bias,
+      int64_t out_multiplier,
+      int64_t out_shift,
+      int64_t out_zero_point,
+      bool transposed,
+      Tensor& output) {
+    return impl::HiFi::native::quantized_matmul_out(
+        context_,
+        X,
+        X_zero_point,
+        Y,
+        Y_zero_point,
+        bias,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        transposed,
+        output);
+  }
+};
+
+// Test quantized_matmul_out with int16 activations and int8 weights
+TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16Test) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+
+  // Minimal test case: X [2, 2] x Y [2, 2] = output [2, 2]
+  // Small enough to verify by hand calculation
+  //
+  // X (2x2):      Y (2x2):
+  //   2  4          1  2
+  //   6  8          1  0
+  //
+  // Hand calculation for matmul (before scaling):
+  //   (0,0): 2*1 + 4*1 = 6
+  //   (0,1): 2*2 + 4*0 = 4
+  //   (1,0): 6*1 + 8*1 = 14
+  //   (1,1): 6*2 + 8*0 = 12
+  //
+  // Raw result: [[6, 4], [14, 12]]
+  // After 0.5 scaling: [[3, 2], [7, 6]]
+  Tensor X = tf_int16.make({2, 2}, {2, 4, 6, 8});
+  Tensor Y = tf_int8.make({2, 2}, {1, 2, 1, 0});
+  Tensor bias = tf_int32.zeros({2});
+  Tensor output = tf_int16.zeros({2, 2});
+
+  int64_t X_zero_point = 0;
+  int64_t Y_zero_point = 0;
+  int64_t out_multiplier = 1073741824; // 0.5 * 2^31
+  int64_t out_shift = 0;
+  int64_t out_zero_point = 0;
+
+  quantized_matmul_out(
+      X,
+      X_zero_point,
+      Y,
+      Y_zero_point,
+      bias,
+      out_multiplier,
+      out_shift,
+      out_zero_point,
+      false, // transposed
+      output);
+
+  Tensor expected = tf_int16.make({2, 2}, {3, 2, 7, 6});
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+// Test quantized_matmul_out with transposed Y (int16 activations and int8
+// weights)
+TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16TransposedTest) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+
+  // Minimal test case with transposed Y: X [2, 2] x Y^T [2, 2] = output [2, 2]
+  // Y is stored transposed, so we compute X @ Y^T
+  //
+  // X (2x2):      Y_stored (2x2, which is Y^T):
+  //   2  4          1  1
+  //   6  8          2  0
+  //
+  // When transposed=true, we compute X @ Y_stored^T = X @ Y
+  // Y = Y_stored^T = [[1, 2], [1, 0]]
+  //
+  // Hand calculation for matmul (before scaling):
+  //   (0,0): 2*1 + 4*1 = 6
+  //   (0,1): 2*2 + 4*0 = 4
+  //   (1,0): 6*1 + 8*1 = 14
+  //   (1,1): 6*2 + 8*0 = 12
+  //
+  // Raw result: [[6, 4], [14, 12]]
+  // After 0.5 scaling: [[3, 2], [7, 6]]
+  Tensor X = tf_int16.make({2, 2}, {2, 4, 6, 8});
+  Tensor Y = tf_int8.make({2, 2}, {1, 1, 2, 0}); // Stored as Y^T
+  Tensor bias = tf_int32.zeros({2});
+  Tensor output = tf_int16.zeros({2, 2});
+
+  int64_t X_zero_point = 0;
+  int64_t Y_zero_point = 0;
+  int64_t out_multiplier = 1073741824; // 0.5 * 2^31
+  int64_t out_shift = 0;
+  int64_t out_zero_point = 0;
+
+  quantized_matmul_out(
+      X,
+      X_zero_point,
+      Y,
+      Y_zero_point,
+      bias,
+      out_multiplier,
+      out_shift,
+      out_zero_point,
+      true, // transposed
+      output);
+
+  Tensor expected = tf_int16.make({2, 2}, {3, 2, 7, 6});
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+} // namespace
+} // namespace native
+} // namespace HiFi
+} // namespace impl
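As a reference for the hand calculations in the tests above, a small Python cross-check (not part of the diff) that reproduces the expected int16 outputs, assuming the common fixed-point requantization out = round(acc * out_multiplier / 2^31 / 2^out_shift) + out_zero_point; the kernel's exact rounding mode may differ, though not for these values.

import numpy as np

X = np.array([[2, 4], [6, 8]], dtype=np.int64)
Y = np.array([[1, 2], [1, 0]], dtype=np.int64)
X_zero_point, Y_zero_point, out_zero_point = 0, 0, 0
out_multiplier, out_shift = 1073741824, 0  # 0.5 * 2^31, no extra shift

# Integer matmul accumulation, then requantize to the int16 output range.
acc = (X - X_zero_point) @ (Y - Y_zero_point)  # [[6, 4], [14, 12]]
out = np.rint(acc * out_multiplier / 2**31 / 2**out_shift) + out_zero_point
print(out.astype(np.int16))  # [[3 2] [7 6]], matching `expected` in both tests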