
Commit b8769e4

RahulC7 authored and facebook-github-bot committed
Using generic implementation for 16-bit activations and 8-bit weights for matmul in backends
Summary:

# Context

We continue from D84284794 to add support for 16-bit activations. Note that although the backends already support 16-bit activations, they do so only when the weights are also 16-bit. To change this, we need to change the way we template some functions.

# Current Behavior

Right now, we compose two macros: the `ET_FORALL_JARVIS_QUANTIZED_TYPES_WITH_INT16` macro:

https://www.internalfb.com/code/fbsource/[9e8c6d8466107f58aa3de1b9e4ec71c49d670a8f]/fbcode/on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/operators.h?lines=22-25

and the function macro (`quantized_linear` chosen as an example):

https://www.internalfb.com/code/fbsource/[9e8c6d8466107f58aa3de1b9e4ec71c49d670a8f]/fbcode/on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/quantized_linear_out.cpp?lines=30-41

Together, they expand into a switch statement that calls the `quantized_linear` function with the correct template parameter. However, this assumes that the input activations and the weights share the same dtype, which is not the case here.

# This Diff

We now use the generic implementation in the backends, and add a unit test as well as e2e tests.

Differential Revision: D87997149
1 parent 9b21e73 · commit b8769e4
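To make the coupling described in the summary concrete, here is a minimal, self-contained C++ sketch of the pattern: a dtype switch that forwards to a kernel templated on a single element type can only pair 16-bit activations with 16-bit weights, whereas templating on the activation and weight types separately also allows the A16/W8 case. All names in the sketch (`quantized_linear_generic`, `dispatch_quantized_linear`, `Dtype`) are illustrative placeholders, not the actual Jarvis/ExecuTorch symbols.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Kernel templated separately on the activation and weight element types,
// so 16-bit activations can be paired with 8-bit weights.
template <typename ActT, typename WeightT>
void quantized_linear_generic(
    const std::vector<ActT>& x,
    const std::vector<WeightT>& w,
    std::vector<int32_t>& out) {
  for (std::size_t i = 0; i < out.size(); ++i) {
    out[i] = static_cast<int32_t>(x[i]) * static_cast<int32_t>(w[i]);
  }
}

enum class Dtype { Int8, Int16 };

// The old macro expansion was effectively a switch like this, but with a
// single template parameter shared by activations and weights; dispatching
// A16 activations onto W8 weights requires the two-parameter form above.
void dispatch_quantized_linear(
    Dtype act_dtype,
    const std::vector<int16_t>& x16,
    const std::vector<int8_t>& w8,
    std::vector<int32_t>& out) {
  switch (act_dtype) {
    case Dtype::Int16:
      quantized_linear_generic<int16_t, int8_t>(x16, w8, out);
      break;
    default:
      break;
  }
}

int main() {
  std::vector<int16_t> x = {300, -200};
  std::vector<int8_t> w = {2, 3};
  std::vector<int32_t> out(2);
  dispatch_quantized_linear(Dtype::Int16, x, w, out);
  std::printf("%d %d\n", out[0], out[1]); // prints: 600 -600
  return 0;
}
```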

5 files changed: +214 -6 lines


backends/cadence/aot/quantizer/quantizer.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -386,3 +386,16 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
         quantizers.append(CadenceAtenQuantizer(Conv1dPattern(), qconfig_A16))
         quantizers.append(CadenceAtenQuantizer(Conv2dPattern(), qconfig_A16))
         super().__init__(quantizers)
+
+
+class CadenceWith16BitMatmulActivationsQuantizer(CadenceQuantizer):
+    """
+    Quantizer including A16 matmul
+    """
+
+    def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
+        if quantizers is None:
+            quantizers = []
+        # Add 16-bit quantizers for MatmulPattern
+        quantizers.append(CadenceAtenQuantizer(MatmulPattern(), qconfig_A16))
+        super().__init__(quantizers)
```

backends/cadence/hifi/operators/op_quantized_matmul_out.cpp

Lines changed: 15 additions & 2 deletions
```diff
@@ -8,6 +8,7 @@
 
 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/op_quantized_matmul.h>
 #include <stdlib.h>
 
 using executorch::aten::ScalarType;
@@ -192,8 +193,20 @@ void quantized_matmul_out(
   size_t leading_dim = X.size(X.dim() - 2);
   size_t out_dim = Y.size(Y.dim() - 1 - transposed);
   size_t in_dim = X.size(X.dim() - 1);
-
-  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+  if (out.scalar_type() == exec_aten::ScalarType::Short) {
+    ::impl::generic::native::quantized_matmul_out(
+        ctx,
+        X,
+        X_zero_point,
+        Y,
+        Y_zero_point,
+        bias,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        transposed,
+        out);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Byte) {
     _typed_quantized_matmul<uint8_t>(
         ctx,
         X,
```
backends/cadence/hifi/operators/op_quantized_matmul_out.h

Lines changed: 33 additions & 0 deletions

```diff
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include "executorch/runtime/core/exec_aten/exec_aten.h"
+#include "executorch/runtime/kernel/kernel_runtime_context.h"
+
+namespace impl {
+namespace HiFi {
+namespace native {
+
+::executorch::aten::Tensor& quantized_matmul_out(
+    ::executorch::runtime::KernelRuntimeContext& ctx,
+    const ::executorch::aten::Tensor& X,
+    int64_t X_zero_point,
+    const ::executorch::aten::Tensor& Y,
+    int64_t Y_zero_point,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    bool transposed,
+    ::executorch::aten::Tensor& out);
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl
```

backends/cadence/hifi/operators/targets.bzl

Lines changed: 8 additions & 4 deletions
```diff
@@ -2,7 +2,7 @@ load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
 
-def define_operator(name: str, deps: list[str] | None = None) -> None:
+def define_operator(name: str, deps: list[str] | None = None, exported_headers: list[str] | None = None) -> None:
     op_name = "op_{}".format(name)
 
     # Deps used by all operators.
@@ -21,6 +21,8 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
     ]
     if deps == None:
         deps = []
+    if exported_headers == None:
+        exported_headers = ["operators.h"]
 
     runtime.cxx_library(
         name = op_name,
@@ -32,7 +34,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         ],
         compatible_with = ["ovr_config//cpu:xtensa"],
         deps = deps + common_deps,
-        exported_headers = ["operators.h"],
+        exported_headers = exported_headers,
     )
 
 OPERATORS = [
@@ -87,7 +89,6 @@ OPERATORS = [
     "quantized_layer_norm",
    "quantized_linear_asym8sxasym8s_asym8s_per_tensor_out",
    "quantized_linear_asym8uxasym8u_asym8u_per_tensor_out",
-    "quantized_matmul_out",
    "quantized_matmul_asym8sxasym8s_asym8s_out",
    "quantized_matmul_asym8uxasym8u_asym8u_out",
    "quantized_relu_out",
@@ -124,6 +125,9 @@ def define_common_targets():
     define_operator("quantized_linear_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_linear"])
     define_operator("quantized_linear_per_tensor_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_linear"])
 
-    # quantized_conv2d_nchw_out and quantized_conv2d_nhwc_out need additional dependency for int16 support
+    # quantized_conv2d_nchw_out and quantized_conv2d_nhwc_out need additional dependency for int16 support
     define_operator("quantized_conv2d_nchw_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_conv2d"])
     define_operator("quantized_conv2d_nhwc_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_conv2d"])
+
+    # quantized_matmul_out needs additional dependency for int16 support
+    define_operator("quantized_matmul_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_matmul"], exported_headers=["op_quantized_matmul_out.h"])
```
Lines changed: 145 additions & 0 deletions
```diff
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/hifi/operators/op_quantized_matmul_out.h>
+#include <gtest/gtest.h>
+
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <executorch/runtime/platform/runtime.h>
+
+namespace impl {
+namespace HiFi {
+namespace native {
+namespace {
+
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::aten::TensorImpl;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::executorch::runtime::runtime_init;
+using ::executorch::runtime::testing::TensorFactory;
+
+class HiFiQuantizedMatmulTest : public OperatorTest {
+ public:
+ protected:
+  Tensor& quantized_matmul_out(
+      const Tensor& X,
+      int64_t X_zero_point,
+      const Tensor& Y,
+      int64_t Y_zero_point,
+      const std::optional<Tensor>& bias,
+      int64_t out_multiplier,
+      int64_t out_shift,
+      int64_t out_zero_point,
+      bool transposed,
+      Tensor& output) {
+    return impl::HiFi::native::quantized_matmul_out(
+        context_,
+        X,
+        X_zero_point,
+        Y,
+        Y_zero_point,
+        bias,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        transposed,
+        output);
+  }
+};
+
+// Test quantized_matmul_out with int16 activations and int8 weights
+TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16Test) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+
+  // Simple 2D case: X [64, 33] x Y [33, 128] = output [64, 128]
+  // Using simple values for testing
+  Tensor X = tf_int16.ones({64, 33});
+  Tensor Y = tf_int8.ones({33, 128});
+  // Bias not used
+  Tensor bias = tf_int32.full({128}, -30);
+  Tensor output = tf_int16.zeros({64, 128});
+
+  int64_t X_zero_point = 0;
+  int64_t Y_zero_point = 0;
+  int64_t out_multiplier = 1073741824; // 0.5 * 2^31
+  int64_t out_shift = 0;
+  int64_t out_zero_point = 0;
+
+  quantized_matmul_out(
+      X,
+      X_zero_point,
+      Y,
+      Y_zero_point,
+      bias, // pass bias tensor
+      out_multiplier,
+      out_shift,
+      out_zero_point,
+      false, // transposed
+      output);
+
+  // Verify the output is correct
+  // With all ones input and weights, inner dimension is 33
+  // Matmul result: 33, with out_multiplier = 0.5 * 2^31 (scales by 0.5)
+  // Expected value: 33 * 0.5 = 16.5 ≈ 16
+  Tensor expected = tf_int16.full({64, 128}, 16);
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+// Test quantized_matmul_out with transposed Y (int16 activations and int8
+// weights)
+TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16TransposedTest) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+
+  // Transposed case: X [64, 33] x Y^T [128, 33] = output [64, 128]
+  Tensor X = tf_int16.ones({64, 33});
+  Tensor Y = tf_int8.ones({128, 33}); // Transposed
+  // Bias not used
+  Tensor bias = tf_int32.full({128}, -30);
+  Tensor output = tf_int16.zeros({64, 128});
+
+  int64_t X_zero_point = 0;
+  int64_t Y_zero_point = 0;
+  int64_t out_multiplier = 1073741824; // 0.5 * 2^31
+  int64_t out_shift = 0;
+  int64_t out_zero_point = 0;
+
+  quantized_matmul_out(
+      X,
+      X_zero_point,
+      Y,
+      Y_zero_point,
+      bias, // pass bias tensor
+      out_multiplier,
+      out_shift,
+      out_zero_point,
+      true, // transposed
+      output);
+
+  // Verify the output is correct
+  // With all ones input and weights, inner dimension is 33
+  // Matmul result: 33, with out_multiplier = 0.5 * 2^31 (scales by 0.5)
+  // Expected value: 33 * 0.5 = 16.5 ≈ 16
+  Tensor expected = tf_int16.full({64, 128}, 16);
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+} // namespace
+} // namespace native
+} // namespace HiFi
+} // namespace impl
```
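For reference, the expected value of 16 in the tests above follows from treating `out_multiplier` as a Q31 fixed-point scale (real scale = out_multiplier / 2^31): an accumulator of 33 scaled by 0.5 gives 16.5, which lands at 16. The sketch below reproduces that arithmetic under the assumption of a truncating right shift; the kernel's exact rounding mode is not shown in this diff.

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Each output element accumulates 33 products of 1 * 1 (inner dim = 33).
  const int64_t acc = 33;
  const int64_t out_multiplier = 1073741824; // 0.5 * 2^31
  const int64_t out_shift = 0;
  const int64_t out_zero_point = 0;

  // Assumed Q31 multiply plus the extra shift: 33 * 0.5 = 16.5, truncated to 16.
  const int64_t rescaled = (acc * out_multiplier) >> (31 + out_shift);
  std::printf("%lld\n", static_cast<long long>(rescaled + out_zero_point)); // prints: 16
  return 0;
}
```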
