From 554cc990efff3cfc5d29c827aa7f6c90ab822173 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 26 Jun 2025 06:09:37 +0000 Subject: [PATCH 01/49] Fix pack test on rvv bool --- ntt/test/ctest/generate_pack_tests.py | 28 ++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/ntt/test/ctest/generate_pack_tests.py b/ntt/test/ctest/generate_pack_tests.py index 61a4ab99fd..6d929e9168 100644 --- a/ntt/test/ctest/generate_pack_tests.py +++ b/ntt/test/ctest/generate_pack_tests.py @@ -13,7 +13,33 @@ from test_generator_base import * import os -class PackTestGenerator(BaseTestGenerator): + +# is_contiguous: bool +# non_contiguous_dim: int or None +# big_tensor_op: str or None - How to build the big tensor at given non_contiguous_dim +Continuity = namedtuple('Continuity', ['is_contiguous', 'non_contiguous_dim', 'big_tensor_op']) +DataType = namedtuple('DataType', ['cpp_type', 'name_suffix', 'min_val', 'max_val']) + + +ALL_DATATYPES = [ + DataType('bool', 'Bool', 'false', 'true'), + DataType('uint8_t', 'Uint8', '0', '255'), + DataType('uint16_t', 'Uint16', '0', '65535'), + DataType('uint32_t', 'Uint32', '0', '100000'), + DataType('uint64_t', 'Uint64', '0', '1000000'), + DataType('int8_t', 'Int8', '-127', '127'), + DataType('int16_t', 'Int16', '-32767', '32767'), + DataType('int32_t', 'Int32', '-100000', '100000'), + DataType('int64_t', 'Int64', '-1000000', '1000000'), + DataType('half', 'Float16', '-65504.0', '65504.0'), + DataType('float', 'Float32', '-3.4e38', '3.4e38'), + DataType('double', 'Float64', '-1.7e308', '1.7e308'), + DataType('bfloat16', 'Bfloat16', '-3.3e38_bf16', '3.3e38_bf16'), + DataType('float_e4m3_t', 'Float8e4m3', 'float_e4m3_t(-448.0f)', 'float_e4m3_t(448.0f)'), + DataType('float_e5m2_t', 'Float8e5m2', 'float_e5m2_t(-57344.0f)', 'float_e5m2_t(57344.0f)'), +] + +class PackTestGenerator: def __init__(self): super().__init__() From bd39ac2274cdabcf09b99fb2d902383e3d788576 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 23 Jun 2025 08:02:17 +0000 Subject: [PATCH 02/49] unpack_generator initialized --- ntt/test/ctest/generate_pack_tests.py | 30 +- ntt/test/ctest/generate_pack_tests.py.bkp | 445 ++++++++++++++++++++++ ntt/test/ctest/test_generator_base.py | 36 +- 3 files changed, 482 insertions(+), 29 deletions(-) create mode 100644 ntt/test/ctest/generate_pack_tests.py.bkp diff --git a/ntt/test/ctest/generate_pack_tests.py b/ntt/test/ctest/generate_pack_tests.py index 6d929e9168..c0e980b1ba 100644 --- a/ntt/test/ctest/generate_pack_tests.py +++ b/ntt/test/ctest/generate_pack_tests.py @@ -10,36 +10,10 @@ import itertools from typing import List, Tuple -from test_generator_base import * +from test_generator_base import BaseTestGenerator, ALL_DATATYPES, Continuity, DataType, generate_cmake_list import os - -# is_contiguous: bool -# non_contiguous_dim: int or None -# big_tensor_op: str or None - How to build the big tensor at given non_contiguous_dim -Continuity = namedtuple('Continuity', ['is_contiguous', 'non_contiguous_dim', 'big_tensor_op']) -DataType = namedtuple('DataType', ['cpp_type', 'name_suffix', 'min_val', 'max_val']) - - -ALL_DATATYPES = [ - DataType('bool', 'Bool', 'false', 'true'), - DataType('uint8_t', 'Uint8', '0', '255'), - DataType('uint16_t', 'Uint16', '0', '65535'), - DataType('uint32_t', 'Uint32', '0', '100000'), - DataType('uint64_t', 'Uint64', '0', '1000000'), - DataType('int8_t', 'Int8', '-127', '127'), - DataType('int16_t', 'Int16', '-32767', '32767'), - DataType('int32_t', 'Int32', '-100000', '100000'), - DataType('int64_t', 'Int64', 
'-1000000', '1000000'), - DataType('half', 'Float16', '-65504.0', '65504.0'), - DataType('float', 'Float32', '-3.4e38', '3.4e38'), - DataType('double', 'Float64', '-1.7e308', '1.7e308'), - DataType('bfloat16', 'Bfloat16', '-3.3e38_bf16', '3.3e38_bf16'), - DataType('float_e4m3_t', 'Float8e4m3', 'float_e4m3_t(-448.0f)', 'float_e4m3_t(448.0f)'), - DataType('float_e5m2_t', 'Float8e5m2', 'float_e5m2_t(-57344.0f)', 'float_e5m2_t(57344.0f)'), -] - -class PackTestGenerator: +class PackTestGenerator(BaseTestGenerator): def __init__(self): super().__init__() diff --git a/ntt/test/ctest/generate_pack_tests.py.bkp b/ntt/test/ctest/generate_pack_tests.py.bkp new file mode 100644 index 0000000000..8e78309883 --- /dev/null +++ b/ntt/test/ctest/generate_pack_tests.py.bkp @@ -0,0 +1,445 @@ +#!/usr/bin/env python3 +""" +Generate test cases for NTT pack operations +Covering the following cases: +1. Shape types: fixed/dynamic +2. Vector dimensions: 1D/2D +3. Tensor continuity: contiguous/non-contiguous +4. Pack axes: different dimensions +""" + + + +import itertools +from typing import List, Tuple +from collections import namedtuple +import os + + +# is_contiguous: bool +# non_contiguous_dim: int or None +# big_tensor_op: str or None - How to build the big tensor at given non_contiguous_dim +Continuity = namedtuple('Continuity', ['is_contiguous', 'non_contiguous_dim', 'big_tensor_op']) +DataType = namedtuple('DataType', ['cpp_type', 'name_suffix', 'min_val', 'max_val']) + + +ALL_DATATYPES = [ + DataType('bool', 'Bool', 'false', 'true'), + DataType('uint8_t', 'Uint8', '0', '255'), + DataType('uint16_t', 'Uint16', '0', '65535'), + DataType('uint32_t', 'Uint32', '0', '100000'), + DataType('uint64_t', 'Uint64', '0', '1000000'), + DataType('int8_t', 'Int8', '-127', '127'), + DataType('int16_t', 'Int16', '-32767', '32767'), + DataType('int32_t', 'Int32', '-100000', '100000'), + DataType('int64_t', 'Int64', '-1000000', '1000000'), + DataType('half', 'Float16', '-65504.0', '65504.0'), + DataType('float', 'Float32', '-3.4e38', '3.4e38'), + DataType('double', 'Float64', '-1.7e308', '1.7e308'), + DataType('bfloat16', 'Bfloat16', '-3.3e38_bf16', '3.3e38_bf16'), + DataType('float_e4m3_t', 'Float8e4m3', 'float_e4m3_t(-448.0f)', 'float_e4m3_t(448.0f)'), + DataType('float_e5m2_t', 'Float8e5m2', 'float_e5m2_t(-57344.0f)', 'float_e5m2_t(57344.0f)'), +] + +class PackTestGenerator: + def __init__(self): + self.test_cases = [] + + def generate_test_name(self, datatype, shape_type, vector_dim, continuity: Continuity, pack_axis_str, ndim): + parts = [] + parts.append(datatype.name_suffix) + parts.append(shape_type) + parts.append(f"{vector_dim}D_vector") + + if continuity.is_contiguous: + parts.append("contiguous") + else: + op_str = "mul2" if continuity.big_tensor_op == "*2" else "add5" + parts.append(f"non_contiguous_dim{continuity.non_contiguous_dim}_{op_str}") + + parts.append(f"pack_axis_{pack_axis_str}") + parts.append(f"{ndim}D") + return "_".join(parts) + + def generate_shape_init(self, shape_type, dims): + if shape_type == "fixed": + dim_strs = [f"{d}" for d in dims] + return f"ntt::fixed_shape_v<{', '.join(dim_strs)}>" + else: # dynamic + dim_strs = [str(d) for d in dims] + return f"ntt::make_shape({', '.join(dim_strs)})" + + def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name): + code = [] + shape_expr = self.generate_shape_init(shape_type, dims) + + if continuity.is_contiguous: + code.append(f"alignas(32) auto {var_name} = ntt::make_tensor<{datatype.cpp_type}>({shape_expr});") + 
code.append(f"NttTest::init_tensor({var_name}, min_input, max_input);") + else: # non-contiguous + # Create a bigger tensor, then create view + big_dims = dims.copy() + dim_to_change = continuity.non_contiguous_dim + op = continuity.big_tensor_op + + if dim_to_change is not None and op is not None and dim_to_change < len(big_dims): + big_dims[dim_to_change] = f"({big_dims[dim_to_change]}) {op}" + + big_shape_expr = self.generate_shape_init(shape_type, big_dims) + + code.append(f"// Create non-contiguous tensor (on dimension {dim_to_change})") + code.append(f"alignas(32) auto big_tensor = ntt::make_tensor<{datatype.cpp_type}>({big_shape_expr});") + code.append(f"NttTest::init_tensor(big_tensor, min_input, max_input);") + code.append(f"") + code.append(f"auto {var_name} = ntt::make_tensor_view_from_address<{datatype.cpp_type}>(") + code.append(f" big_tensor.elements().data(),") + code.append(f" {shape_expr},") + code.append(f" big_tensor.strides());") + + return code + + def generate_pack_axes_str(self, axes): + if len(axes) == 1: + return f"ntt::fixed_shape_v<{axes[0]}>" + else: + return f"ntt::fixed_shape_v<{', '.join(map(str, axes))}>" + + def generate_ort_reference(self, input_dims, input_dim_names, pack_axes): + code = [] + ndim = len(input_dims) + + # Calculate reshaped dimensions (for code string generation) + reshape_dims_str = [] + dim_idx = 0 + for i in range(ndim): + if i in pack_axes: + axis_idx = pack_axes.index(i) + # Use string expressions instead of calculated results + reshape_dims_str.append(f"(int64_t)({input_dim_names[i]} / P)") + reshape_dims_str.append(f"(int64_t)P") + else: + reshape_dims_str.append(f"(int64_t){input_dim_names[i]}") + + # Generate reshape code + code.append("// ORT reference implementation") + code.append("auto ort_input = NttTest::ntt2ort(ntt_input);") + code.append(f"int64_t reshape_data[] = {{{', '.join(reshape_dims_str)}}};") + code.append("int64_t reshape_shape[] = {std::size(reshape_data)};") + code.append("auto ort_type = NttTest::primitive_type2ort_type();") + code.append("auto shape_tensor = make_tensor(reinterpret_cast(reshape_data), ort_type,") + code.append(" reshape_shape, std::size(reshape_shape));") + code.append("auto reshaped_tensor = ortki_Reshape(ort_input, shape_tensor, 0);") + + # Generate transpose permutation + if len(pack_axes) > 0: + # Calculate permutation + perm = [] + packed_dims = [] + j = 0 + for i in range(ndim): + if i in pack_axes: + perm.append(j) + packed_dims.append(j + 1) + j += 2 + else: + perm.append(j) + j += 1 + perm.extend(packed_dims) + + code.append("") + code.append(f"int64_t perms[] = {{{', '.join(map(str, perm))}}};") + code.append("auto ort_output = ortki_Transpose(reshaped_tensor, perms, std::size(perms));") + else: + code.append("auto ort_output = reshaped_tensor;") + + return code + + def generate_test_prologue(self, datatype, test_name, P, dim_names, dims, pack_axes): + """generate test function header, constant P and dimension constants""" + code = [f"TEST(PackTest_{datatype.name_suffix}, {test_name}) {{", f" constexpr size_t P = {P};"] + + # define dimension constants + for i, (name, size) in enumerate(zip(dim_names, dims)): + if i in pack_axes: + axis_idx = pack_axes.index(i) + code.append(f" constexpr size_t {name}_coefficient = {size};") + code.append(f" constexpr size_t {name} = {name}_coefficient * P;") + else: + code.append(f" constexpr size_t {name} = {size};") + + code.extend([f" {datatype.cpp_type} min_input = {datatype.min_val};", + f" {datatype.cpp_type} max_input = {datatype.max_val};", 
""]) + return code + + def generate_output_tensor_code(self, datatype, shape_type, dim_names, pack_axes, vector_dim): + output_dims = [] + for i, name in enumerate(dim_names): + if i in pack_axes: + output_dims.append(f"{name} / P") + else: + output_dims.append(name) + + if vector_dim == 1: + vector_type = f"ntt::vector<{datatype.cpp_type}, P>" + else: + vector_type = f"ntt::vector<{datatype.cpp_type}, {', '.join(['P'] * len(pack_axes))}>" + + output_shape_expr = self.generate_shape_init(shape_type, output_dims) + + code = [ + f"// Create output tensor", + f"alignas(32) auto ntt_output1 = ntt::make_tensor<{vector_type}>({output_shape_expr});", + "" + ] + return code, vector_type, output_shape_expr + + def generate_pack_call_code(self, pack_axes): + pack_axes_str = self.generate_pack_axes_str(pack_axes) + return [ + "// Execute pack operation", + f"ntt::pack(ntt_input, ntt_output1, {pack_axes_str});", + "" + ] + + def generate_reference_and_comparison_code(self, datatype, continuity, dims, dim_names, pack_axes, shape_type, vector_type, output_shape_expr, is_fp8): + code = [] + input_dims_expr = [f"{name}" for name in dim_names] + + ort_input_tensor = "ntt_input" + # For non-contiguous tensor, need to copy to contiguous tensor first + if not continuity.is_contiguous: + if is_fp8: + # for fp8, ntt_input_uint8 is already contiguous, created by cast + ort_input_tensor = "ntt_input_uint8" + else: + code.append(" // Copy to contiguous tensor for ORT reference") + code.append(f" alignas(32) auto continuous_input = ntt::make_tensor<{datatype.cpp_type}>({self.generate_shape_init(shape_type, input_dims_expr)});") + + # generate nested loops to copy data + code.append(" ") + for i, name in enumerate(dim_names): + code.append(f" {' ' * i}for (size_t {name.lower()} = 0; {name.lower()} < {name}; {name.lower()}++) {{") + + indices = [f"{name.lower()}" for name in dim_names] + code.append(f" {' ' * len(dim_names)}continuous_input({', '.join(indices)}) = ntt_input({', '.join(indices)});") + + for i in range(len(dim_names)-1, -1, -1): + code.append(f" {' ' * i}}}") + code.append("") + ort_input_tensor = "continuous_input" + elif is_fp8: # contiguous fp8 case + ort_input_tensor = "ntt_input_uint8" + + ort_ref = self.generate_ort_reference(dims, dim_names, pack_axes) + # The first line of ort_ref is "// ORT reference implementation" + # The second line is "auto ort_input = NttTest::ntt2ort(ntt_input);" + # We modify this line. 
+ ort_ref[1] = f" auto ort_input = NttTest::ntt2ort({ort_input_tensor});" + + code.extend([f" {line}" for line in ort_ref]) + code.append("") + + # compare results + code.append(" // Compare results") + if is_fp8: + vector_type_uint8 = vector_type.replace(datatype.cpp_type, 'uint8_t') + code.append(f" alignas(32) auto ntt_output2_uint8 = ntt::make_tensor<{vector_type_uint8}>({output_shape_expr});") + code.append(" NttTest::ort2ntt(ort_output, ntt_output2_uint8);") + code.append(" EXPECT_TRUE(NttTest::compare_tensor(ntt_output1_uint8, ntt_output2_uint8));") + else: + code.append(f" alignas(32) auto ntt_output2 = ntt::make_tensor<{vector_type}>({output_shape_expr});") + code.append(" NttTest::ort2ntt(ort_output, ntt_output2);") + code.append(" EXPECT_TRUE(NttTest::compare_tensor(ntt_output1, ntt_output2));") + code.append("}") + code.append("") + + return code + +# shape_type: fixed/dynamic +# vector_dim: 1/2 +# continuity: is_contiguous, non_contiguous_dim, big_tensor_op +# pack_axes: list of axes to pack +# ndim: dimension of the tensor + def generate_test_case(self, datatype, shape_type, vector_dim, continuity, pack_axes, ndim): + # 1. initialize dimension and other basic variables + P = f"NTT_VLEN / (sizeof({datatype.cpp_type}) * 8)" + if ndim == 3: + dims, dim_names = [1, 77, 3], ['C', 'H', 'W'] + elif ndim == 4: + dims, dim_names = [2, 8, 4, 4], ['N', 'C', 'H', 'W'] + else: + dims, dim_names = [2, 8, 4, 4, 2], ['N', 'C', 'H', 'W', 'D'] + + test_name = self.generate_test_name(datatype, shape_type, vector_dim, continuity, "_".join(map(str, pack_axes)), ndim) + + is_fp8 = 'float_e' in datatype.cpp_type + + # 2. call helper functions to generate code + code = [] + + # 2.1 generate test function header and constants + code.extend(self.generate_test_prologue(datatype, test_name, P, dim_names, dims, pack_axes)) + + # 2.2 generate input tensor initialization code + input_dims_expr = [f"{name}" for name in dim_names] + tensor_init_code = self.generate_tensor_init(datatype, shape_type, input_dims_expr, continuity, "ntt_input") + code.extend([f" {line}" for line in tensor_init_code]) + + if is_fp8: + input_shape_expr = self.generate_shape_init(shape_type, input_dims_expr) + code.append(f" auto ntt_input_uint8 = ntt::make_tensor({input_shape_expr});") + code.append(f" NttTest::reinterpret_cast_fp8_to_uint8(ntt_input, ntt_input_uint8);") + + code.append("") + + # 2.3 generate output tensor initialization code + output_tensor_code, vector_type, output_shape_expr = self.generate_output_tensor_code(datatype, shape_type, dim_names, pack_axes, vector_dim) + code.extend([f" {line}" for line in output_tensor_code]) + + # 2.4 generate pack operation call code + pack_call_code = self.generate_pack_call_code(pack_axes) + code.extend([f" {line}" for line in pack_call_code]) + + if is_fp8: + vector_type_uint8 = vector_type.replace(datatype.cpp_type, 'uint8_t') + code.append(f" auto ntt_output1_uint8 = ntt::make_tensor<{vector_type_uint8}>({output_shape_expr});") + code.append(f" NttTest::reinterpret_cast_fp8_to_uint8(ntt_output1, ntt_output1_uint8);") + code.append("") + + # 2.5 generate reference implementation and result comparison code + ref_and_comp_code = self.generate_reference_and_comparison_code(datatype, continuity, dims, dim_names, pack_axes, shape_type, vector_type, output_shape_expr, is_fp8) + + code.extend(ref_and_comp_code) + + return "\n".join(code) + + def generate_all_tests_for_type(self, datatype): + """Generate all test combinations for a given datatype + 1. rank 3, 4, 5 + 2. 
fixed/dynamic + 3. 1D/2D vector + 4. contiguous/non-contiguous + 4.1 For dimensions 3, 5, test simple non-contiguous cases (simple_continuities) + 4.2 For dimension 4, test more complex non-contiguous cases (full_continuities) + """ + """Uncovered test scope: + 1. Cases where packed dimensions are not multiples of P, requiring padding + """ + shape_types = ["fixed", "dynamic"] + vector_dims = [1, 2] + continuities = ["contiguous", "non_contiguous"] + + # Define pack axis options for different dimensions + pack_axes_options = { + 3: [[2], [1], [0], [0, 1], [1, 2]], + 4: [[3], [2], [1], [0], [0, 1], [1, 2], [2, 3]], + 5: [[4], [3], [2], [1], [0], [0, 1], [1, 2], [2, 3], [3, 4]] + } + + # Full continuity test combinations, mainly for 4D + full_continuities = [ + Continuity(is_contiguous=True, non_contiguous_dim=None, big_tensor_op=None), + Continuity(is_contiguous=False, non_contiguous_dim=2, big_tensor_op="+7"), + Continuity(is_contiguous=False, non_contiguous_dim=2, big_tensor_op="*2"), + Continuity(is_contiguous=False, non_contiguous_dim=1, big_tensor_op="*2"), + Continuity(is_contiguous=False, non_contiguous_dim=1, big_tensor_op="+7"), + ] + + # Simplified continuity test combinations, for non-4D + simple_continuities = [ + Continuity(is_contiguous=True, non_contiguous_dim=None, big_tensor_op=None), + Continuity(is_contiguous=False, non_contiguous_dim=1, big_tensor_op="*2"), # Choose a representative non-contiguous case + ] + + code = [] + + # Generate file header + code.append(self.generate_header()) + + # Generate test cases + for ndim in [3, 4, 5]: + # Select continuity test strategy based on dimension + current_continuities = full_continuities if ndim == 4 else simple_continuities + + for shape_type, vector_dim, continuity in itertools.product(shape_types, vector_dims, current_continuities): + for pack_axes in pack_axes_options[ndim]: + # Skip unreasonable combinations + if vector_dim == 2 and len(pack_axes) < 2: + continue + if vector_dim == 1 and len(pack_axes) > 1: + continue + + test_code = self.generate_test_case(datatype, shape_type, vector_dim, continuity, pack_axes, ndim) + code.append(test_code) + # Generate main function + code.append(self.generate_footer()) + + return "\n".join(code) + + def generate_header(self): + return '''/* Copyright 2019-2024 Canaan Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "nncase/ntt/shape.h" +#include "nncase/ntt/tensor.h" +#include "nncase/ntt/tensor_traits.h" +#include "nncase/ntt/vector.h" +#include "ntt_test.h" +#include "ortki_helper.h" +#include +#include +#include +#include + +using namespace nncase; +using namespace ortki; + +''' + + def generate_footer(self): + return '''int main(int argc, char *argv[]) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} +''' +def generate_cmake_list(directory, filenames): + """generate a .cmake file that contains the list of generated test files""" + cmake_list_path = os.path.join(directory, "generated_tests.cmake") + with open(cmake_list_path, "w") as f: + f.write("# This file is generated automatically. DO NOT EDIT.\n") + f.write("set(GENERATED_TEST_SOURCES\n") + for name in filenames: + f.write(f" ${{CMAKE_CURRENT_LIST_DIR}}/{name}\n") # use relative path to current CMakeLists.txt + f.write(")\n") + print(f"Generated CMake list: {cmake_list_path}") + + +if __name__ == "__main__": + generator = PackTestGenerator() + script_directory = os.path.dirname(os.path.abspath(__file__)) + + generated_filenames = [] # collect all generated file names + + for datatype in ALL_DATATYPES: + test_code = generator.generate_all_tests_for_type(datatype) + filename = f"test_ntt_pack_generated_{datatype.name_suffix}.cpp" + output_filepath = os.path.join(script_directory, filename) + + with open(output_filepath, "w") as f: + f.write(test_code) + + print(f"Test file generated: {output_filepath}") + generated_filenames.append(filename) + + generate_cmake_list(script_directory, generated_filenames) \ No newline at end of file diff --git a/ntt/test/ctest/test_generator_base.py b/ntt/test/ctest/test_generator_base.py index 161bcf30b8..01fd6140be 100644 --- a/ntt/test/ctest/test_generator_base.py +++ b/ntt/test/ctest/test_generator_base.py @@ -5,7 +5,6 @@ import os from collections import namedtuple -from typing import List, Optional # is_contiguous: bool # non_contiguous_dim: int or None @@ -23,7 +22,11 @@ DataType('int16_t', 'Int16', '-32767', '32767'), DataType('int32_t', 'Int32', '-100000', '100000'), DataType('int64_t', 'Int64', '-1000000', '1000000'), +<<<<<<< HEAD DataType('half', 'Float16', 'half(-65504.0f)', 'half(65504.0f)'), +======= + DataType('half', 'Float16', '-65504.0', '65504.0'), +>>>>>>> 4bb20af6a ( unpack_generator initialized) DataType('float', 'Float32', '-3.4e38', '3.4e38'), DataType('double', 'Float64', '-1.7e308', '1.7e308'), DataType('bfloat16', 'Bfloat16', '-3.3e38_bf16', '3.3e38_bf16'), @@ -50,18 +53,33 @@ def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, # Determine element type based on vector_rank if vector_rank == 0: element_cpp_type = datatype.cpp_type +<<<<<<< HEAD elif vector_rank > 0: if P is None: raise ValueError("P must be provided for vector_rank > 0") # The rank of the vector is determined by vector_rank. 
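        # e.g. vector_rank == 1 on a float input yields "ntt::vector<float, P>";
        # the "P" here is the constexpr defined in the generated test's prologue,
        # not a Python-side value.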
ps = ', '.join([f"P"] * vector_rank) +======= + elif vector_rank == 1: + if P is None: + raise ValueError("P must be provided for vector_rank 1") + element_cpp_type = f"ntt::vector<{datatype.cpp_type}, {P}>" + elif vector_rank > 1: + if P is None or axes_count is None: + raise ValueError("P and axes_count must be provided for vector_rank > 1") + ps = ', '.join([str(P)] * axes_count) +>>>>>>> 4bb20af6a ( unpack_generator initialized) element_cpp_type = f"ntt::vector<{datatype.cpp_type}, {ps}>" else: raise ValueError(f"Invalid vector_rank: {vector_rank}") if continuity.is_contiguous: +<<<<<<< HEAD code.append(f"auto {var_name} = ntt::make_tensor<{element_cpp_type}>({shape_expr});") +======= + code.append(f"alignas(32) auto {var_name} = ntt::make_tensor<{element_cpp_type}>({shape_expr});") +>>>>>>> 4bb20af6a ( unpack_generator initialized) code.append(f"NttTest::init_tensor({var_name}, min_input, max_input);") else: # non-contiguous big_dims = dims.copy() @@ -74,7 +92,11 @@ def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, big_shape_expr = self.generate_shape_init(shape_type, big_dims) code.append(f"// Create non-contiguous tensor (on dimension {dim_to_change})") +<<<<<<< HEAD code.append(f"auto big_tensor = ntt::make_tensor<{element_cpp_type}>({big_shape_expr});") +======= + code.append(f"alignas(32) auto big_tensor = ntt::make_tensor<{element_cpp_type}>({big_shape_expr});") +>>>>>>> 4bb20af6a ( unpack_generator initialized) code.append(f"NttTest::init_tensor(big_tensor, min_input, max_input);") code.append(f"") code.append(f"auto {var_name} = ntt::make_tensor_view_from_address<{element_cpp_type}>(") @@ -84,6 +106,7 @@ def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, return code +<<<<<<< HEAD def generate_test_prologue(self, test_suite_prefix, datatype, test_name, P, dim_names, dims, axes=None): """generate test function header, constant P and dimension constants""" code = [f"TEST({test_suite_prefix}_{datatype.name_suffix}, {test_name}) {{"] @@ -157,6 +180,8 @@ def generate_reference_and_comparison_code(self, code.append("") return code +======= +>>>>>>> 4bb20af6a ( unpack_generator initialized) def generate_header(self): return '''/* Copyright 2019-2024 Canaan Inc. * @@ -195,6 +220,7 @@ def generate_footer(self): } ''' +<<<<<<< HEAD def _build_vector_cpp_type(self, base_cpp_type: str, vector_rank: int, P: Optional[str], axes_count: Optional[int] = None) -> str: """Utility: given primitive cpp type, return the full `ntt::vector<..., ...>` expression. When ``vector_rank == 0`` it just returns the primitive type. @@ -405,6 +431,14 @@ def generate_cmake_list(directory, filenames, output_filename, variable_name): with open(cmake_list_path, "w") as f: f.write(f"# This file is generated automatically. DO NOT EDIT.\n") f.write(f"set({variable_name}\n") +======= +def generate_cmake_list(directory, filenames): + """generate a .cmake file that contains the list of generated test files""" + cmake_list_path = os.path.join(directory, "generated_tests.cmake") + with open(cmake_list_path, "w") as f: + f.write("# This file is generated automatically. 
DO NOT EDIT.\n") + f.write("set(GENERATED_TEST_SOURCES\n") +>>>>>>> 4bb20af6a ( unpack_generator initialized) for name in filenames: f.write(f" ${{CMAKE_CURRENT_LIST_DIR}}/{name}\n") # use relative path to current CMakeLists.txt f.write(")\n") From 104a9047052b08c142bc53b67f8fd9c2cd0befae Mon Sep 17 00:00:00 2001 From: root Date: Wed, 25 Jun 2025 02:53:18 +0000 Subject: [PATCH 03/49] Add unpack ctest generator --- ntt/test/ctest/generate_pack_tests.py | 2 +- ntt/test/ctest/test_generator_base.py | 58 ++++++++------------------- 2 files changed, 18 insertions(+), 42 deletions(-) diff --git a/ntt/test/ctest/generate_pack_tests.py b/ntt/test/ctest/generate_pack_tests.py index c0e980b1ba..61a4ab99fd 100644 --- a/ntt/test/ctest/generate_pack_tests.py +++ b/ntt/test/ctest/generate_pack_tests.py @@ -10,7 +10,7 @@ import itertools from typing import List, Tuple -from test_generator_base import BaseTestGenerator, ALL_DATATYPES, Continuity, DataType, generate_cmake_list +from test_generator_base import * import os class PackTestGenerator(BaseTestGenerator): diff --git a/ntt/test/ctest/test_generator_base.py b/ntt/test/ctest/test_generator_base.py index 01fd6140be..fc47ac5ef2 100644 --- a/ntt/test/ctest/test_generator_base.py +++ b/ntt/test/ctest/test_generator_base.py @@ -5,6 +5,7 @@ import os from collections import namedtuple +from typing import List, Optional # is_contiguous: bool # non_contiguous_dim: int or None @@ -22,11 +23,7 @@ DataType('int16_t', 'Int16', '-32767', '32767'), DataType('int32_t', 'Int32', '-100000', '100000'), DataType('int64_t', 'Int64', '-1000000', '1000000'), -<<<<<<< HEAD - DataType('half', 'Float16', 'half(-65504.0f)', 'half(65504.0f)'), -======= DataType('half', 'Float16', '-65504.0', '65504.0'), ->>>>>>> 4bb20af6a ( unpack_generator initialized) DataType('float', 'Float32', '-3.4e38', '3.4e38'), DataType('double', 'Float64', '-1.7e308', '1.7e308'), DataType('bfloat16', 'Bfloat16', '-3.3e38_bf16', '3.3e38_bf16'), @@ -53,14 +50,6 @@ def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, # Determine element type based on vector_rank if vector_rank == 0: element_cpp_type = datatype.cpp_type -<<<<<<< HEAD - elif vector_rank > 0: - if P is None: - raise ValueError("P must be provided for vector_rank > 0") - - # The rank of the vector is determined by vector_rank. - ps = ', '.join([f"P"] * vector_rank) -======= elif vector_rank == 1: if P is None: raise ValueError("P must be provided for vector_rank 1") @@ -69,17 +58,18 @@ def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, if P is None or axes_count is None: raise ValueError("P and axes_count must be provided for vector_rank > 1") ps = ', '.join([str(P)] * axes_count) ->>>>>>> 4bb20af6a ( unpack_generator initialized) + elif vector_rank > 0: + if P is None: + raise ValueError("P must be provided for vector_rank > 0") + + # The rank of the vector is determined by vector_rank. 
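+        # Note: the == 1 / > 1 branches above interpolate the value of P, while
+        # this branch keeps the symbolic name "P"; e.g. vector_rank == 2 emits
+        # "ntt::vector<float, P, P>" with P left for the C++ compiler to resolve.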
+ ps = ', '.join([f"P"] * vector_rank) element_cpp_type = f"ntt::vector<{datatype.cpp_type}, {ps}>" else: raise ValueError(f"Invalid vector_rank: {vector_rank}") if continuity.is_contiguous: -<<<<<<< HEAD - code.append(f"auto {var_name} = ntt::make_tensor<{element_cpp_type}>({shape_expr});") -======= code.append(f"alignas(32) auto {var_name} = ntt::make_tensor<{element_cpp_type}>({shape_expr});") ->>>>>>> 4bb20af6a ( unpack_generator initialized) code.append(f"NttTest::init_tensor({var_name}, min_input, max_input);") else: # non-contiguous big_dims = dims.copy() @@ -92,11 +82,7 @@ def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, big_shape_expr = self.generate_shape_init(shape_type, big_dims) code.append(f"// Create non-contiguous tensor (on dimension {dim_to_change})") -<<<<<<< HEAD - code.append(f"auto big_tensor = ntt::make_tensor<{element_cpp_type}>({big_shape_expr});") -======= code.append(f"alignas(32) auto big_tensor = ntt::make_tensor<{element_cpp_type}>({big_shape_expr});") ->>>>>>> 4bb20af6a ( unpack_generator initialized) code.append(f"NttTest::init_tensor(big_tensor, min_input, max_input);") code.append(f"") code.append(f"auto {var_name} = ntt::make_tensor_view_from_address<{element_cpp_type}>(") @@ -106,7 +92,6 @@ def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, return code -<<<<<<< HEAD def generate_test_prologue(self, test_suite_prefix, datatype, test_name, P, dim_names, dims, axes=None): """generate test function header, constant P and dimension constants""" code = [f"TEST({test_suite_prefix}_{datatype.name_suffix}, {test_name}) {{"] @@ -142,7 +127,7 @@ def generate_reference_and_comparison_code(self, ort_input_tensor = "ntt_input_uint8" else: code.append(" // Copy to contiguous tensor for ORT reference") - code.append(f" auto continuous_input = ntt::make_tensor<{input_element_type}>({self.generate_shape_init(shape_type, input_dims_expr)});") + code.append(f" alignas(32) auto continuous_input = ntt::make_tensor<{input_element_type}>({self.generate_shape_init(shape_type, input_dims_expr)});") code.append(" ") for i, name in enumerate(dim_names): code.append(f" {' ' * i}for (size_t {name.lower()} = 0; {name.lower()} < {name}; {name.lower()}++) {{") @@ -168,11 +153,11 @@ def generate_reference_and_comparison_code(self, if ntt_output_var_is_vector: output_element_type_uint8 = output_element_type.replace(datatype.cpp_type, 'uint8_t') - code.append(f" auto ntt_output2_uint8 = ntt::make_tensor<{output_element_type_uint8}>({output_shape_expr});") + code.append(f" alignas(32) auto ntt_output2_uint8 = ntt::make_tensor<{output_element_type_uint8}>({output_shape_expr});") code.append(f" NttTest::ort2ntt(ort_output, ntt_output2_uint8);") code.append(f" EXPECT_TRUE(NttTest::compare_tensor({ntt_output_for_comp}, ntt_output2_uint8));") else: - code.append(f" auto ntt_output2 = ntt::make_tensor<{output_element_type}>({output_shape_expr});") + code.append(f" alignas(32) auto ntt_output2 = ntt::make_tensor<{output_element_type}>({output_shape_expr});") code.append(f" NttTest::ort2ntt(ort_output, ntt_output2);") code.append(f" EXPECT_TRUE(NttTest::compare_tensor({ntt_output_for_comp}, ntt_output2));") @@ -181,7 +166,7 @@ def generate_reference_and_comparison_code(self, return code ======= ->>>>>>> 4bb20af6a ( unpack_generator initialized) +>>>>>>> e98735a1d (Add unpack ctest generator) def generate_header(self): return '''/* Copyright 2019-2024 Canaan Inc. 
* @@ -220,7 +205,6 @@ def generate_footer(self): } ''' -<<<<<<< HEAD def _build_vector_cpp_type(self, base_cpp_type: str, vector_rank: int, P: Optional[str], axes_count: Optional[int] = None) -> str: """Utility: given primitive cpp type, return the full `ntt::vector<..., ...>` expression. When ``vector_rank == 0`` it just returns the primitive type. @@ -294,7 +278,7 @@ def generate_ntt_output_and_op_section(self, output_tensor_code = [ f"// Create output tensor", - f"auto {output_var_name} = ntt::make_tensor<{output_element_type}>({output_shape_expr});", + f"alignas(32) auto {output_var_name} = ntt::make_tensor<{output_element_type}>({output_shape_expr});", "" ] op_section = output_tensor_code + ntt_op_call_lines @@ -351,7 +335,7 @@ def generate_ort_input_section(self, # For vector types, the element type is a vector. element_cpp_type = self._build_vector_cpp_type(datatype.cpp_type, vector_rank, P, axes_count) shape_expr = self.generate_shape_init(shape_type, input_dims_expr) - lines.append(f" auto continuous_input = ntt::make_tensor<{element_cpp_type}>({shape_expr});") + lines.append(f" alignas(32) auto continuous_input = ntt::make_tensor<{element_cpp_type}>({shape_expr});") # nested copy loops lines.append("") @@ -396,7 +380,7 @@ def generate_ort_back2ntt_and_compare_section(self, if deal_fp8 == 0: # Not fp8 golden_var_name = "ntt_golden" - lines.append(f"auto {golden_var_name} = ntt::make_tensor<{output_element_cpp_type}>({output_shape_expr});") + lines.append(f"alignas(32) auto {golden_var_name} = ntt::make_tensor<{output_element_cpp_type}>({output_shape_expr});") lines.append(f"NttTest::ort2ntt({ort_output_var_name}, {golden_var_name});") lines.append(f"EXPECT_TRUE(NttTest::compare_tensor({ntt_output_var_name}, {golden_var_name}));") elif deal_fp8 == 1: # fp8 with uint8 comparison @@ -404,7 +388,7 @@ def generate_ort_back2ntt_and_compare_section(self, golden_var_name = "ntt_golden_uint8" golden_cpp_type = "uint8_t" if "vector" not in output_element_cpp_type else output_element_cpp_type.replace(datatype.cpp_type, "uint8_t") - lines.append(f"auto {golden_var_name} = ntt::make_tensor<{golden_cpp_type}>({output_shape_expr});") + lines.append(f"alignas(32) auto {golden_var_name} = ntt::make_tensor<{golden_cpp_type}>({output_shape_expr});") lines.append(f"NttTest::ort2ntt({ort_output_var_name}, {golden_var_name});") lines.append(f"EXPECT_TRUE(NttTest::compare_tensor({ntt_output_to_compare}, {golden_var_name}));") elif deal_fp8 == 2: # fp8 with fp16 intermediate, compare fp8 @@ -412,11 +396,11 @@ def generate_ort_back2ntt_and_compare_section(self, golden_fp16_cpp_type = output_element_cpp_type.replace(datatype.cpp_type, "half") lines.append(f"// Golden output is in fp16, cast it back to fp8 for comparison") - lines.append(f"auto {golden_fp16_var_name} = ntt::make_tensor<{golden_fp16_cpp_type}>({output_shape_expr});") + lines.append(f"alignas(32) auto {golden_fp16_var_name} = ntt::make_tensor<{golden_fp16_cpp_type}>({output_shape_expr});") lines.append(f"NttTest::ort2ntt({ort_output_var_name}, {golden_fp16_var_name});") golden_fp8_var_name = "ntt_golden_fp8" - lines.append(f"auto {golden_fp8_var_name} = ntt::make_tensor<{output_element_cpp_type}>({output_shape_expr});") + lines.append(f"alignas(32) auto {golden_fp8_var_name} = ntt::make_tensor<{output_element_cpp_type}>({output_shape_expr});") lines.append(f"ntt::cast({golden_fp16_var_name}, {golden_fp8_var_name});") lines.append(f"EXPECT_TRUE(NttTest::compare_tensor({ntt_output_var_name}, {golden_fp8_var_name}));") @@ -431,14 +415,6 @@ def 
generate_cmake_list(directory, filenames, output_filename, variable_name): with open(cmake_list_path, "w") as f: f.write(f"# This file is generated automatically. DO NOT EDIT.\n") f.write(f"set({variable_name}\n") -======= -def generate_cmake_list(directory, filenames): - """generate a .cmake file that contains the list of generated test files""" - cmake_list_path = os.path.join(directory, "generated_tests.cmake") - with open(cmake_list_path, "w") as f: - f.write("# This file is generated automatically. DO NOT EDIT.\n") - f.write("set(GENERATED_TEST_SOURCES\n") ->>>>>>> 4bb20af6a ( unpack_generator initialized) for name in filenames: f.write(f" ${{CMAKE_CURRENT_LIST_DIR}}/{name}\n") # use relative path to current CMakeLists.txt f.write(")\n") From 2d7d2a82c2dba33b730e209ddaff90feb8e961d0 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 26 Jun 2025 08:45:43 +0000 Subject: [PATCH 04/49] Added Unpack test generator --- ntt/test/ctest/test_generator_base.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ntt/test/ctest/test_generator_base.py b/ntt/test/ctest/test_generator_base.py index fc47ac5ef2..a95fdb53f6 100644 --- a/ntt/test/ctest/test_generator_base.py +++ b/ntt/test/ctest/test_generator_base.py @@ -23,7 +23,7 @@ DataType('int16_t', 'Int16', '-32767', '32767'), DataType('int32_t', 'Int32', '-100000', '100000'), DataType('int64_t', 'Int64', '-1000000', '1000000'), - DataType('half', 'Float16', '-65504.0', '65504.0'), + DataType('half', 'Float16', 'half(-65504.0f)', 'half(65504.0f)'), DataType('float', 'Float32', '-3.4e38', '3.4e38'), DataType('double', 'Float64', '-1.7e308', '1.7e308'), DataType('bfloat16', 'Bfloat16', '-3.3e38_bf16', '3.3e38_bf16'), @@ -69,7 +69,7 @@ def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, raise ValueError(f"Invalid vector_rank: {vector_rank}") if continuity.is_contiguous: - code.append(f"alignas(32) auto {var_name} = ntt::make_tensor<{element_cpp_type}>({shape_expr});") + code.append(f"auto {var_name} = ntt::make_tensor<{element_cpp_type}>({shape_expr});") code.append(f"NttTest::init_tensor({var_name}, min_input, max_input);") else: # non-contiguous big_dims = dims.copy() @@ -82,7 +82,7 @@ def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, big_shape_expr = self.generate_shape_init(shape_type, big_dims) code.append(f"// Create non-contiguous tensor (on dimension {dim_to_change})") - code.append(f"alignas(32) auto big_tensor = ntt::make_tensor<{element_cpp_type}>({big_shape_expr});") + code.append(f"auto big_tensor = ntt::make_tensor<{element_cpp_type}>({big_shape_expr});") code.append(f"NttTest::init_tensor(big_tensor, min_input, max_input);") code.append(f"") code.append(f"auto {var_name} = ntt::make_tensor_view_from_address<{element_cpp_type}>(") @@ -127,7 +127,7 @@ def generate_reference_and_comparison_code(self, ort_input_tensor = "ntt_input_uint8" else: code.append(" // Copy to contiguous tensor for ORT reference") - code.append(f" alignas(32) auto continuous_input = ntt::make_tensor<{input_element_type}>({self.generate_shape_init(shape_type, input_dims_expr)});") + code.append(f" auto continuous_input = ntt::make_tensor<{input_element_type}>({self.generate_shape_init(shape_type, input_dims_expr)});") code.append(" ") for i, name in enumerate(dim_names): code.append(f" {' ' * i}for (size_t {name.lower()} = 0; {name.lower()} < {name}; {name.lower()}++) {{") @@ -153,11 +153,11 @@ def generate_reference_and_comparison_code(self, if 
ntt_output_var_is_vector: output_element_type_uint8 = output_element_type.replace(datatype.cpp_type, 'uint8_t') - code.append(f" alignas(32) auto ntt_output2_uint8 = ntt::make_tensor<{output_element_type_uint8}>({output_shape_expr});") + code.append(f" auto ntt_output2_uint8 = ntt::make_tensor<{output_element_type_uint8}>({output_shape_expr});") code.append(f" NttTest::ort2ntt(ort_output, ntt_output2_uint8);") code.append(f" EXPECT_TRUE(NttTest::compare_tensor({ntt_output_for_comp}, ntt_output2_uint8));") else: - code.append(f" alignas(32) auto ntt_output2 = ntt::make_tensor<{output_element_type}>({output_shape_expr});") + code.append(f" auto ntt_output2 = ntt::make_tensor<{output_element_type}>({output_shape_expr});") code.append(f" NttTest::ort2ntt(ort_output, ntt_output2);") code.append(f" EXPECT_TRUE(NttTest::compare_tensor({ntt_output_for_comp}, ntt_output2));") @@ -278,7 +278,7 @@ def generate_ntt_output_and_op_section(self, output_tensor_code = [ f"// Create output tensor", - f"alignas(32) auto {output_var_name} = ntt::make_tensor<{output_element_type}>({output_shape_expr});", + f"auto {output_var_name} = ntt::make_tensor<{output_element_type}>({output_shape_expr});", "" ] op_section = output_tensor_code + ntt_op_call_lines @@ -335,7 +335,7 @@ def generate_ort_input_section(self, # For vector types, the element type is a vector. element_cpp_type = self._build_vector_cpp_type(datatype.cpp_type, vector_rank, P, axes_count) shape_expr = self.generate_shape_init(shape_type, input_dims_expr) - lines.append(f" alignas(32) auto continuous_input = ntt::make_tensor<{element_cpp_type}>({shape_expr});") + lines.append(f" auto continuous_input = ntt::make_tensor<{element_cpp_type}>({shape_expr});") # nested copy loops lines.append("") @@ -380,7 +380,7 @@ def generate_ort_back2ntt_and_compare_section(self, if deal_fp8 == 0: # Not fp8 golden_var_name = "ntt_golden" - lines.append(f"alignas(32) auto {golden_var_name} = ntt::make_tensor<{output_element_cpp_type}>({output_shape_expr});") + lines.append(f"auto {golden_var_name} = ntt::make_tensor<{output_element_cpp_type}>({output_shape_expr});") lines.append(f"NttTest::ort2ntt({ort_output_var_name}, {golden_var_name});") lines.append(f"EXPECT_TRUE(NttTest::compare_tensor({ntt_output_var_name}, {golden_var_name}));") elif deal_fp8 == 1: # fp8 with uint8 comparison @@ -388,7 +388,7 @@ def generate_ort_back2ntt_and_compare_section(self, golden_var_name = "ntt_golden_uint8" golden_cpp_type = "uint8_t" if "vector" not in output_element_cpp_type else output_element_cpp_type.replace(datatype.cpp_type, "uint8_t") - lines.append(f"alignas(32) auto {golden_var_name} = ntt::make_tensor<{golden_cpp_type}>({output_shape_expr});") + lines.append(f"auto {golden_var_name} = ntt::make_tensor<{golden_cpp_type}>({output_shape_expr});") lines.append(f"NttTest::ort2ntt({ort_output_var_name}, {golden_var_name});") lines.append(f"EXPECT_TRUE(NttTest::compare_tensor({ntt_output_to_compare}, {golden_var_name}));") elif deal_fp8 == 2: # fp8 with fp16 intermediate, compare fp8 @@ -396,11 +396,11 @@ def generate_ort_back2ntt_and_compare_section(self, golden_fp16_cpp_type = output_element_cpp_type.replace(datatype.cpp_type, "half") lines.append(f"// Golden output is in fp16, cast it back to fp8 for comparison") - lines.append(f"alignas(32) auto {golden_fp16_var_name} = ntt::make_tensor<{golden_fp16_cpp_type}>({output_shape_expr});") + lines.append(f"auto {golden_fp16_var_name} = ntt::make_tensor<{golden_fp16_cpp_type}>({output_shape_expr});") 
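            # Generated C++ sketch: ort2ntt() fills the fp16 golden tensor from
            # the ORT output, ntt::cast() narrows it to fp8, and compare_tensor()
            # then checks fp8 against fp8 so both paths round identically.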
lines.append(f"NttTest::ort2ntt({ort_output_var_name}, {golden_fp16_var_name});") golden_fp8_var_name = "ntt_golden_fp8" - lines.append(f"alignas(32) auto {golden_fp8_var_name} = ntt::make_tensor<{output_element_cpp_type}>({output_shape_expr});") + lines.append(f"auto {golden_fp8_var_name} = ntt::make_tensor<{output_element_cpp_type}>({output_shape_expr});") lines.append(f"ntt::cast({golden_fp16_var_name}, {golden_fp8_var_name});") lines.append(f"EXPECT_TRUE(NttTest::compare_tensor({ntt_output_var_name}, {golden_fp8_var_name}));") From 0448151010171ebc56f924207b642983957901f3 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 3 Jul 2025 03:30:14 +0000 Subject: [PATCH 05/49] First version done of ctest cast The current ctest suite for the cast operator requires several improvements: 1. **Missing repack coverage:** Tests for repack functionality are not yet implemented. These should be added to cover various dimensions. 2. **Unsupported bool type for different ISAs:** Support for the boolean type across different ISAs is needed. The main workload is expected to be in , with minor changes to the test scripts. 3. **Adapt to new cast interface:** Ctests need to be regenerated to align with the new cast interface once the recent implementation is merged. --- ntt/CMakeLists.txt | 2 +- ntt/include/nncase/float8.h | 17 +++++ ntt/include/nncase/ntt/kernels/cast.h | 1 + ntt/include/nncase/ntt/ukernels/u_cast.h | 39 ++++++----- ntt/test/ctest/CMakeLists.txt | 24 ++++--- ntt/test/ctest/generate_pack_tests.py | 5 -- ntt/test/ctest/generate_unpack_tests.py | 14 +--- ntt/test/ctest/test_generator_base.py | 88 ++++++++++++++---------- ntt/test/ctest/test_ntt_cast.cpp | 13 ++-- ntt/test/ntt_test.h | 9 +-- 10 files changed, 122 insertions(+), 90 deletions(-) diff --git a/ntt/CMakeLists.txt b/ntt/CMakeLists.txt index 1de56acea3..4854869b80 100644 --- a/ntt/CMakeLists.txt +++ b/ntt/CMakeLists.txt @@ -7,5 +7,5 @@ if(BUILD_TESTING) endif() if(BUILD_BENCHMARK) - add_subdirectory(test/benchmark_test) + # add_subdirectory(test/benchmark_test) endif() diff --git a/ntt/include/nncase/float8.h b/ntt/include/nncase/float8.h index 3cc06ed154..cf608236a3 100644 --- a/ntt/include/nncase/float8.h +++ b/ntt/include/nncase/float8.h @@ -510,6 +510,17 @@ struct alignas(1) float_e4m3_t : float8_base { CUTLASS_HOST_DEVICE explicit float_e4m3_t(size_t x) : float_e4m3_t(float(x)) {} + CUTLASS_HOST_DEVICE + explicit float_e4m3_t(bfloat16 x) : float_e4m3_t(float(x)) {} + + CUTLASS_HOST_DEVICE + explicit float_e4m3_t(int64_t x) : float_e4m3_t(float(x)) {} + + + CUTLASS_HOST_DEVICE + explicit float_e4m3_t(uint32_t x) : float_e4m3_t(float(x)) {} + + /// E5M2 conversion. Defined after float_e5m2_t is defined. 
CUTLASS_HOST_DEVICE explicit float_e4m3_t(float_e5m2_t x); @@ -709,6 +720,12 @@ struct alignas(1) float_e5m2_t : float8_base { CUTLASS_HOST_DEVICE explicit float_e5m2_t(bfloat16 x) : float_e5m2_t(float(x)) {} + CUTLASS_HOST_DEVICE + explicit float_e5m2_t(int64_t x) : float_e5m2_t(float(x)) {} + + CUTLASS_HOST_DEVICE + explicit float_e5m2_t(uint32_t x) : float_e5m2_t(float(x)) {} + /// E4M3 conversion CUTLASS_HOST_DEVICE explicit float_e5m2_t(float_e4m3_t x); diff --git a/ntt/include/nncase/ntt/kernels/cast.h b/ntt/include/nncase/ntt/kernels/cast.h index cb56762154..87ca821bfe 100644 --- a/ntt/include/nncase/ntt/kernels/cast.h +++ b/ntt/include/nncase/ntt/kernels/cast.h @@ -101,6 +101,7 @@ class cast_impl { #if 0 template constexpr void + //rest_dims is the dims of the tensor to be casted apply(const TContiguousDims &conti_dims, const TRestDims &rest_dims, dynamic_shape_t &index, const TIn &input, TOut &output) { if (conti_dims == rest_dims.rank()) { diff --git a/ntt/include/nncase/ntt/ukernels/u_cast.h b/ntt/include/nncase/ntt/ukernels/u_cast.h index 9c9bdbeeaa..762352188b 100644 --- a/ntt/include/nncase/ntt/ukernels/u_cast.h +++ b/ntt/include/nncase/ntt/ukernels/u_cast.h @@ -16,6 +16,7 @@ #include "../post_ops.h" #include "../primitive_ops.h" #include "../vector.h" +#include "../apply.h" namespace nncase::ntt { namespace ukernels { @@ -82,32 +83,30 @@ struct u_cast { while (count / unroll) { for (size_t i = 0; i < unroll; i++) { - auto tmp_output = ntt::ops::cast()(*input); - for (auto s = 0; s < out_offset_scale; s++) { - *output = *((T2 *)(&tmp_output(s))); - (*output) = TPostOps()(*output); - output += output_stride; - } + auto temp = ntt::ops::cast()(*input); + std::memcpy(output, &temp, sizeof(temp)); input += input_stride * in_offset_scale; count--; } } for (size_t i = 0; i < count; i++) { - auto tmp_output = ntt::ops::cast()(*input); - for (auto s = 0; s < out_offset_scale; s++) { - *output = *((T2 *)(&tmp_output(s))); - (*output) = TPostOps()(*output); - output += output_stride; - } + auto temp = ntt::ops::cast()(*input); + std::memcpy(output, &temp, sizeof(temp)); input += input_stride * in_offset_scale; } } else { while (count / unroll) { for (size_t i = 0; i < unroll; i++) { - *output = ntt::ops::cast()(*input); - (*output) = TPostOps()(*output); + if constexpr (!Vector) { + *output = ntt::ops::cast()(*input); + } else { + auto temp = ntt::ops::cast()(*input); + ntt::apply(temp.shape(), [&](auto index) { + (*output)(index) = temp(index); + }); + } input += input_stride * in_offset_scale; output += output_stride * out_offset_scale; count--; @@ -115,8 +114,16 @@ struct u_cast { } for (size_t i = 0; i < count; i++) { - *output = ntt::ops::cast()(*input); - (*output) = TPostOps()(*output); + // auto temp = ntt::ops::cast()(*input); + // std::memcpy(output, &temp, sizeof(temp)); + if constexpr (!Vector) { + *output = ntt::ops::cast()(*input); + } else { + auto temp = ntt::ops::cast()(*input); + ntt::apply(temp.shape(), [&](auto index) { + (*output)(index) = temp(index); + }); + } input += input_stride * in_offset_scale; output += output_stride * out_offset_scale; } diff --git a/ntt/test/ctest/CMakeLists.txt b/ntt/test/ctest/CMakeLists.txt index 928644cc09..a8fe4ad7cf 100644 --- a/ntt/test/ctest/CMakeLists.txt +++ b/ntt/test/ctest/CMakeLists.txt @@ -19,6 +19,8 @@ set(GENERATE_PACK_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/generate_pack_tests.py) set(GENERATED_PACK_CMAKE ${CMAKE_CURRENT_SOURCE_DIR}/generated_pack_tests.cmake) set(GENERATE_UNPACK_SCRIPT 
${CMAKE_CURRENT_SOURCE_DIR}/generate_unpack_tests.py) set(GENERATED_UNPACK_CMAKE ${CMAKE_CURRENT_SOURCE_DIR}/generated_unpack_tests.cmake) +set(GENERATE_CAST_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/generate_cast_tests.py) +set(GENERATED_CAST_CMAKE ${CMAKE_CURRENT_SOURCE_DIR}/generated_cast_tests.cmake) # Macro to run a generator script only when the output is missing or outdated macro(run_generator_if_needed script_path output_file) @@ -40,11 +42,12 @@ endmacro() # Run the generators run_generator_if_needed(${GENERATE_PACK_SCRIPT} ${GENERATED_PACK_CMAKE}) run_generator_if_needed(${GENERATE_UNPACK_SCRIPT} ${GENERATED_UNPACK_CMAKE}) +run_generator_if_needed(${GENERATE_CAST_SCRIPT} ${GENERATED_CAST_CMAKE}) # Include the generated cmake files which define test source variables include(${GENERATED_PACK_CMAKE}) include(${GENERATED_UNPACK_CMAKE}) - +include(${GENERATED_CAST_CMAKE}) macro(add_test_exec name) add_executable(${name} ${name}.cpp) @@ -67,19 +70,18 @@ file(GLOB HANDWRITTEN_TESTS CONFIGURE_DEPENDS # test_ntt_gather.cpp # test_ntt_layer_norm.cpp test_ntt_matmul.cpp - # test_ntt_pack_generated_Float8e4m3.cpp - # test_ntt_reduce.cpp - # test_ntt_rms_norm.cpp - # test_ntt_scatter_nd.cpp - # test_ntt_slice.cpp - # test_ntt_softmax.cpp - # test_ntt_transpose.cpp - # test_ntt_transpose_half.cpp - # test_ntt_unpack.cpp + test_ntt_reduce.cpp + test_ntt_rms_norm.cpp + test_ntt_scatter_nd.cpp + test_ntt_slice.cpp + test_ntt_softmax.cpp + test_ntt_transpose.cpp + test_ntt_unpack.cpp # test_ntt_where.cpp ) -# list(APPEND TEST_NAMES ${HANDWRITTEN_TESTS} ${GENERATED_PACK_TEST_SOURCES} ${GENERATED_UNPACK_TEST_SOURCES}) +# list(APPEND TEST_NAMES ${HANDWRITTEN_TESTS} ${GENERATED_PACK_TEST_SOURCES} ${GENERATED_UNPACK_TEST_SOURCES} ${GENERATED_CAST_TEST_SOURCES}) +list(APPEND TEST_NAMES ${GENERATED_CAST_TEST_SOURCES}) foreach(test_name ${TEST_NAMES}) diff --git a/ntt/test/ctest/generate_pack_tests.py b/ntt/test/ctest/generate_pack_tests.py index 61a4ab99fd..46b377b8e3 100644 --- a/ntt/test/ctest/generate_pack_tests.py +++ b/ntt/test/ctest/generate_pack_tests.py @@ -34,11 +34,6 @@ def generate_test_name(self, datatype, shape_type, vector_dim, continuity: Conti return "_".join(parts) - def generate_pack_axes_str(self, axes): - if len(axes) == 1: - return f"ntt::fixed_shape_v<{axes[0]}>" - else: - return f"ntt::fixed_shape_v<{', '.join(map(str, axes))}>" def generate_ort_reference(self, input_dims, input_dim_names, pack_axes): code = [] diff --git a/ntt/test/ctest/generate_unpack_tests.py b/ntt/test/ctest/generate_unpack_tests.py index d51c941bb2..7574e9840f 100644 --- a/ntt/test/ctest/generate_unpack_tests.py +++ b/ntt/test/ctest/generate_unpack_tests.py @@ -66,12 +66,7 @@ def generate_ort_reference(self, input_dims, input_dim_names, unpack_axes, P): reshape_source = "ort_input" # 2. Reshape to final output shape - output_dims = [] - for i, name in enumerate(input_dim_names): - if i in unpack_axes: - output_dims.append(f"{name} * P") - else: - output_dims.append(name) + output_dims = self.get_unpacked_dims(input_dim_names, unpack_axes) code.append(f"int64_t reshape_data[] = {{{', '.join(output_dims)}}};") code.append("int64_t reshape_shape[] = {std::size(reshape_data)};") @@ -114,12 +109,7 @@ def generate_ntt_output_to_test(self, datatype, shape_type, dim_names, continuit var_name="ntt_input")) # 2. 
NTT operation (unpack) - output_dims = [] - for i, name in enumerate(dim_names): - if i in unpack_axes: - output_dims.append(f"{name} * P") - else: - output_dims.append(name) + output_dims = self.get_unpacked_dims(dim_names, unpack_axes) output_shape_expr = self.generate_shape_init(shape_type, output_dims) unpack_call_code = self.generate_ntt_ops(unpack_axes) diff --git a/ntt/test/ctest/test_generator_base.py b/ntt/test/ctest/test_generator_base.py index a95fdb53f6..3776986b19 100644 --- a/ntt/test/ctest/test_generator_base.py +++ b/ntt/test/ctest/test_generator_base.py @@ -35,7 +35,20 @@ class BaseTestGenerator: def __init__(self): self.test_cases = [] + def get_unpacked_dims(self, dim_names, unpack_axes) -> List[str]: + """Generate dimension expressions for an unpack operation.""" + output_dims = [] + ndim = len(dim_names) + positive_unpack_axes = [ax if ax >= 0 else ndim + ax for ax in unpack_axes] + for i, name in enumerate(dim_names): + if i in positive_unpack_axes: + output_dims.append(f"{name} * P") + else: + output_dims.append(name) + return output_dims + def generate_shape_init(self, shape_type, dims): + assert shape_type in ["fixed", "dynamic"], f"Invalid shape type: {shape_type}" if shape_type == "fixed": dim_strs = [f"{d}" for d in dims] return f"ntt::fixed_shape_v<{', '.join(dim_strs)}>" @@ -43,30 +56,25 @@ def generate_shape_init(self, shape_type, dims): dim_strs = [str(d) for d in dims] return f"ntt::make_shape({', '.join(dim_strs)})" - def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, vector_rank, P=None, axes_count=1): - code = [] - shape_expr = self.generate_shape_init(shape_type, dims) - - # Determine element type based on vector_rank + def get_element_cpp_type(self, datatype: DataType, vector_rank: int, P: Optional[str]) -> str: + """Determine element C++ type based on vector_rank.""" if vector_rank == 0: - element_cpp_type = datatype.cpp_type - elif vector_rank == 1: - if P is None: - raise ValueError("P must be provided for vector_rank 1") - element_cpp_type = f"ntt::vector<{datatype.cpp_type}, {P}>" - elif vector_rank > 1: - if P is None or axes_count is None: - raise ValueError("P and axes_count must be provided for vector_rank > 1") - ps = ', '.join([str(P)] * axes_count) - elif vector_rank > 0: + return datatype.cpp_type + if vector_rank > 0: if P is None: raise ValueError("P must be provided for vector_rank > 0") - + # The rank of the vector is determined by vector_rank. 
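            # Illustrative call (hypothetical arguments): with the float
            # DataType, get_element_cpp_type(dt_float32, 2, "P") returns
            # "ntt::vector<float, P, P>"; vector_rank == 0 yields plain "float".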
ps = ', '.join([f"P"] * vector_rank) - element_cpp_type = f"ntt::vector<{datatype.cpp_type}, {ps}>" - else: - raise ValueError(f"Invalid vector_rank: {vector_rank}") + return f"ntt::vector<{datatype.cpp_type}, {ps}>" + + raise ValueError(f"Invalid vector_rank: {vector_rank}") + + def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, vector_rank, P=None, axes_count=1): + code = [] + shape_expr = self.generate_shape_init(shape_type, dims) + + element_cpp_type = self.get_element_cpp_type(datatype, vector_rank, P) if continuity.is_contiguous: code.append(f"auto {var_name} = ntt::make_tensor<{element_cpp_type}>({shape_expr});") @@ -92,15 +100,15 @@ def generate_tensor_init(self, datatype, shape_type, dims, continuity, var_name, return code - def generate_test_prologue(self, test_suite_prefix, datatype, test_name, P, dim_names, dims, axes=None): + def generate_test_prologue(self, test_suite_prefix, datatype, test_name, P, dim_names, dims, pack_axes=None): """generate test function header, constant P and dimension constants""" code = [f"TEST({test_suite_prefix}_{datatype.name_suffix}, {test_name}) {{"] - if P: + if (P and (pack_axes is not None)) or ("unpack" in test_name): code.append(f" constexpr size_t P = {P};") # define dimension constants for i, (name, size) in enumerate(zip(dim_names, dims)): - if axes and i in axes: + if pack_axes and i in pack_axes: code.append(f" constexpr size_t {name}_coefficient = {size};") code.append(f" constexpr size_t {name} = {name}_coefficient * P;") else: @@ -110,6 +118,27 @@ def generate_test_prologue(self, test_suite_prefix, datatype, test_name, P, dim_ f" {datatype.cpp_type} max_input = {datatype.max_val};", ""]) return code + def generate_copy_to_contiguous_code(self, input_element_type, shape_type, dim_names, input_var_name="ntt_input", output_var_name="continuous_input"): + code = [] + input_dims_expr = [f"{name}" for name in dim_names] + code.append(" // Copy to contiguous tensor for ORT reference") + code.append(f" auto {output_var_name} = ntt::make_tensor<{input_element_type}>({self.generate_shape_init(shape_type, input_dims_expr)});") + code.append(" ") + for i, name in enumerate(dim_names): + code.append(f" {' ' * i}for (size_t {name.lower()} = 0; {name.lower()} < {name}; {name.lower()}++) {{") + indices = [f"{name.lower()}" for name in dim_names] + code.append(f" {' ' * len(dim_names)}{output_var_name}({', '.join(indices)}) = {input_var_name}({', '.join(indices)});") + for i in range(len(dim_names) - 1, -1, -1): + code.append(f" {' ' * i}}}") + code.append("") + return code, output_var_name + + def generate_pack_axes_str(self, axes): + if len(axes) == 1: + return f"ntt::fixed_shape_v<{axes[0]}>" + else: + return f"ntt::fixed_shape_v<{', '.join(map(str, axes))}>" + def generate_reference_and_comparison_code(self, datatype, continuity, dim_names, shape_type, is_fp8, input_element_type, @@ -119,24 +148,13 @@ def generate_reference_and_comparison_code(self, ntt_output_var_name = "ntt_output1", ntt_output_var_is_vector = False): code = [] - input_dims_expr = [f"{name}" for name in dim_names] - ort_input_tensor = "ntt_input" if not continuity.is_contiguous: if is_fp8: ort_input_tensor = "ntt_input_uint8" else: - code.append(" // Copy to contiguous tensor for ORT reference") - code.append(f" auto continuous_input = ntt::make_tensor<{input_element_type}>({self.generate_shape_init(shape_type, input_dims_expr)});") - code.append(" ") - for i, name in enumerate(dim_names): - code.append(f" {' ' * i}for (size_t {name.lower()} = 0; 
{name.lower()} < {name}; {name.lower()}++) {{") - indices = [f"{name.lower()}" for name in dim_names] - code.append(f" {' ' * len(dim_names)}continuous_input({', '.join(indices)}) = ntt_input({', '.join(indices)});") - for i in range(len(dim_names)-1, -1, -1): - code.append(f" {' ' * i}}}") - code.append("") - ort_input_tensor = "continuous_input" + copy_code, ort_input_tensor = self.generate_copy_to_contiguous_code(input_element_type, shape_type, dim_names) + code.extend(copy_code) elif is_fp8: ort_input_tensor = "ntt_input_uint8" diff --git a/ntt/test/ctest/test_ntt_cast.cpp b/ntt/test/ctest/test_ntt_cast.cpp index fe164fbaf3..0ac2fc8b6c 100644 --- a/ntt/test/ctest/test_ntt_cast.cpp +++ b/ntt/test/ctest/test_ntt_cast.cpp @@ -446,21 +446,21 @@ TEST(CastTestFloat32ToFloat8E4M3, NoVectorize) { constexpr size_t N = 32; float min_input = -500.0f; float max_input = 500.0f; - +//# generate ntt output to test // init auto ntt_input = ntt::make_tensor(ntt::fixed_shape_v); NttTest::init_tensor(ntt_input, min_input, max_input); // ntt auto ntt_output1 = ntt::make_tensor(ntt::fixed_shape_v); - ntt::cast(ntt_input, ntt_output1, ntt::fixed_shape_v<>); - + ntt::cast(ntt_input, ntt_output1); +//# generate_ntt_golden_output // float8 auto ntt_output2 = ntt::make_tensor(ntt::fixed_shape_v); nncase::ntt::apply(ntt_input.shape(), [&](auto index) { (ntt_output2)(index) = (float_e4m3_t)(ntt_input)(index); }); - +//# compare // compare EXPECT_TRUE(NttTest::compare_tensor(ntt_output1, ntt_output2)); } @@ -471,7 +471,7 @@ TEST(CastTestFloat32ToFloat8E4M3, Vectorize) { constexpr size_t P = NTT_VLEN / (sizeof(float) * 8); float min_input = -500.0f; float max_input = 500.0f; - +//# generate ntt output to test // init auto ntt_input = ntt::make_tensor(ntt::fixed_shape_v); NttTest::init_tensor(ntt_input, min_input, max_input); @@ -486,12 +486,13 @@ TEST(CastTestFloat32ToFloat8E4M3, Vectorize) { auto ntt_output1 = ntt::make_tensor(ntt::fixed_shape_v); ntt::unpack(vectorize_output, ntt_output1, ntt::fixed_shape_v<0>); +//# generate_ntt_golden_output // float8 auto ntt_output2 = ntt::make_tensor(ntt::fixed_shape_v); nncase::ntt::apply(ntt_input.shape(), [&](auto index) { (ntt_output2)(index) = float_e4m3_t((ntt_input)(index)); }); - +//# compare // compare EXPECT_TRUE(NttTest::compare_tensor(ntt_output1, ntt_output2)); } diff --git a/ntt/test/ntt_test.h b/ntt/test/ntt_test.h index bdaa6aa96c..a83579dad2 100644 --- a/ntt/test/ntt_test.h +++ b/ntt/test/ntt_test.h @@ -298,10 +298,11 @@ bool compare_tensor(TTensor1 &lhs, TTensor2 &rhs, double threshold = 0.999f) { } // 2D vector -template - requires(TTensor::element_type::rank() == 2) -bool compare_tensor(TTensor &lhs, TTensor &rhs, double threshold = 0.999f) { - using vector_type = typename TTensor::element_type; +template + requires(TTensor1::element_type::rank() == 2 && + TTensor2::element_type::rank() == 2) +bool compare_tensor(TTensor1 &lhs, TTensor2 &rhs, double threshold = 0.999f) { + using vector_type = typename TTensor1::element_type; constexpr size_t N0 = vector_type::template lane<0>(); constexpr size_t N1 = vector_type::template lane<1>(); From a75552e52130487ff678940cb7fcd55c17f2d9f9 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 7 Jul 2025 09:54:41 +0000 Subject: [PATCH 06/49] binary test generator 50% cast, pack, unpack stay right --- ntt/include/nncase/ntt/vector_ops.h | 26 ++ ntt/test/ctest/CMakeLists.txt | 22 +- ntt/test/ctest/generate_binary_tests.py | 332 ++++++++++++++++++++++++ ntt/test/ctest/generate_cast_tests.py | 304 ++++++++++++++++++++++ 
ntt/test/ctest/generate_pack_tests.py | 9 +- ntt/test/ctest/test_generator_base.py | 169 +++++++----- ntt/test/ctest/test_ntt_binary_add.cpp | 35 ++- 7 files changed, 814 insertions(+), 83 deletions(-) create mode 100644 ntt/test/ctest/generate_binary_tests.py create mode 100644 ntt/test/ctest/generate_cast_tests.py diff --git a/ntt/include/nncase/ntt/vector_ops.h b/ntt/include/nncase/ntt/vector_ops.h index de2d35c02d..5a9bee4ed0 100644 --- a/ntt/include/nncase/ntt/vector_ops.h +++ b/ntt/include/nncase/ntt/vector_ops.h @@ -67,6 +67,8 @@ struct tensor_unary_impl { template