From 2faf03eca07f51cfcd9d0745ccf17219bfcee318 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Mon, 7 Aug 2023 19:52:12 +0200 Subject: [PATCH 1/4] Enable templates for all of YATeTo (no templating for individual kernels or tensors... yet) --- yateto/codegen/code.py | 12 ++ yateto/codegen/visitor.py | 308 ++++++++++++++++++++------------------ yateto/generator.py | 13 +- 3 files changed, 186 insertions(+), 147 deletions(-) diff --git a/yateto/codegen/code.py b/yateto/codegen/code.py index 9cf7896..45666b1 100644 --- a/yateto/codegen/code.py +++ b/yateto/codegen/code.py @@ -160,6 +160,18 @@ def Class(self, name): def classDeclaration(self, name): return self.__call__('class {};'.format(name)) + + def TemplateStructForward(self, name, params): + if params is None: + return self.__call__(f'struct {name};') + else: + return self.__call__(f'template<{params}> struct {name};') + + def TemplateStruct(self, name, values): + if values is None: + return Block(self, f'struct {name}', foot=';') + else: + return Block(self, f'template<> struct {name} <{values}>', foot=';') def forwardStruct(self, name): self.__call__('struct {};'.format(name)) diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py index 9195d18..70eb1e6 100644 --- a/yateto/codegen/visitor.py +++ b/yateto/codegen/visitor.py @@ -47,8 +47,9 @@ class KernelGenerator(object): PREFETCHVAR_NAME = '_prefetch' BUFFER_NAME = '_buffer' - def __init__(self, arch): + def __init__(self, arch, template): self._arch = arch + self._template = template @classmethod def _bufferName(cls, buf): @@ -107,8 +108,8 @@ class OptimisedKernelGenerator(KernelGenerator): TEMP_MAX_MEM_REQUIRED_NAME = 'TmpMaxMemRequiredInBytes' - def __init__(self, arch, routineCache): - super().__init__(arch) + def __init__(self, arch, routineCache, template): + super().__init__(arch, template) self._routineCache = routineCache class KernelOutline(object): @@ -233,133 +234,135 @@ def generate(self, cpp, header, name, kernelOutlines, familyStride=None): formatArray = lambda lst: lst[0] brackets = '' - with header.Namespace(self.NAMESPACE): - with header.Struct(name): - header('{} {} const {}{} = {};'.format( - MODIFIERS, - self._arch.ulongTypename, - self.NONZEROFLOPS_NAME, - brackets, - formatArray([kernelOutline.nonZeroFlops if kernelOutline else 0 for kernelOutline in kernelOutlines]) - )) - header('{} {} const {}{} = {};'.format( - MODIFIERS, - self._arch.ulongTypename, - self.HARDWAREFLOPS_NAME, - brackets, - formatArray([kernelOutline.hwFlops if kernelOutline else 0 for kernelOutline in kernelOutlines]) - )) - - # tmp mem required by a kernel(s) - tmp_mem_list = [kernelOutline.tmp_mem_size if kernelOutline else 0 for kernelOutline in kernelOutlines] - header('{} {} const {}{} = {};'.format(MODIFIERS, - self._arch.ulongTypename, - self.TEMP_MEM_REQUIRED_NAME, - brackets, - formatArray(tmp_mem_list))) + with header.Struct(name): + header('{} {} const {}{} = {};'.format( + MODIFIERS, + self._arch.ulongTypename, + self.NONZEROFLOPS_NAME, + brackets, + formatArray([kernelOutline.nonZeroFlops if kernelOutline else 0 for kernelOutline in kernelOutlines]) + )) + header('{} {} const {}{} = {};'.format( + MODIFIERS, + self._arch.ulongTypename, + self.HARDWAREFLOPS_NAME, + brackets, + formatArray([kernelOutline.hwFlops if kernelOutline else 0 for kernelOutline in kernelOutlines]) + )) - header('{} {} const {} = {};'.format(MODIFIERS, - self._arch.ulongTypename, - self.TEMP_MAX_MEM_REQUIRED_NAME, - max(tmp_mem_list))) + # tmp mem required by a kernel(s) + tmp_mem_list = [kernelOutline.tmp_mem_size if kernelOutline else 0 for kernelOutline in kernelOutlines] + header('{} {} const {}{} = {};'.format(MODIFIERS, + self._arch.ulongTypename, + self.TEMP_MEM_REQUIRED_NAME, + brackets, + formatArray(tmp_mem_list))) - if target == 'gpu': - # LinearAllocatorT controls external extra mem. allocated on gpu for tmp. variables - header(f'yateto::LinearAllocatorT<{self._arch.typename}> linearAllocator;') + header('{} {} const {} = {};'.format(MODIFIERS, + self._arch.ulongTypename, + self.TEMP_MAX_MEM_REQUIRED_NAME, + max(tmp_mem_list))) - header.emptyline() + if target == 'gpu': + # LinearAllocatorT controls external extra mem. allocated on gpu for tmp. variables + header(f'yateto::LinearAllocatorT<{self._arch.typename}> linearAllocator;') - for scalar in scalars: - header('{0} {1} = std::numeric_limits<{0}>::signaling_NaN();'.format(self._arch.typename, scalar)) + header.emptyline() - def kernelArgs(base_name_with_namespace, groups, writable, is_constant, target): + for scalar in scalars: + header('{0} {1} = std::numeric_limits<{0}>::signaling_NaN();'.format(self._arch.typename, scalar)) + + def kernelArgs(base_name_with_namespace, groups, writable, is_constant, target): + prefix, base_name = Tensor.splitBasename(base_name_with_namespace) + typ = self._arch.typename + ptr_type = '**' if not is_constant and target == 'gpu' else '*' + if not writable: + typ += ' const' + if len(next(iter(groups))) > 0: + class_name = f'{prefix}{InitializerGenerator.TENSOR_NAMESPACE}::{base_name}' + container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{typ}{ptr_type}>' + header(f'{class_name}::{container_type} {base_name};') + else: + header(f'{typ}{ptr_type} {base_name}{{}};') + + for baseName, groups in tensors.items(): + kernelArgs(baseName, + groups, + writable[baseName], + is_compute_constant_tensors[baseName], + target) + header.emptyline() + + # containers with extra offsets for GPU-like computations + if target == 'gpu': + header(f'unsigned {BatchedOperationsAux.NUM_ELEMENTS_NAME} = 0;') + header(f'void *{BatchedOperationsAux.STREAM_PTR_NAME} = {BatchedOperationsAux.FORBIDDEN_STREAM_PTR};') + header(f'unsigned *{BatchedOperationsAux.FLAGS_NAME} = nullptr;') + + def generate_extra_offset_args(base_name_with_namespace, groups): prefix, base_name = Tensor.splitBasename(base_name_with_namespace) - typ = self._arch.typename - ptr_type = '**' if not is_constant and target == 'gpu' else '*' - if not writable: - typ += ' const' + offset_type = 'int' + offset_name = f'{BatchedOperationsAux.EXTRA_OFFSET_NAME}_{base_name}' if len(next(iter(groups))) > 0: class_name = f'{prefix}{InitializerGenerator.TENSOR_NAMESPACE}::{base_name}' - container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{typ}{ptr_type}>' - header(f'{class_name}::{container_type} {base_name};') + container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{offset_type}>' + header(f'{class_name}::{container_type} {offset_name};') else: - header(f'{typ}{ptr_type} {base_name}{{}};') - - for baseName, groups in tensors.items(): - kernelArgs(baseName, - groups, - writable[baseName], - is_compute_constant_tensors[baseName], - target) - header.emptyline() + header(f'{offset_type} {offset_name}{{}};') - # containers with extra offsets for GPU-like computations - if target == 'gpu': - header(f'unsigned {BatchedOperationsAux.NUM_ELEMENTS_NAME} = 0;') - header(f'void *{BatchedOperationsAux.STREAM_PTR_NAME} = {BatchedOperationsAux.FORBIDDEN_STREAM_PTR};') - header(f'unsigned *{BatchedOperationsAux.FLAGS_NAME} = nullptr;') - - def generate_extra_offset_args(base_name_with_namespace, groups): - prefix, base_name = Tensor.splitBasename(base_name_with_namespace) - offset_type = 'int' - offset_name = f'{BatchedOperationsAux.EXTRA_OFFSET_NAME}_{base_name}' - if len(next(iter(groups))) > 0: - class_name = f'{prefix}{InitializerGenerator.TENSOR_NAMESPACE}::{base_name}' - container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{offset_type}>' - header(f'{class_name}::{container_type} {offset_name};') - else: - header(f'{offset_type} {offset_name}{{}};') - - for base_name, groups in tensors.items(): - generate_extra_offset_args(base_name, groups) + for base_name, groups in tensors.items(): + generate_extra_offset_args(base_name, groups) + header.emptyline() + + if len(prefetch) > 0: + with header.Struct(self.PREFETCHSTRUCT_NAME): + for baseName, groups in prefetch.items(): + kernelArgs(baseName, groups, writable=False, is_constant=False, target='any') + header('{} {};'.format(self.PREFETCHSTRUCT_NAME, self.PREFETCHVAR_NAME)) header.emptyline() - if len(prefetch) > 0: - with header.Struct(self.PREFETCHSTRUCT_NAME): - for baseName, groups in prefetch.items(): - kernelArgs(baseName, groups, writable=False, is_constant=False, target='any') - header('{} {};'.format(self.PREFETCHSTRUCT_NAME, self.PREFETCHVAR_NAME)) - header.emptyline() - - for index, kernelOutline in enumerate(kernelOutlines): - if kernelOutline: - header.functionDeclaration(executeName(index)) - - if familyStride is not None: - header('using {} = void ({}::*)();'.format(self.MEMBER_FUNCTION_PTR_NAME, name)) - header('{} {} {}[] = {};'.format( - MODIFIERS, - self.MEMBER_FUNCTION_PTR_NAME, - self.EXECUTE_ARRAY_NAME, - formatArray(['&{}::{}'.format(name, executeName(index)) if kernelOutline else 'nullptr' for index, kernelOutline in enumerate(kernelOutlines)]) - )) - args = typedNdArgs(len(familyStride), self._arch.uintTypename) - indexF = indexFun(familyStride) - with header.Function(self.FIND_EXECUTE_NAME, args, '{} {}'.format(MODIFIERS, self.MEMBER_FUNCTION_PTR_NAME)): - header('return {}[{}];'.format(self.EXECUTE_ARRAY_NAME, indexF)) - with header.Function(self.EXECUTE_NAME, args, '{} void'.format(INLINE)): - header('(this->*{}({}))();'.format(self.FIND_EXECUTE_NAME, ', '.join(ndargs(len(familyStride))))) - - aux_functions = [self.NONZEROFLOPS_NAME, self.HARDWAREFLOPS_NAME, self.TEMP_MEM_REQUIRED_NAME] - for function in aux_functions: - funName = function[:1].lower() + function[1:] - with header.Function(funName, args, '{} {}'.format(MODIFIERS, self._arch.ulongTypename)): - header('return {}[{}];'.format(function, indexF)) + for index, kernelOutline in enumerate(kernelOutlines): + if kernelOutline: + header.functionDeclaration(executeName(index)) + if familyStride is not None: + header('using {} = void ({}::*)();'.format(self.MEMBER_FUNCTION_PTR_NAME, name)) + header('{} {} {}[] = {};'.format( + MODIFIERS, + self.MEMBER_FUNCTION_PTR_NAME, + self.EXECUTE_ARRAY_NAME, + formatArray(['&{}::{}'.format(name, executeName(index)) if kernelOutline else 'nullptr' for index, kernelOutline in enumerate(kernelOutlines)]) + )) + args = typedNdArgs(len(familyStride), self._arch.uintTypename) + indexF = indexFun(familyStride) + with header.Function(self.FIND_EXECUTE_NAME, args, '{} {}'.format(MODIFIERS, self.MEMBER_FUNCTION_PTR_NAME)): + header('return {}[{}];'.format(self.EXECUTE_ARRAY_NAME, indexF)) + with header.Function(self.EXECUTE_NAME, args, '{} void'.format(INLINE)): + header('(this->*{}({}))();'.format(self.FIND_EXECUTE_NAME, ', '.join(ndargs(len(familyStride))))) + + aux_functions = [self.NONZEROFLOPS_NAME, self.HARDWAREFLOPS_NAME, self.TEMP_MEM_REQUIRED_NAME] + for function in aux_functions: + funName = function[:1].lower() + function[1:] + with header.Function(funName, args, '{} {}'.format(MODIFIERS, self._arch.ulongTypename)): + header('return {}[{}];'.format(function, indexF)) + + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' flopCounters = [self.NONZEROFLOPS_NAME, self.HARDWAREFLOPS_NAME] for fc in flopCounters: - cpp('{} {} const {}::{}::{}{};'.format( + cpp('{} {} const {}{}::{}{};'.format( CONSTEXPR, self._arch.ulongTypename, self.NAMESPACE, + templateval, name, fc, brackets )) if familyStride is not None: - cpp('{0} {1}::{2}::{3} {1}::{2}::{4}[];'.format( + cpp('{0} {1}{2}::{3}::{4} {1}{2}::{3}::{4}[];'.format( CONSTEXPR, self.NAMESPACE, + templateval, name, self.MEMBER_FUNCTION_PTR_NAME, self.EXECUTE_ARRAY_NAME @@ -368,7 +371,7 @@ def generate_extra_offset_args(base_name_with_namespace, groups): if kernelOutline is None: continue - with cpp.Function('{}::{}::{}'.format(self.NAMESPACE, name, executeName(index))): + with cpp.Function('{}{}::{}::{}'.format(self.NAMESPACE, templateval, name, executeName(index))): sclrs = sorted(list(kernelOutline.scalars), key=str) for scalar in sclrs: cpp('assert(!std::isnan({}));'.format(scalar)) @@ -389,8 +392,8 @@ def generate_extra_offset_args(base_name_with_namespace, groups): class UnitTestGenerator(KernelGenerator): KERNEL_VAR = 'krnl' - def __init__(self, arch): - super().__init__(arch) + def __init__(self, arch, template): + super().__init__(arch, template) @classmethod def _tensorName(cls, var): @@ -426,6 +429,7 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, gemm_cfg, testFra scalars = ScalarsSet().visit(cfg) scalars = sorted(scalars, key=str) variables = SortedGlobalsList().visit(cfg) + templateparam = '' if self._template[1] is None else '<' + self._template[1] + '>' kernel_prefix = '{}::'.format(namespace) if namespace else '' with cpp.Function(**testFramework.functionArgs(testName)): factory = UnitTestFactory(cpp, self._arch, self._name, testFramework) @@ -449,8 +453,9 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, gemm_cfg, testFra ) ) prefix = '{}::'.format(var.tensor.namespace) if var.tensor.namespace else '' - cpp( '{prefix}{initNS}::{baseName}::{viewStruct}{groupTemplate}::{createFun}({name}).copyToView({viewName});'.format( + cpp( '{prefix}{initNS}{template}::{baseName}::{viewStruct}{groupTemplate}::{createFun}({name}).copyToView({viewName});'.format( initNS = InitializerGenerator.INIT_NAMESPACE, + template = templateparam, supportNS = SUPPORT_LIBRARY_NAMESPACE, groupTemplate=self._groupTemplate(var), prefix=prefix, @@ -463,7 +468,7 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, gemm_cfg, testFra ) cpp.emptyline() - cpp( '{}{}::{} {};'.format(kernel_prefix, OptimisedKernelGenerator.NAMESPACE, kernelClass, self.KERNEL_VAR) ) + cpp( '{}{}{}::{} {};'.format(kernel_prefix, OptimisedKernelGenerator.NAMESPACE, templateparam, kernelClass, self.KERNEL_VAR) ) for scalar in scalars: cpp( '{0}.{1} = {1};'.format(self.KERNEL_VAR, scalar) ) for var in variables: @@ -554,8 +559,9 @@ def arrays(self, cpp, memLayout, arch, namespace, index, numberType, declaration cpp(self.formatArray(numberType, namespace + self.ROWIND_NAME + index, memLayout.rowIndex(), declarationOnly)) cpp(self.formatArray(numberType, namespace + self.COLPTR_NAME + index, memLayout.colPointer(), declarationOnly)) - def __init__(self, arch, tensors): + def __init__(self, arch, tensors, template): self._arch = arch + self._template = template self._numberType = '{} const'.format(self._arch.uintTypename) self._realType = '{} const'.format(self._arch.typename) self._realPtrType = self._realType + '*' @@ -602,43 +608,51 @@ def iterate_collect(self): def generateTensorsH(self, header): for namespace, tensor_dict in self.iterate_collect(): - with header.Namespace(namespace), header.Namespace(self.TENSOR_NAMESPACE): - for (baseName, baseNameWithoutNamespace), tensors in tensor_dict.items(): - with header.Struct(baseNameWithoutNamespace): - groupSize = self._groupSize[baseName] - self._tensor(header, '', tensors, groupSize, False) - args = ndargs(len(groupSize)) - typedArgs = typedNdArgs(len(groupSize), self._arch.uintTypename) - returnType = '{} {}'.format(MODIFIERS, self._arch.uintTypename) - if len(groupSize) > 0: - with header.Function(self.INDEX_FUN_NAME, typedArgs, returnType): - header('return {};'.format(indexFun(groupSizeToStride(groupSize)))) - with header.Function(self.SIZE_FUN_NAME, typedArgs, returnType): - if len(groupSize) == 0: - header('return {};'.format(self.SIZE_NAME)) - else: - header('return {}[{}({})];'.format(self.SIZE_NAME, self.INDEX_FUN_NAME, ', '.join(args))) - if len(groupSize) > 0: - header('template') - with header.Struct(self.CONTAINER_CLASS_NAME): - header('T {}[{}];'.format(self.CONTAINER_DATA_NAME, reduce(operator.mul, groupSize))) - header('{}() : {}{{}} {{}}'.format(self.CONTAINER_CLASS_NAME, self.CONTAINER_DATA_NAME)) - with header.Function('operator()', typedArgs, '{} T&'.format(INLINE)): - header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) - with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True): - header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) + with header.Namespace(namespace): + header.TemplateStructForward(self.TENSOR_NAMESPACE, self._template[0]) + with header.TemplateStruct(self.TENSOR_NAMESPACE, self._template[1]): + for (baseName, baseNameWithoutNamespace), tensors in tensor_dict.items(): + with header.Struct(baseNameWithoutNamespace): + groupSize = self._groupSize[baseName] + self._tensor(header, '', tensors, groupSize, False) + args = ndargs(len(groupSize)) + typedArgs = typedNdArgs(len(groupSize), self._arch.uintTypename) + returnType = '{} {}'.format(MODIFIERS, self._arch.uintTypename) + if len(groupSize) > 0: + with header.Function(self.INDEX_FUN_NAME, typedArgs, returnType): + header('return {};'.format(indexFun(groupSizeToStride(groupSize)))) + with header.Function(self.SIZE_FUN_NAME, typedArgs, returnType): + if len(groupSize) == 0: + header('return {};'.format(self.SIZE_NAME)) + else: + header('return {}[{}({})];'.format(self.SIZE_NAME, self.INDEX_FUN_NAME, ', '.join(args))) + if len(groupSize) > 0: + header('template') + with header.Struct(self.CONTAINER_CLASS_NAME): + header('T {}[{}];'.format(self.CONTAINER_DATA_NAME, reduce(operator.mul, groupSize))) + header('{}() : {}{{}} {{}}'.format(self.CONTAINER_CLASS_NAME, self.CONTAINER_DATA_NAME)) + with header.Function('operator()', typedArgs, '{} T&'.format(INLINE)): + header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) + with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True): + header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) def generateTensorsCpp(self, cpp): + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' for namespace, tensor_dict in self.iterate_collect(): with cpp.Namespace(namespace): for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): - self._tensor(cpp, '::'.join([self.TENSOR_NAMESPACE, base_name_without_namespace, '']), tensors, self._groupSize[base_name], True) + self._tensor(cpp, '::'.join([f'{self.TENSOR_NAMESPACE}{templateval}', base_name_without_namespace, '']), tensors, self._groupSize[base_name], True) def generateInitH(self, header): + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' for namespace, tensor_dict in self.iterate_collect(): - with header.Namespace(namespace), header.Namespace(self.INIT_NAMESPACE): + with header.Namespace(namespace): + header.TemplateStructForward(self.INIT_NAMESPACE, self._template[0]) + with header.TemplateStruct(self.INIT_NAMESPACE, self._template[1]): + for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): + self._init(header, base_name, base_name_without_namespace, '', tensors, False) for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): - self._init(header, base_name, base_name_without_namespace, '', tensors, False) + self._init2(header, base_name, base_name_without_namespace, f'{self.INIT_NAMESPACE}{templateval}', tensors, False) def generateInitCpp(self, cpp): for namespace, tensor_dict in self.iterate_collect(): @@ -646,9 +660,11 @@ def generateInitCpp(self, cpp): prefix_parts = [] if len(namespace) > 0: prefix_parts.append(namespace) - prefix_parts += [self.INIT_NAMESPACE, base_name_without_namespace, ''] + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' + prefix_parts += [f'{self.INIT_NAMESPACE}{templateval}', base_name_without_namespace, ''] prefix = '::'.join(prefix_parts) self._init(cpp, base_name, base_name_without_namespace, prefix, tensors, True) + self._init2(cpp, base_name, base_name_without_namespace, prefix, tensors, True) def _tensor(self, cpp, name, tensors, groupSize, declarationOnly): shape = {group: tensor.shape() for group,tensor in tensors.items()} @@ -680,7 +696,8 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat if len(valueNames) > 1: self._array(cpp, self._realPtrType, name + self.VALUES_BASENAME, valueNames, groupSize, alwaysArray=False, constexpr=False, static=False) else: - with cpp.Struct('{0} : {1}::{0}'.format(baseNameWithoutNamespace, self.TENSOR_NAMESPACE)): + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' + with cpp.Struct('{0} : {1}::{0}'.format(baseNameWithoutNamespace, f'{self.TENSOR_NAMESPACE}{templateval}')): for group,tensor in tensors.items(): ml = tensor.memoryLayout() tv = self._tensorViewGenerator(ml) @@ -712,6 +729,13 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat typedArgs = typedNdArgs(len(groupSize), self._arch.uintTypename) cpp('template<{}> struct {} {{}};'.format(typedArgs, self.VIEW_STRUCT_NAME)) + def _init2(self, cpp, baseName, baseNameWithoutNamespace, prefix, tensors, declarationOnly): + groupSize = self._groupSize[baseName] + stride = groupSizeToStride(groupSize) + index = lambda group: str(address(group, stride)) if len(group) > 0 else '' + viewArgs = self.TensorView.arguments(self._arch) + + if not declarationOnly: if len(groupSize) > 0: for group,tensor in tensors.items(): ml = tensor.memoryLayout() @@ -719,7 +743,7 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat typename = tv.typename(len(ml.shape()), self._arch) special = ','.join(str(g) for g in group) cpp('template<>') - with cpp.Struct('{}::{}<{}>'.format(baseNameWithoutNamespace, self.VIEW_STRUCT_NAME, special)): + with cpp.Struct('{}::{}::{}<{}>'.format(prefix, baseNameWithoutNamespace, self.VIEW_STRUCT_NAME, special)): cpp('typedef {} {};'.format(typename, self.VIEW_TYPE_NAME)) with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgs, returnType='{} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME)): tv.generate(cpp, ml, self._arch, index(group)) diff --git a/yateto/generator.py b/yateto/generator.py index ea59b3e..041ee20 100644 --- a/yateto/generator.py +++ b/yateto/generator.py @@ -263,7 +263,8 @@ def generate(self, namespace='yateto', gemm_cfg: GeneratorCollection = None, cost_estimator=BoundingBoxCostEstimator, - include_tensors=set()): + include_tensors=set(), + template=(None, None)): if not gemm_cfg: gemm_cfg = DefaultGeneratorCollection(self._arch) @@ -285,10 +286,10 @@ def generate(self, print('Generating unit tests...') def unit_test_body(cpp, testFramework): for kernel in self._kernels: - UnitTestGenerator(self._arch).generate(cpp, kernel.namespace, kernel.name, kernel.name, kernel.cfg, gemm_cfg, testFramework) + UnitTestGenerator(self._arch, template).generate(cpp, kernel.namespace, kernel.name, kernel.name, kernel.cfg, gemm_cfg, testFramework) for family in self._kernelFamilies.values(): for group, kernel in family.items(): - UnitTestGenerator(self._arch).generate(cpp, kernel.namespace, kernel.name, family.name, kernel.cfg, gemm_cfg, testFramework, group) + UnitTestGenerator(self._arch, template).generate(cpp, kernel.namespace, kernel.name, family.name, kernel.cfg, gemm_cfg, testFramework, group) with Cpp(fUTdoctest.cpp) as cpp: Doctest().generate(cpp, namespace, fKernels.hName, fInit.hName, unit_test_body) with Cpp(fUTcxxtest.h) as cpp: @@ -322,7 +323,7 @@ def unit_test_body(cpp, testFramework): print('Generating kernels...') cache = RoutineCache() - optKernelGenerator = OptimisedKernelGenerator(self._arch, cache) + optKernelGenerator = OptimisedKernelGenerator(self._arch, cache, template) kernelSource = StringIO() kernelSourceContent = '' @@ -341,6 +342,8 @@ def unit_test_body(cpp, testFramework): header.include(fTensors.hName) cpp.include(fKernels.hName) with cpp.Namespace(namespace), header.Namespace(namespace): + header.TemplateStructForward(optKernelGenerator.NAMESPACE, template[0]) + with header.TemplateStruct(optKernelGenerator.NAMESPACE, template[1]): # Group kernels by namespace for kernel_namespace, kernels in kernel_dict.items(): for kernel in kernels: @@ -398,7 +401,7 @@ def unit_test_body(cpp, testFramework): print('Generating initialization code...') # Sort order: Namespace, base name of group, idx of tensor in group sort_key = lambda x: (x.namespace, x.name()) - initGen = InitializerGenerator(self._arch, sorted(tensors.values(), key=sort_key)) + initGen = InitializerGenerator(self._arch, sorted(tensors.values(), key=sort_key), template) with Cpp(fTensors.h) as header: with header.HeaderGuard(self._headerGuardName(namespace, self.TENSORS_FILE_NAME)): with header.Namespace(namespace): From ccced93064ffd5dcc374b151ff319ac5f98e24e7 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sun, 13 Aug 2023 04:02:37 +0200 Subject: [PATCH 2/4] Fix bugs, make everything compile --- yateto/codegen/code.py | 4 +- yateto/codegen/visitor.py | 107 +++++++++++++++++++++----------------- yateto/generator.py | 69 ++++++++++++------------ 3 files changed, 97 insertions(+), 83 deletions(-) diff --git a/yateto/codegen/code.py b/yateto/codegen/code.py index 45666b1..8b2f9a7 100644 --- a/yateto/codegen/code.py +++ b/yateto/codegen/code.py @@ -177,7 +177,9 @@ def forwardStruct(self, name): self.__call__('struct {};'.format(name)) def Struct(self, name): - return Block(self, 'struct ' + name, foot=';') + if len(name) == 0: + return NoScope() + return Block(self, 'struct ' + name, foot='; // struct ' + name) def HeaderGuard(self, name): return HeaderGuard(self, name) diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py index 70eb1e6..0802f21 100644 --- a/yateto/codegen/visitor.py +++ b/yateto/codegen/visitor.py @@ -200,7 +200,8 @@ def _addFromKO(cls, koEntries, entries): entries[key] = entries[key] | value - def generate(self, cpp, header, name, kernelOutlines, familyStride=None): + def generate(self, cpp, header, name, kernelOutlines, namespace, familyStride=None): + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' tensors = collections.OrderedDict() prefetch = collections.OrderedDict() writable = dict() @@ -279,7 +280,7 @@ def kernelArgs(base_name_with_namespace, groups, writable, is_constant, target): if not writable: typ += ' const' if len(next(iter(groups))) > 0: - class_name = f'{prefix}{InitializerGenerator.TENSOR_NAMESPACE}::{base_name}' + class_name = f'{InitializerGenerator.TENSOR_NAMESPACE}{templateval}::{base_name_with_namespace}' container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{typ}{ptr_type}>' header(f'{class_name}::{container_type} {base_name};') else: @@ -346,32 +347,31 @@ def generate_extra_offset_args(base_name_with_namespace, groups): with header.Function(funName, args, '{} {}'.format(MODIFIERS, self._arch.ulongTypename)): header('return {}[{}];'.format(function, indexF)) - templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' + namespaceval = '' if namespace == '' or namespace is None else f'{namespace}::' flopCounters = [self.NONZEROFLOPS_NAME, self.HARDWAREFLOPS_NAME] - for fc in flopCounters: - cpp('{} {} const {}{}::{}{};'.format( - CONSTEXPR, - self._arch.ulongTypename, - self.NAMESPACE, - templateval, - name, - fc, - brackets - )) - if familyStride is not None: - cpp('{0} {1}{2}::{3}::{4} {1}{2}::{3}::{4}[];'.format( - CONSTEXPR, - self.NAMESPACE, - templateval, - name, - self.MEMBER_FUNCTION_PTR_NAME, - self.EXECUTE_ARRAY_NAME - )) + #for fc in flopCounters: + # header('{} {} static const {}{}{};'.format( + # CONSTEXPR, + # self._arch.ulongTypename, + # name, + # fc, + # brackets + # )) + #if familyStride is not None: + # cpp('{0} {1}{2}::{3}{4}::{5} {1}{2}::{3}{4}::{6}[];'.format( + # CONSTEXPR, + # self.NAMESPACE, + # templateval, + # namespaceval, + # name, + # self.MEMBER_FUNCTION_PTR_NAME, + # self.EXECUTE_ARRAY_NAME + # )) for index, kernelOutline in enumerate(kernelOutlines): if kernelOutline is None: continue - with cpp.Function('{}{}::{}::{}'.format(self.NAMESPACE, templateval, name, executeName(index))): + with cpp.Function('{}{}::{}{}::{}'.format(self.NAMESPACE, templateval, namespaceval, name, executeName(index))): sclrs = sorted(list(kernelOutline.scalars), key=str) for scalar in sclrs: cpp('assert(!std::isnan({}));'.format(scalar)) @@ -607,10 +607,10 @@ def iterate_collect(self): yield cur_namespace, cur_dict def generateTensorsH(self, header): - for namespace, tensor_dict in self.iterate_collect(): - with header.Namespace(namespace): - header.TemplateStructForward(self.TENSOR_NAMESPACE, self._template[0]) - with header.TemplateStruct(self.TENSOR_NAMESPACE, self._template[1]): + header.TemplateStructForward(self.TENSOR_NAMESPACE, self._template[0]) + with header.TemplateStruct(self.TENSOR_NAMESPACE, self._template[1]): + for namespace, tensor_dict in self.iterate_collect(): + with header.Struct(namespace): for (baseName, baseNameWithoutNamespace), tensors in tensor_dict.items(): with header.Struct(baseNameWithoutNamespace): groupSize = self._groupSize[baseName] @@ -639,29 +639,42 @@ def generateTensorsH(self, header): def generateTensorsCpp(self, cpp): templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' for namespace, tensor_dict in self.iterate_collect(): - with cpp.Namespace(namespace): - for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): - self._tensor(cpp, '::'.join([f'{self.TENSOR_NAMESPACE}{templateval}', base_name_without_namespace, '']), tensors, self._groupSize[base_name], True) + for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): + prefix_parts = [f'{self.TENSOR_NAMESPACE}{templateval}'] + if len(namespace) > 0: + prefix_parts += [namespace] + prefix_parts += [base_name_without_namespace, ''] + prefix = '::'.join(prefix_parts) + self._tensor(cpp, prefix, tensors, self._groupSize[base_name], True) def generateInitH(self, header): templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' - for namespace, tensor_dict in self.iterate_collect(): - with header.Namespace(namespace): - header.TemplateStructForward(self.INIT_NAMESPACE, self._template[0]) - with header.TemplateStruct(self.INIT_NAMESPACE, self._template[1]): + header.TemplateStructForward(self.INIT_NAMESPACE, self._template[0]) + with header.TemplateStruct(self.INIT_NAMESPACE, self._template[1]): + for namespace, tensor_dict in self.iterate_collect(): + with header.Struct(namespace): for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): - self._init(header, base_name, base_name_without_namespace, '', tensors, False) - for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): - self._init2(header, base_name, base_name_without_namespace, f'{self.INIT_NAMESPACE}{templateval}', tensors, False) + prefix_parts = [f'{self.TENSOR_NAMESPACE}{templateval}'] + if len(namespace) > 0: + prefix_parts += [namespace] + prefix = '::'.join(prefix_parts) + self._init(header, base_name, base_name_without_namespace, '', tensors, False, prefix=prefix) + for namespace, tensor_dict in self.iterate_collect(): + prefix_parts = [f'{self.INIT_NAMESPACE}{templateval}'] + if len(namespace) > 0: + prefix_parts += [namespace] + prefix = '::'.join(prefix_parts) + for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): + self._init2(header, base_name, base_name_without_namespace, prefix, tensors, False) def generateInitCpp(self, cpp): + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' for namespace, tensor_dict in self.iterate_collect(): for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): - prefix_parts = [] + prefix_parts = [f'{self.INIT_NAMESPACE}{templateval}'] if len(namespace) > 0: - prefix_parts.append(namespace) - templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' - prefix_parts += [f'{self.INIT_NAMESPACE}{templateval}', base_name_without_namespace, ''] + prefix_parts += [namespace] + prefix_parts += [base_name_without_namespace, ''] prefix = '::'.join(prefix_parts) self._init(cpp, base_name, base_name_without_namespace, prefix, tensors, True) self._init2(cpp, base_name, base_name_without_namespace, prefix, tensors, True) @@ -672,16 +685,16 @@ def _tensor(self, cpp, name, tensors, groupSize, declarationOnly): self._array(cpp, self._numberType, name + self.SHAPE_NAME, shape, groupSize, declarationOnly) self._array(cpp, self._numberType, name + self.SIZE_NAME, size, groupSize, declarationOnly, alwaysArray=False) - def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarationOnly): + def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarationOnly, prefix=''): groupSize = self._groupSize[baseName] stride = groupSizeToStride(groupSize) index = lambda group: str(address(group, stride)) if len(group) > 0 else '' if declarationOnly: - for group,tensor in tensors.items(): - ml = tensor.memoryLayout() - tv = self._tensorViewGenerator(ml) - tv.arrays(cpp, ml, self._arch, name, index(group), self._numberType, True) + # for group,tensor in tensors.items(): + # ml = tensor.memoryLayout() + # tv = self._tensorViewGenerator(ml) + # tv.arrays(cpp, ml, self._arch, name, index(group), self._numberType, True) valueNames = dict() for group,tensor in tensors.items(): values = tensor.values() @@ -697,7 +710,7 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat self._array(cpp, self._realPtrType, name + self.VALUES_BASENAME, valueNames, groupSize, alwaysArray=False, constexpr=False, static=False) else: templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' - with cpp.Struct('{0} : {1}::{0}'.format(baseNameWithoutNamespace, f'{self.TENSOR_NAMESPACE}{templateval}')): + with cpp.Struct('{0} : {1}::{0}'.format(baseNameWithoutNamespace, prefix)): for group,tensor in tensors.items(): ml = tensor.memoryLayout() tv = self._tensorViewGenerator(ml) diff --git a/yateto/generator.py b/yateto/generator.py index 041ee20..35bc8c3 100644 --- a/yateto/generator.py +++ b/yateto/generator.py @@ -254,8 +254,8 @@ def addFamily(self, prefetch = prefetchGenerator(*p) if prefetchGenerator is not None else None family.add(indexedName, ast, prefetch, namespace, target=target) - def _headerGuardName(self, namespace, fileBaseName): - partlist = namespace.upper().split('::') + [fileBaseName.upper(), self.HEADER_GUARD_SUFFIX] + def _headerGuardName(self, namespace, template, fileBaseName): + partlist = namespace.upper().split('::') + template[1].upper().split(',') + [fileBaseName.upper(), self.HEADER_GUARD_SUFFIX] return '_'.join(partlist) def generate(self, @@ -293,7 +293,7 @@ def unit_test_body(cpp, testFramework): with Cpp(fUTdoctest.cpp) as cpp: Doctest().generate(cpp, namespace, fKernels.hName, fInit.hName, unit_test_body) with Cpp(fUTcxxtest.h) as cpp: - with cpp.HeaderGuard(self._headerGuardName(namespace, self.CXXTEST_FILE_NAME.replace('.', '_'))): + with cpp.HeaderGuard(self._headerGuardName(namespace, template, self.CXXTEST_FILE_NAME.replace('.', '_'))): CxxTest().generate(cpp, namespace, fKernels.hName, fInit.hName, unit_test_body) @@ -310,16 +310,15 @@ def unit_test_body(cpp, testFramework): kernel_dict = {} for kernel in self._kernels: if kernel.namespace in kernel_dict: - kernel_dict[kernel.namespace].append(kernel) + kernel_dict[kernel.namespace].append(('kernel', kernel)) else: - kernel_dict[kernel.namespace] = [kernel] + kernel_dict[kernel.namespace] = [('kernel', kernel)] - kernel_family_dict = {} for family in self._kernelFamilies.values(): - if family.namespace in kernel_family_dict: - kernel_family_dict[family.namespace].append(family) + if family.namespace in kernel_dict: + kernel_dict[family.namespace].append(('family', family)) else: - kernel_family_dict[family.namespace] = [family] + kernel_dict[family.namespace] = [('family', family)] print('Generating kernels...') cache = RoutineCache() @@ -335,7 +334,7 @@ def unit_test_body(cpp, testFramework): cpp.include(fRoutines.hName) with Cpp(fKernels.h) as header: - with header.HeaderGuard(self._headerGuardName(namespace, self.KERNELS_FILE_NAME)): + with header.HeaderGuard(self._headerGuardName(namespace, template, self.KERNELS_FILE_NAME)): header.includeSys('cmath') header.includeSys('limits') header.include('yateto.h') @@ -344,28 +343,28 @@ def unit_test_body(cpp, testFramework): with cpp.Namespace(namespace), header.Namespace(namespace): header.TemplateStructForward(optKernelGenerator.NAMESPACE, template[0]) with header.TemplateStruct(optKernelGenerator.NAMESPACE, template[1]): - # Group kernels by namespace - for kernel_namespace, kernels in kernel_dict.items(): - for kernel in kernels: - kernelOutline = optKernelGenerator.generateKernelOutline(kernel.nonZeroFlops, - kernel.cfg, - gemm_cfg, - kernel.target) - with cpp.Namespace(kernel_namespace), header.Namespace(kernel_namespace): - optKernelGenerator.generate(cpp, header, kernel.name, [kernelOutline]) - - # Group families by namespace - for family_namespace, families in kernel_family_dict.items(): - for family in families: - kernelOutlines = [None] * len(family) - for group, kernel in family.items(): - kernelOutlines[group] = optKernelGenerator.generateKernelOutline(kernel.nonZeroFlops, - kernel.cfg, - gemm_cfg, - kernel.target) - - with cpp.Namespace(family_namespace), header.Namespace(family_namespace): - optKernelGenerator.generate(cpp, header, family.name, kernelOutlines, family.stride()) + # Group kernels and families by namespace + for kernel_namespace, kernelsfamilies in kernel_dict.items(): + with header.Struct(kernel_namespace): + for ktype, kernel in kernelsfamilies: + if ktype == 'kernel': + kernelOutline = optKernelGenerator.generateKernelOutline(kernel.nonZeroFlops, + kernel.cfg, + gemm_cfg, + kernel.target) + + optKernelGenerator.generate(cpp, header, kernel.name, [kernelOutline], kernel_namespace) + else: + family = kernel + kernelOutlines = [None] * len(family) + for group, kernel in family.items(): + kernelOutlines[group] = optKernelGenerator.generateKernelOutline(kernel.nonZeroFlops, + kernel.cfg, + gemm_cfg, + kernel.target) + + optKernelGenerator.generate(cpp, header, family.name, kernelOutlines, kernel_namespace, family.stride()) + kernelSourceContent = kernelSource.getvalue() with Cpp(fKernels.cpp) as cpp: @@ -378,7 +377,7 @@ def unit_test_body(cpp, testFramework): print('Calling external code generators...') with Cpp(fRoutines.h) as header: - with header.HeaderGuard(self._headerGuardName(namespace, self.ROUTINES_FILE_NAME)): + with header.HeaderGuard(self._headerGuardName(namespace, template, self.ROUTINES_FILE_NAME)): cache.generate(header, fRoutines.cpp, fGpulikeRoutines.cpp) # Mapping basename -> tensor @@ -403,7 +402,7 @@ def unit_test_body(cpp, testFramework): sort_key = lambda x: (x.namespace, x.name()) initGen = InitializerGenerator(self._arch, sorted(tensors.values(), key=sort_key), template) with Cpp(fTensors.h) as header: - with header.HeaderGuard(self._headerGuardName(namespace, self.TENSORS_FILE_NAME)): + with header.HeaderGuard(self._headerGuardName(namespace, template, self.TENSORS_FILE_NAME)): with header.Namespace(namespace): initGen.generateTensorsH(header) with Cpp(fTensors.cpp) as cpp: @@ -411,7 +410,7 @@ def unit_test_body(cpp, testFramework): with cpp.Namespace(namespace): initGen.generateTensorsCpp(cpp) with Cpp(fInit.h) as header: - with header.HeaderGuard(self._headerGuardName(namespace, self.INIT_FILE_NAME)): + with header.HeaderGuard(self._headerGuardName(namespace, template, self.INIT_FILE_NAME)): header.include(fTensors.hName) header.include(self.SUPPORT_LIBRARY_HEADER) with header.Namespace(namespace): From 03f16874adfc2127fe2ce84f1732b3734d1abb3b Mon Sep 17 00:00:00 2001 From: David Schneller Date: Mon, 18 Sep 2023 14:11:42 +0200 Subject: [PATCH 3/4] Fix template non-existence bug --- yateto/generator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yateto/generator.py b/yateto/generator.py index 6a21101..d9a48d5 100644 --- a/yateto/generator.py +++ b/yateto/generator.py @@ -255,7 +255,11 @@ def addFamily(self, family.add(indexedName, ast, prefetch, namespace, target=target) def _headerGuardName(self, namespace, template, fileBaseName): - partlist = namespace.upper().split('::') + template[1].upper().split(',') + [fileBaseName.upper(), self.HEADER_GUARD_SUFFIX] + if template[1] is None: + templateparts = [] + else: + templateparts = template[1].upper().split(',') + partlist = namespace.upper().split('::') + templateparts + [fileBaseName.upper(), self.HEADER_GUARD_SUFFIX] return '_'.join(partlist) def generate(self, From 321ac6db48665a533b18d4a7c2a757a465d2c530 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Mon, 18 Sep 2023 14:21:13 +0200 Subject: [PATCH 4/4] Fix double tensor namespace --- yateto/codegen/visitor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py index 737d047..72b9f7b 100644 --- a/yateto/codegen/visitor.py +++ b/yateto/codegen/visitor.py @@ -705,7 +705,6 @@ def generateTensorsH(self, header): header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True): header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) - with header.TemplateStruct(self.TENSOR_NAMESPACE, self._template[1]): for namespace, scalar_dict in self.iterate_collect_scalar(): with header.Struct(namespace): for (baseName, baseNameWithoutNamespace), scalars in scalar_dict.items():