diff --git a/yateto/codegen/code.py b/yateto/codegen/code.py index da6f93c..d0ac242 100644 --- a/yateto/codegen/code.py +++ b/yateto/codegen/code.py @@ -163,12 +163,26 @@ def Class(self, name): def classDeclaration(self, name): return self.__call__('class {};'.format(name)) + + def TemplateStructForward(self, name, params): + if params is None: + return self.__call__(f'struct {name};') + else: + return self.__call__(f'template<{params}> struct {name};') + + def TemplateStruct(self, name, values): + if values is None: + return Block(self, f'struct {name}', foot=';') + else: + return Block(self, f'template<> struct {name} <{values}>', foot=';') def forwardStruct(self, name): self.__call__('struct {};'.format(name)) def Struct(self, name): - return Block(self, 'struct ' + name, foot=';') + if len(name) == 0: + return NoScope() + return Block(self, 'struct ' + name, foot='; // struct ' + name) def HeaderGuard(self, name): return HeaderGuard(self, name) diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py index 851b20f..72b9f7b 100644 --- a/yateto/codegen/visitor.py +++ b/yateto/codegen/visitor.py @@ -48,8 +48,9 @@ class KernelGenerator(object): PREFETCHVAR_NAME = '_prefetch' BUFFER_NAME = '_buffer' - def __init__(self, arch): + def __init__(self, arch, template): self._arch = arch + self._template = template @classmethod def _bufferName(cls, buf): @@ -108,8 +109,8 @@ class OptimisedKernelGenerator(KernelGenerator): TEMP_MAX_MEM_REQUIRED_NAME = 'TmpMaxMemRequiredInBytes' - def __init__(self, arch, routineCache): - super().__init__(arch) + def __init__(self, arch, routineCache, template): + super().__init__(arch, template) self._routineCache = routineCache class KernelOutline(object): @@ -203,7 +204,8 @@ def _addFromKO(cls, koEntries, entries): entries[key] = entries[key] | value - def generate(self, cpp, header, name, kernelOutlines, familyStride=None): + def generate(self, cpp, header, name, kernelOutlines, namespace, familyStride=None): + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' tensors = collections.OrderedDict() prefetch = collections.OrderedDict() writable = dict() @@ -235,152 +237,152 @@ def generate(self, cpp, header, name, kernelOutlines, familyStride=None): formatArray = lambda lst: lst[0] brackets = '' - with header.Namespace(self.NAMESPACE): - with header.Struct(name): - header('{} {} const {}{} = {};'.format( - MODIFIERS, - self._arch.ulongTypename, - self.NONZEROFLOPS_NAME, - brackets, - formatArray([kernelOutline.nonZeroFlops if kernelOutline else 0 for kernelOutline in kernelOutlines]) - )) - header('{} {} const {}{} = {};'.format( - MODIFIERS, - self._arch.ulongTypename, - self.HARDWAREFLOPS_NAME, - brackets, - formatArray([kernelOutline.hwFlops if kernelOutline else 0 for kernelOutline in kernelOutlines]) - )) - - # tmp mem required by a kernel(s) - tmp_mem_list = [kernelOutline.tmp_mem_size if kernelOutline else 0 for kernelOutline in kernelOutlines] - header('{} {} const {}{} = {};'.format(MODIFIERS, - self._arch.ulongTypename, - self.TEMP_MEM_REQUIRED_NAME, - brackets, - formatArray(tmp_mem_list))) - - header('{} {} const {} = {};'.format(MODIFIERS, - self._arch.ulongTypename, - self.TEMP_MAX_MEM_REQUIRED_NAME, - max(tmp_mem_list))) - - if target == 'gpu': - # LinearAllocatorT controls external extra mem. allocated on gpu for tmp. variables - header(f'yateto::LinearAllocatorT<{self._arch.typename}> linearAllocator;') - - header.emptyline() + with header.Struct(name): + header('{} {} const {}{} = {};'.format( + MODIFIERS, + self._arch.ulongTypename, + self.NONZEROFLOPS_NAME, + brackets, + formatArray([kernelOutline.nonZeroFlops if kernelOutline else 0 for kernelOutline in kernelOutlines]) + )) + header('{} {} const {}{} = {};'.format( + MODIFIERS, + self._arch.ulongTypename, + self.HARDWAREFLOPS_NAME, + brackets, + formatArray([kernelOutline.hwFlops if kernelOutline else 0 for kernelOutline in kernelOutlines]) + )) - def kernelArgs(base_name_with_namespace, groups, writable, is_constant, target): - prefix, base_name = Tensor.splitBasename(base_name_with_namespace) - typ = self._arch.typename - ptr_type = '**' if not is_constant and target == 'gpu' else '*' - if not writable: - typ += ' const' - if len(next(iter(groups))) > 0: - class_name = f'{prefix}{InitializerGenerator.TENSOR_NAMESPACE}::{base_name}' - container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{typ}{ptr_type}>' - header(f'{class_name}::{container_type} {base_name};') - else: - header(f'{typ}{ptr_type} {base_name}{{}};') - - def scalarArgs(base_name_with_namespace, groups): + # tmp mem required by a kernel(s) + tmp_mem_list = [kernelOutline.tmp_mem_size if kernelOutline else 0 for kernelOutline in kernelOutlines] + header('{} {} const {}{} = {};'.format(MODIFIERS, + self._arch.ulongTypename, + self.TEMP_MEM_REQUIRED_NAME, + brackets, + formatArray(tmp_mem_list))) + + header('{} {} const {} = {};'.format(MODIFIERS, + self._arch.ulongTypename, + self.TEMP_MAX_MEM_REQUIRED_NAME, + max(tmp_mem_list))) + + if target == 'gpu': + # LinearAllocatorT controls external extra mem. allocated on gpu for tmp. variables + header(f'yateto::LinearAllocatorT<{self._arch.typename}> linearAllocator;') + + header.emptyline() + + def kernelArgs(base_name_with_namespace, groups, writable, is_constant, target): + prefix, base_name = Tensor.splitBasename(base_name_with_namespace) + typ = self._arch.typename + ptr_type = '**' if not is_constant and target == 'gpu' else '*' + if not writable: + typ += ' const' + if len(next(iter(groups))) > 0: + class_name = f'{InitializerGenerator.TENSOR_NAMESPACE}{templateval}::{base_name_with_namespace}' + container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{typ}{ptr_type}>' + header(f'{class_name}::{container_type} {base_name};') + else: + header(f'{typ}{ptr_type} {base_name}{{}};') + def scalarArgs(base_name_with_namespace, groups): + prefix, base_name = Tensor.splitBasename(base_name_with_namespace) + typ = self._arch.typename + if len(next(iter(groups))) > 0: + class_name = f'{prefix}{InitializerGenerator.TENSOR_NAMESPACE}{templateval}::{base_name}' + container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{typ}>' + header(f'{class_name}::{container_type} {base_name};') + else: + header(f'{typ} {base_name} = std::numeric_limits<{typ}>::signaling_NaN();') + + for baseName, groups in scalars.items(): + scalarArgs(baseName, + groups) + for baseName, groups in tensors.items(): + kernelArgs(baseName, + groups, + writable[baseName], + is_compute_constant_tensors[baseName], + target) + header.emptyline() + + # containers with extra offsets for GPU-like computations + if target == 'gpu': + header(f'unsigned {BatchedOperationsAux.NUM_ELEMENTS_NAME} = 0;') + header(f'void *{BatchedOperationsAux.STREAM_PTR_NAME} = {BatchedOperationsAux.FORBIDDEN_STREAM_PTR};') + header(f'unsigned *{BatchedOperationsAux.FLAGS_NAME} = nullptr;') + + def generate_extra_offset_args(base_name_with_namespace, groups): prefix, base_name = Tensor.splitBasename(base_name_with_namespace) - typ = self._arch.typename + offset_type = 'int' + offset_name = f'{BatchedOperationsAux.EXTRA_OFFSET_NAME}_{base_name}' if len(next(iter(groups))) > 0: - class_name = f'{prefix}{InitializerGenerator.TENSOR_NAMESPACE}::{base_name}' - container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{typ}>' - header(f'{class_name}::{container_type} {base_name};') + class_name = f'{InitializerGenerator.TENSOR_NAMESPACE}{templateval}::{base_name_with_namespace}' + container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{offset_type}>' + header(f'{class_name}::{container_type} {offset_name};') else: - header(f'{typ} {base_name} = std::numeric_limits<{typ}>::signaling_NaN();') + header(f'{offset_type} {offset_name}{{}};') - for baseName, groups in scalars.items(): - scalarArgs(baseName, - groups) - for baseName, groups in tensors.items(): - kernelArgs(baseName, - groups, - writable[baseName], - is_compute_constant_tensors[baseName], - target) - header.emptyline() + for base_name, groups in tensors.items(): + generate_extra_offset_args(base_name, groups) + header.emptyline() - # containers with extra offsets for GPU-like computations - if target == 'gpu': - header(f'unsigned {BatchedOperationsAux.NUM_ELEMENTS_NAME} = 0;') - header(f'void *{BatchedOperationsAux.STREAM_PTR_NAME} = {BatchedOperationsAux.FORBIDDEN_STREAM_PTR};') - header(f'unsigned *{BatchedOperationsAux.FLAGS_NAME} = nullptr;') - - def generate_extra_offset_args(base_name_with_namespace, groups): - prefix, base_name = Tensor.splitBasename(base_name_with_namespace) - offset_type = 'int' - offset_name = f'{BatchedOperationsAux.EXTRA_OFFSET_NAME}_{base_name}' - if len(next(iter(groups))) > 0: - class_name = f'{prefix}{InitializerGenerator.TENSOR_NAMESPACE}::{base_name}' - container_type = f'{InitializerGenerator.CONTAINER_CLASS_NAME}<{offset_type}>' - header(f'{class_name}::{container_type} {offset_name};') - else: - header(f'{offset_type} {offset_name}{{}};') - - for base_name, groups in tensors.items(): - generate_extra_offset_args(base_name, groups) + if len(prefetch) > 0: + with header.Struct(self.PREFETCHSTRUCT_NAME): + for baseName, groups in prefetch.items(): + kernelArgs(baseName, groups, writable=False, is_constant=False, target='any') + header('{} {};'.format(self.PREFETCHSTRUCT_NAME, self.PREFETCHVAR_NAME)) header.emptyline() - if len(prefetch) > 0: - with header.Struct(self.PREFETCHSTRUCT_NAME): - for baseName, groups in prefetch.items(): - kernelArgs(baseName, groups, writable=False, is_constant=False, target='any') - header('{} {};'.format(self.PREFETCHSTRUCT_NAME, self.PREFETCHVAR_NAME)) - header.emptyline() - - for index, kernelOutline in enumerate(kernelOutlines): - if kernelOutline: - header.functionDeclaration(executeName(index)) - - if familyStride is not None: - header('using {} = void ({}::*)();'.format(self.MEMBER_FUNCTION_PTR_NAME, name)) - header('{} {} {}[] = {};'.format( - MODIFIERS, - self.MEMBER_FUNCTION_PTR_NAME, - self.EXECUTE_ARRAY_NAME, - formatArray(['&{}::{}'.format(name, executeName(index)) if kernelOutline else 'nullptr' for index, kernelOutline in enumerate(kernelOutlines)]) - )) - args = typedNdArgs(len(familyStride), self._arch.uintTypename) - indexF = indexFun(familyStride) - with header.Function(self.FIND_EXECUTE_NAME, args, '{} {}'.format(MODIFIERS, self.MEMBER_FUNCTION_PTR_NAME)): - header('return {}[{}];'.format(self.EXECUTE_ARRAY_NAME, indexF)) - with header.Function(self.EXECUTE_NAME, args, '{} void'.format(INLINE)): - header('(this->*{}({}))();'.format(self.FIND_EXECUTE_NAME, ', '.join(ndargs(len(familyStride))))) - - aux_functions = [self.NONZEROFLOPS_NAME, self.HARDWAREFLOPS_NAME, self.TEMP_MEM_REQUIRED_NAME] - for function in aux_functions: - funName = function[:1].lower() + function[1:] - with header.Function(funName, args, '{} {}'.format(MODIFIERS, self._arch.ulongTypename)): - header('return {}[{}];'.format(function, indexF)) + for index, kernelOutline in enumerate(kernelOutlines): + if kernelOutline: + header.functionDeclaration(executeName(index)) + if familyStride is not None: + header('using {} = void ({}::*)();'.format(self.MEMBER_FUNCTION_PTR_NAME, name)) + header('{} {} {}[] = {};'.format( + MODIFIERS, + self.MEMBER_FUNCTION_PTR_NAME, + self.EXECUTE_ARRAY_NAME, + formatArray(['&{}::{}'.format(name, executeName(index)) if kernelOutline else 'nullptr' for index, kernelOutline in enumerate(kernelOutlines)]) + )) + args = typedNdArgs(len(familyStride), self._arch.uintTypename) + indexF = indexFun(familyStride) + with header.Function(self.FIND_EXECUTE_NAME, args, '{} {}'.format(MODIFIERS, self.MEMBER_FUNCTION_PTR_NAME)): + header('return {}[{}];'.format(self.EXECUTE_ARRAY_NAME, indexF)) + with header.Function(self.EXECUTE_NAME, args, '{} void'.format(INLINE)): + header('(this->*{}({}))();'.format(self.FIND_EXECUTE_NAME, ', '.join(ndargs(len(familyStride))))) + + aux_functions = [self.NONZEROFLOPS_NAME, self.HARDWAREFLOPS_NAME, self.TEMP_MEM_REQUIRED_NAME] + for function in aux_functions: + funName = function[:1].lower() + function[1:] + with header.Function(funName, args, '{} {}'.format(MODIFIERS, self._arch.ulongTypename)): + header('return {}[{}];'.format(function, indexF)) + + namespaceval = '' if namespace == '' or namespace is None else f'{namespace}::' flopCounters = [self.NONZEROFLOPS_NAME, self.HARDWAREFLOPS_NAME] - for fc in flopCounters: - cpp('{} {} const {}::{}::{}{};'.format( - CONSTEXPR, - self._arch.ulongTypename, - self.NAMESPACE, - name, - fc, - brackets - )) - if familyStride is not None: - cpp('{0} {1}::{2}::{3} {1}::{2}::{4}[];'.format( - CONSTEXPR, - self.NAMESPACE, - name, - self.MEMBER_FUNCTION_PTR_NAME, - self.EXECUTE_ARRAY_NAME - )) + #for fc in flopCounters: + # header('{} {} static const {}{}{};'.format( + # CONSTEXPR, + # self._arch.ulongTypename, + # name, + # fc, + # brackets + # )) + #if familyStride is not None: + # cpp('{0} {1}{2}::{3}{4}::{5} {1}{2}::{3}{4}::{6}[];'.format( + # CONSTEXPR, + # self.NAMESPACE, + # templateval, + # namespaceval, + # name, + # self.MEMBER_FUNCTION_PTR_NAME, + # self.EXECUTE_ARRAY_NAME + # )) for index, kernelOutline in enumerate(kernelOutlines): if kernelOutline is None: continue - with cpp.Function('{}::{}::{}'.format(self.NAMESPACE, name, executeName(index))): + with cpp.Function('{}{}::{}{}::{}'.format(self.NAMESPACE, templateval, namespaceval, name, executeName(index))): for base_name_with_namespace, groups in kernelOutline.scalars.items(): base_name = Tensor.splitBasename(base_name_with_namespace)[-1] if len(next(iter(groups))) > 0: @@ -405,8 +407,8 @@ def generate_extra_offset_args(base_name_with_namespace, groups): class UnitTestGenerator(KernelGenerator): KERNEL_VAR = 'krnl' - def __init__(self, arch): - super().__init__(arch) + def __init__(self, arch, template): + super().__init__(arch, template) def deduce_single_scalar(self, scalar): if scalar is None: @@ -461,6 +463,7 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, gemm_cfg, testFra scalars = ScalarsSet().visit(cfg) scalars = sorted(scalars, key=str) variables = SortedGlobalsList().visit(cfg) + templateparam = '' if self._template[1] is None else '<' + self._template[1] + '>' kernel_prefix = '{}::'.format(namespace) if namespace else '' with cpp.Function(**testFramework.functionArgs(testName)): factory = UnitTestFactory(cpp, self._arch, self._name, testFramework) @@ -484,8 +487,9 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, gemm_cfg, testFra ) ) prefix = '{}::'.format(var.tensor.namespace) if var.tensor.namespace else '' - cpp( '{prefix}{initNS}::{baseName}::{viewStruct}{groupTemplate}::{createFun}({name}).copyToView({viewName});'.format( + cpp( '{prefix}{initNS}{template}::{baseName}::{viewStruct}{groupTemplate}::{createFun}({name}).copyToView({viewName});'.format( initNS = InitializerGenerator.INIT_NAMESPACE, + template = templateparam, supportNS = SUPPORT_LIBRARY_NAMESPACE, groupTemplate=self._groupTemplate(var.tensor), prefix=prefix, @@ -498,7 +502,7 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, gemm_cfg, testFra ) cpp.emptyline() - cpp( '{}{}::{} {};'.format(kernel_prefix, OptimisedKernelGenerator.NAMESPACE, kernelClass, self.KERNEL_VAR) ) + cpp( '{}{}{}::{} {};'.format(kernel_prefix, OptimisedKernelGenerator.NAMESPACE, templateparam, kernelClass, self.KERNEL_VAR) ) for var in scalars: cpp( '{}.{}{} = {};'.format(self.KERNEL_VAR, var.baseName(), self._groupIndex(var), self._tensorNameS(var)) ) for var in variables: @@ -589,8 +593,9 @@ def arrays(self, cpp, memLayout, arch, namespace, index, numberType, declaration cpp(self.formatArray(numberType, namespace + self.ROWIND_NAME + index, memLayout.rowIndex(), declarationOnly)) cpp(self.formatArray(numberType, namespace + self.COLPTR_NAME + index, memLayout.colPointer(), declarationOnly)) - def __init__(self, arch, tensors, scalars): + def __init__(self, arch, tensors, scalars, template): self._arch = arch + self._template = template self._numberType = '{} const'.format(self._arch.uintTypename) self._realType = '{} const'.format(self._arch.typename) self._realPtrType = self._realType + '*' @@ -672,74 +677,97 @@ def iterate_collect_scalar(self): yield cur_namespace, cur_dict def generateTensorsH(self, header): - for namespace, tensor_dict in self.iterate_collect(): - with header.Namespace(namespace), header.Namespace(self.TENSOR_NAMESPACE): - for (baseName, baseNameWithoutNamespace), tensors in tensor_dict.items(): - with header.Struct(baseNameWithoutNamespace): - groupSize = self._groupSize[baseName] - self._tensor(header, '', tensors, groupSize, False) - args = ndargs(len(groupSize)) - typedArgs = typedNdArgs(len(groupSize), self._arch.uintTypename) - returnType = '{} {}'.format(MODIFIERS, self._arch.uintTypename) - if len(groupSize) > 0: - with header.Function(self.INDEX_FUN_NAME, typedArgs, returnType): - header('return {};'.format(indexFun(groupSizeToStride(groupSize)))) - with header.Function(self.SIZE_FUN_NAME, typedArgs, returnType): - if len(groupSize) == 0: - header('return {};'.format(self.SIZE_NAME)) - else: - header('return {}[{}({})];'.format(self.SIZE_NAME, self.INDEX_FUN_NAME, ', '.join(args))) - if len(groupSize) > 0: - header('template') - with header.Struct(self.CONTAINER_CLASS_NAME): - header('T {}[{}];'.format(self.CONTAINER_DATA_NAME, reduce(operator.mul, groupSize))) - header('{}() : {}{{}} {{}}'.format(self.CONTAINER_CLASS_NAME, self.CONTAINER_DATA_NAME)) - with header.Function('operator()', typedArgs, '{} T&'.format(INLINE)): - header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) - with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True): - header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) - for namespace, scalar_dict in self.iterate_collect_scalar(): - with header.Namespace(namespace), header.Namespace(self.TENSOR_NAMESPACE): - for (baseName, baseNameWithoutNamespace), scalars in scalar_dict.items(): - with header.Struct(baseNameWithoutNamespace): - groupSize = self._groupSizeScalar[baseName] - args = ndargs(len(groupSize)) - typedArgs = typedNdArgs(len(groupSize), self._arch.uintTypename) - if len(groupSize) > 0: - with header.Function(self.INDEX_FUN_NAME, typedArgs, returnType): - header('return {};'.format(indexFun(groupSizeToStride(groupSize)))) - if len(groupSize) > 0: - header('template') - with header.Struct(self.CONTAINER_CLASS_NAME): - header('T {}[{}];'.format(self.CONTAINER_DATA_NAME, reduce(operator.mul, groupSize))) - with header.Function(self.CONTAINER_CLASS_NAME, '', ''): - pass - with header.Function('operator()', typedArgs, '{} T&'.format(INLINE)): - header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) - with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True): - header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) + header.TemplateStructForward(self.TENSOR_NAMESPACE, self._template[0]) + with header.TemplateStruct(self.TENSOR_NAMESPACE, self._template[1]): + for namespace, tensor_dict in self.iterate_collect(): + with header.Struct(namespace): + for (baseName, baseNameWithoutNamespace), tensors in tensor_dict.items(): + with header.Struct(baseNameWithoutNamespace): + groupSize = self._groupSize[baseName] + self._tensor(header, '', tensors, groupSize, False) + args = ndargs(len(groupSize)) + typedArgs = typedNdArgs(len(groupSize), self._arch.uintTypename) + returnType = '{} {}'.format(MODIFIERS, self._arch.uintTypename) + if len(groupSize) > 0: + with header.Function(self.INDEX_FUN_NAME, typedArgs, returnType): + header('return {};'.format(indexFun(groupSizeToStride(groupSize)))) + with header.Function(self.SIZE_FUN_NAME, typedArgs, returnType): + if len(groupSize) == 0: + header('return {};'.format(self.SIZE_NAME)) + else: + header('return {}[{}({})];'.format(self.SIZE_NAME, self.INDEX_FUN_NAME, ', '.join(args))) + if len(groupSize) > 0: + header('template') + with header.Struct(self.CONTAINER_CLASS_NAME): + header('T {}[{}];'.format(self.CONTAINER_DATA_NAME, reduce(operator.mul, groupSize))) + header('{}() : {}{{}} {{}}'.format(self.CONTAINER_CLASS_NAME, self.CONTAINER_DATA_NAME)) + with header.Function('operator()', typedArgs, '{} T&'.format(INLINE)): + header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) + with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True): + header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) + for namespace, scalar_dict in self.iterate_collect_scalar(): + with header.Struct(namespace): + for (baseName, baseNameWithoutNamespace), scalars in scalar_dict.items(): + with header.Struct(baseNameWithoutNamespace): + groupSize = self._groupSizeScalar[baseName] + args = ndargs(len(groupSize)) + typedArgs = typedNdArgs(len(groupSize), self._arch.uintTypename) + if len(groupSize) > 0: + with header.Function(self.INDEX_FUN_NAME, typedArgs, returnType): + header('return {};'.format(indexFun(groupSizeToStride(groupSize)))) + if len(groupSize) > 0: + header('template') + with header.Struct(self.CONTAINER_CLASS_NAME): + header('T {}[{}];'.format(self.CONTAINER_DATA_NAME, reduce(operator.mul, groupSize))) + with header.Function(self.CONTAINER_CLASS_NAME, '', ''): + pass + with header.Function('operator()', typedArgs, '{} T&'.format(INLINE)): + header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) + with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True): + header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) def generateTensorsCpp(self, cpp): + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' for namespace, tensor_dict in self.iterate_collect(): - with cpp.Namespace(namespace): - for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): - self._tensor(cpp, '::'.join([self.TENSOR_NAMESPACE, base_name_without_namespace, '']), tensors, self._groupSize[base_name], True) + for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): + prefix_parts = [f'{self.TENSOR_NAMESPACE}{templateval}'] + if len(namespace) > 0: + prefix_parts += [namespace] + prefix_parts += [base_name_without_namespace, ''] + prefix = '::'.join(prefix_parts) + self._tensor(cpp, prefix, tensors, self._groupSize[base_name], True) def generateInitH(self, header): + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' + header.TemplateStructForward(self.INIT_NAMESPACE, self._template[0]) + with header.TemplateStruct(self.INIT_NAMESPACE, self._template[1]): + for namespace, tensor_dict in self.iterate_collect(): + with header.Struct(namespace): + for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): + prefix_parts = [f'{self.TENSOR_NAMESPACE}{templateval}'] + if len(namespace) > 0: + prefix_parts += [namespace] + prefix = '::'.join(prefix_parts) + self._init(header, base_name, base_name_without_namespace, '', tensors, False, prefix=prefix) for namespace, tensor_dict in self.iterate_collect(): - with header.Namespace(namespace), header.Namespace(self.INIT_NAMESPACE): - for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): - self._init(header, base_name, base_name_without_namespace, '', tensors, False) + prefix_parts = [f'{self.INIT_NAMESPACE}{templateval}'] + if len(namespace) > 0: + prefix_parts += [namespace] + prefix = '::'.join(prefix_parts) + for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): + self._init2(header, base_name, base_name_without_namespace, prefix, tensors, False) def generateInitCpp(self, cpp): + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' for namespace, tensor_dict in self.iterate_collect(): for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): - prefix_parts = [] + prefix_parts = [f'{self.INIT_NAMESPACE}{templateval}'] if len(namespace) > 0: - prefix_parts.append(namespace) - prefix_parts += [self.INIT_NAMESPACE, base_name_without_namespace, ''] + prefix_parts += [namespace] + prefix_parts += [base_name_without_namespace, ''] prefix = '::'.join(prefix_parts) self._init(cpp, base_name, base_name_without_namespace, prefix, tensors, True) + self._init2(cpp, base_name, base_name_without_namespace, prefix, tensors, True) def _tensor(self, cpp, name, tensors, groupSize, declarationOnly): shape = {group: tensor.shape() for group,tensor in tensors.items()} @@ -747,16 +775,16 @@ def _tensor(self, cpp, name, tensors, groupSize, declarationOnly): self._array(cpp, self._numberType, name + self.SHAPE_NAME, shape, groupSize, declarationOnly) self._array(cpp, self._numberType, name + self.SIZE_NAME, size, groupSize, declarationOnly, alwaysArray=False) - def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarationOnly): + def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarationOnly, prefix=''): groupSize = self._groupSize[baseName] stride = groupSizeToStride(groupSize) index = lambda group: str(address(group, stride)) if len(group) > 0 else '' if declarationOnly: - for group,tensor in tensors.items(): - ml = tensor.memoryLayout() - tv = self._tensorViewGenerator(ml) - tv.arrays(cpp, ml, self._arch, name, index(group), self._numberType, True) + # for group,tensor in tensors.items(): + # ml = tensor.memoryLayout() + # tv = self._tensorViewGenerator(ml) + # tv.arrays(cpp, ml, self._arch, name, index(group), self._numberType, True) valueNames = dict() for group,tensor in tensors.items(): values = tensor.values() @@ -771,7 +799,8 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat if len(valueNames) > 1: self._array(cpp, self._realPtrType, name + self.VALUES_BASENAME, valueNames, groupSize, alwaysArray=False, constexpr=False, static=False) else: - with cpp.Struct('{0} : {1}::{0}'.format(baseNameWithoutNamespace, self.TENSOR_NAMESPACE)): + templateval = '' if self._template[1] is None else '<' + self._template[1] + '>' + with cpp.Struct('{0} : {1}::{0}'.format(baseNameWithoutNamespace, prefix)): for group,tensor in tensors.items(): ml = tensor.memoryLayout() tv = self._tensorViewGenerator(ml) @@ -803,6 +832,13 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat typedArgs = typedNdArgs(len(groupSize), self._arch.uintTypename) cpp('template<{}> struct {} {{}};'.format(typedArgs, self.VIEW_STRUCT_NAME)) + def _init2(self, cpp, baseName, baseNameWithoutNamespace, prefix, tensors, declarationOnly): + groupSize = self._groupSize[baseName] + stride = groupSizeToStride(groupSize) + index = lambda group: str(address(group, stride)) if len(group) > 0 else '' + viewArgs = self.TensorView.arguments(self._arch) + + if not declarationOnly: if len(groupSize) > 0: for group,tensor in tensors.items(): ml = tensor.memoryLayout() @@ -810,7 +846,7 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat typename = tv.typename(len(ml.shape()), self._arch) special = ','.join(str(g) for g in group) cpp('template<>') - with cpp.Struct('{}::{}<{}>'.format(baseNameWithoutNamespace, self.VIEW_STRUCT_NAME, special)): + with cpp.Struct('{}::{}::{}<{}>'.format(prefix, baseNameWithoutNamespace, self.VIEW_STRUCT_NAME, special)): cpp('typedef {} {};'.format(typename, self.VIEW_TYPE_NAME)) with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgs, returnType='{} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME)): tv.generate(cpp, ml, self._arch, index(group)) diff --git a/yateto/generator.py b/yateto/generator.py index 3b9d47d..d9a48d5 100644 --- a/yateto/generator.py +++ b/yateto/generator.py @@ -254,8 +254,12 @@ def addFamily(self, prefetch = prefetchGenerator(*p) if prefetchGenerator is not None else None family.add(indexedName, ast, prefetch, namespace, target=target) - def _headerGuardName(self, namespace, fileBaseName): - partlist = namespace.upper().split('::') + [fileBaseName.upper(), self.HEADER_GUARD_SUFFIX] + def _headerGuardName(self, namespace, template, fileBaseName): + if template[1] is None: + templateparts = [] + else: + templateparts = template[1].upper().split(',') + partlist = namespace.upper().split('::') + templateparts + [fileBaseName.upper(), self.HEADER_GUARD_SUFFIX] return '_'.join(partlist) def generate(self, @@ -263,7 +267,8 @@ def generate(self, namespace='yateto', gemm_cfg: GeneratorCollection = None, cost_estimator=BoundingBoxCostEstimator, - include_tensors=set()): + include_tensors=set(), + template=(None, None)): if not gemm_cfg: gemm_cfg = DefaultGeneratorCollection(self._arch) @@ -285,14 +290,14 @@ def generate(self, print('Generating unit tests...') def unit_test_body(cpp, testFramework): for kernel in self._kernels: - UnitTestGenerator(self._arch).generate(cpp, kernel.namespace, kernel.name, kernel.name, kernel.cfg, gemm_cfg, testFramework) + UnitTestGenerator(self._arch, template).generate(cpp, kernel.namespace, kernel.name, kernel.name, kernel.cfg, gemm_cfg, testFramework) for family in self._kernelFamilies.values(): for group, kernel in family.items(): - UnitTestGenerator(self._arch).generate(cpp, kernel.namespace, kernel.name, family.name, kernel.cfg, gemm_cfg, testFramework, group) + UnitTestGenerator(self._arch, template).generate(cpp, kernel.namespace, kernel.name, family.name, kernel.cfg, gemm_cfg, testFramework, group) with Cpp(fUTdoctest.cpp) as cpp: Doctest().generate(cpp, namespace, fKernels.hName, fInit.hName, unit_test_body) with Cpp(fUTcxxtest.h) as cpp: - with cpp.HeaderGuard(self._headerGuardName(namespace, self.CXXTEST_FILE_NAME.replace('.', '_'))): + with cpp.HeaderGuard(self._headerGuardName(namespace, template, self.CXXTEST_FILE_NAME.replace('.', '_'))): CxxTest().generate(cpp, namespace, fKernels.hName, fInit.hName, unit_test_body) @@ -308,20 +313,19 @@ def unit_test_body(cpp, testFramework): kernel_dict = {} for kernel in self._kernels: if kernel.namespace in kernel_dict: - kernel_dict[kernel.namespace].append(kernel) + kernel_dict[kernel.namespace].append(('kernel', kernel)) else: - kernel_dict[kernel.namespace] = [kernel] + kernel_dict[kernel.namespace] = [('kernel', kernel)] - kernel_family_dict = {} for family in self._kernelFamilies.values(): - if family.namespace in kernel_family_dict: - kernel_family_dict[family.namespace].append(family) + if family.namespace in kernel_dict: + kernel_dict[family.namespace].append(('family', family)) else: - kernel_family_dict[family.namespace] = [family] + kernel_dict[family.namespace] = [('family', family)] print('Generating kernels...') cache = RoutineCache() - optKernelGenerator = OptimisedKernelGenerator(self._arch, cache) + optKernelGenerator = OptimisedKernelGenerator(self._arch, cache, template) kernelSource = StringIO() kernelSourceContent = '' @@ -333,35 +337,37 @@ def unit_test_body(cpp, testFramework): cpp.include(fRoutines.hName) with Cpp(fKernels.h) as header: - with header.HeaderGuard(self._headerGuardName(namespace, self.KERNELS_FILE_NAME)): + with header.HeaderGuard(self._headerGuardName(namespace, template, self.KERNELS_FILE_NAME)): header.includeSys('cmath') header.includeSys('limits') header.include('yateto.h') header.include(fTensors.hName) cpp.include(fKernels.hName) with cpp.Namespace(namespace), header.Namespace(namespace): - # Group kernels by namespace - for kernel_namespace, kernels in kernel_dict.items(): - for kernel in kernels: - kernelOutline = optKernelGenerator.generateKernelOutline(kernel.nonZeroFlops, - kernel.cfg, - gemm_cfg, - kernel.target) - with cpp.Namespace(kernel_namespace), header.Namespace(kernel_namespace): - optKernelGenerator.generate(cpp, header, kernel.name, [kernelOutline]) - - # Group families by namespace - for family_namespace, families in kernel_family_dict.items(): - for family in families: - kernelOutlines = [None] * len(family) - for group, kernel in family.items(): - kernelOutlines[group] = optKernelGenerator.generateKernelOutline(kernel.nonZeroFlops, - kernel.cfg, - gemm_cfg, - kernel.target) - - with cpp.Namespace(family_namespace), header.Namespace(family_namespace): - optKernelGenerator.generate(cpp, header, family.name, kernelOutlines, family.stride()) + header.TemplateStructForward(optKernelGenerator.NAMESPACE, template[0]) + with header.TemplateStruct(optKernelGenerator.NAMESPACE, template[1]): + # Group kernels and families by namespace + for kernel_namespace, kernelsfamilies in kernel_dict.items(): + with header.Struct(kernel_namespace): + for ktype, kernel in kernelsfamilies: + if ktype == 'kernel': + kernelOutline = optKernelGenerator.generateKernelOutline(kernel.nonZeroFlops, + kernel.cfg, + gemm_cfg, + kernel.target) + + optKernelGenerator.generate(cpp, header, kernel.name, [kernelOutline], kernel_namespace) + else: + family = kernel + kernelOutlines = [None] * len(family) + for group, kernel in family.items(): + kernelOutlines[group] = optKernelGenerator.generateKernelOutline(kernel.nonZeroFlops, + kernel.cfg, + gemm_cfg, + kernel.target) + + optKernelGenerator.generate(cpp, header, family.name, kernelOutlines, kernel_namespace, family.stride()) + kernelSourceContent = kernelSource.getvalue() with Cpp(fKernels.cpp) as cpp: @@ -374,7 +380,7 @@ def unit_test_body(cpp, testFramework): print('Calling external code generators...') with Cpp(fRoutines.h) as header: - with header.HeaderGuard(self._headerGuardName(namespace, self.ROUTINES_FILE_NAME)): + with header.HeaderGuard(self._headerGuardName(namespace, template, self.ROUTINES_FILE_NAME)): cache.generate(header, fRoutines.cpp, fGpulikeRoutines.cpp) # Mapping basename -> tensor @@ -400,9 +406,9 @@ def unit_test_body(cpp, testFramework): print('Generating initialization code...') # Sort order: Namespace, base name of group, idx of tensor in group sort_key = lambda x: (x.namespace, x.name()) - initGen = InitializerGenerator(self._arch, sorted(tensors.values(), key=sort_key), sorted(scalars, key=sort_key)) + initGen = InitializerGenerator(self._arch, sorted(tensors.values(), key=sort_key), sorted(scalars, key=sort_key), template) with Cpp(fTensors.h) as header: - with header.HeaderGuard(self._headerGuardName(namespace, self.TENSORS_FILE_NAME)): + with header.HeaderGuard(self._headerGuardName(namespace, template, self.TENSORS_FILE_NAME)): with header.Namespace(namespace): initGen.generateTensorsH(header) with Cpp(fTensors.cpp) as cpp: @@ -410,7 +416,7 @@ def unit_test_body(cpp, testFramework): with cpp.Namespace(namespace): initGen.generateTensorsCpp(cpp) with Cpp(fInit.h) as header: - with header.HeaderGuard(self._headerGuardName(namespace, self.INIT_FILE_NAME)): + with header.HeaderGuard(self._headerGuardName(namespace, template, self.INIT_FILE_NAME)): header.include(fTensors.hName) header.include(self.SUPPORT_LIBRARY_HEADER) with header.Namespace(namespace):