diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h index 90b0811..843a322 100644 --- a/include/yateto/TensorView.h +++ b/include/yateto/TensorView.h @@ -7,13 +7,25 @@ #include #include +#if defined(__CUDACC__) || defined(__HIP__) +#define YATETO_HOST __host__ +#define YATETO_DEVICE __device__ +#else +#define YATETO_HOST +#define YATETO_DEVICE +#endif + +#define YATETO_HOSTDEVICE YATETO_HOST YATETO_DEVICE + namespace yateto { template class slice { public: - explicit slice(uint_t start = 0, uint_t stop = std::numeric_limits::max()) +#pragma omp declare target begin + YATETO_HOSTDEVICE explicit slice(uint_t start = 0, uint_t stop = std::numeric_limits::max()) : start(start), stop(stop) {} +#pragma omp declare target end uint_t start; uint_t stop; @@ -27,24 +39,25 @@ namespace yateto { template class TensorView { public: - explicit TensorView(std::initializer_list shape) { +#pragma omp declare target + YATETO_HOSTDEVICE explicit TensorView(std::initializer_list shape) { std::copy(shape.begin(), shape.end(), m_shape); } - explicit TensorView(uint_t const shape[]) { + YATETO_HOSTDEVICE explicit TensorView(uint_t const shape[]) { for (uint_t d = 0; d < Dim; ++d) { m_shape[d] = shape[d]; } } - constexpr uint_t dim() const { + YATETO_HOSTDEVICE static constexpr uint_t dim() { return Dim; } - uint_t shape(uint_t dim) const { + YATETO_HOSTDEVICE uint_t shape(uint_t dim) const { return m_shape[dim]; } - +#pragma omp end declare target protected: uint_t m_shape[Dim]; }; @@ -52,36 +65,39 @@ namespace yateto { template class TensorView<0, real_t, uint_t> { public: - explicit TensorView(std::initializer_list shape) {} +#pragma omp declare target + YATETO_HOSTDEVICE explicit TensorView(std::initializer_list shape) {} - explicit TensorView(uint_t const shape[]) {} + YATETO_HOSTDEVICE explicit TensorView(uint_t const shape[]) {} - constexpr uint_t dim() const { + YATETO_HOSTDEVICE static constexpr uint_t dim() { return 0; } - uint_t shape(uint_t dim) const { + 
YATETO_HOSTDEVICE uint_t shape(uint_t dim) const { return 0; } +#pragma omp end declare target }; template class DenseTensorView : public TensorView { public: - explicit DenseTensorView(real_t* values, std::initializer_list shape, std::initializer_list start, std::initializer_list stop) +#pragma omp declare target + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, std::initializer_list shape, std::initializer_list start, std::initializer_list stop) : TensorView(shape), m_values(values) { std::copy(start.begin(), start.end(), m_start); std::copy(stop.begin(), stop.end(), m_stop); computeStride(); } - explicit DenseTensorView(real_t* values, std::initializer_list shape) + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, std::initializer_list shape) : TensorView(shape), m_values(values), m_start{} { std::copy(shape.begin(), shape.end(), m_stop); computeStride(); } - explicit DenseTensorView(real_t* values, uint_t const shape[], uint_t const start[], uint_t const stop[]) + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, uint_t const shape[], uint_t const start[], uint_t const stop[]) : TensorView(shape), m_values(values) { for (uint_t d = 0; d < Dim; ++d) { m_start[d] = start[d]; @@ -90,7 +106,7 @@ namespace yateto { computeStride(); } - explicit DenseTensorView(real_t* values, uint_t const shape[]) + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, uint_t const shape[]) : TensorView(shape), m_values(values), m_start{} { for (uint_t d = 0; d < Dim; ++d) { m_stop[d] = shape[d]; @@ -98,7 +114,7 @@ namespace yateto { computeStride(); } - explicit DenseTensorView(real_t* values, uint_t const shape[], uint_t const stride[]) + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, uint_t const shape[], uint_t const stride[]) : TensorView(shape), m_values(values), m_start{} { for (uint_t d = 0; d < Dim; ++d) { m_stop[d] = shape[d]; @@ -106,11 +122,11 @@ namespace yateto { } } - uint_t size() const { + YATETO_HOSTDEVICE uint_t 
size() const { return (m_stop[Dim-1]-m_start[Dim-1]) * m_stride[Dim-1]; } - void setZero() { + YATETO_HOSTDEVICE void setZero() { uint_t entry[Dim]; std::copy(m_start, m_start + Dim, entry); while (entry[Dim-1] != m_stop[Dim-1]) { @@ -132,26 +148,26 @@ namespace yateto { } template - bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head) const { + YATETO_HOSTDEVICE bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head) const { return static_cast(head) >= start[dim] && static_cast(head) < stop[dim]; } template - bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head, Tail... tail) const { + YATETO_HOSTDEVICE bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head, Tail... tail) const { return static_cast(head) >= start[dim] && static_cast(head) < stop[dim] && isInRange(start, stop, dim+1, tail...); } template - bool isInRange(Entry... entry) const { + YATETO_HOSTDEVICE bool isInRange(Entry... entry) const { static_assert(sizeof...(entry) == Dim, "Number of arguments to isInRange(...) does not match the tensor dimension."); return isInRange(m_start, m_stop, 0, entry...); } template - real_t& operator()(Entry... entry) { + YATETO_HOSTDEVICE real_t& operator()(Entry... entry) { static_assert(sizeof...(entry) == Dim, "Number of arguments to operator() does not match the tensor dimension."); assert(isInRange(entry...)); @@ -159,14 +175,14 @@ namespace yateto { } template - real_t operator()(Entry... entry) const { + YATETO_HOSTDEVICE real_t operator()(Entry... 
entry) const { static_assert(sizeof...(entry) == Dim, "Number of arguments to operator() const does not match the tensor dimension."); assert(isInRange(entry...)); return m_values[address(entry...)]; } - real_t operator[](uint_t const entry[Dim]) const { + YATETO_HOSTDEVICE real_t operator[](uint_t const entry[Dim]) const { uint_t addr = 0; for (uint_t d = 0; d < Dim; ++d) { assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]); @@ -175,7 +191,7 @@ namespace yateto { return m_values[addr]; } - real_t& operator[](uint_t const entry[Dim]) { + YATETO_HOSTDEVICE real_t& operator[](uint_t const entry[Dim]) { uint_t addr = 0; for (uint_t d = 0; d < Dim; ++d) { assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]); @@ -185,7 +201,7 @@ namespace yateto { } template - void copyToView(view_t& other) const { + YATETO_HOSTDEVICE void copyToView(view_t& other) const { assert(Dim == other.dim()); uint_t entry[Dim]; @@ -217,7 +233,7 @@ namespace yateto { } template - auto subtensor(Entry... entry) -> DenseTensorView::value, real_t, uint_t> const { + YATETO_HOSTDEVICE auto subtensor(Entry... 
entry) -> DenseTensorView::value, real_t, uint_t> const { static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); constexpr auto nSlices = count_slices::value; uint_t begin[Dim]; @@ -228,16 +244,16 @@ namespace yateto { return subtensor; } - real_t* data() { + YATETO_HOSTDEVICE real_t* data() { return m_values; } - const real_t* data() const { + YATETO_HOSTDEVICE const real_t* data() const { return m_values; } protected: - void computeStride() { + YATETO_HOSTDEVICE void computeStride() { m_stride[0] = 1; for (uint_t d = 0; d < Dim-1; ++d) { m_stride[d+1] = m_stride[d] * (m_stop[d] - m_start[d]); @@ -245,26 +261,26 @@ namespace yateto { } template - uint_t address(Head head) const { + YATETO_HOSTDEVICE uint_t address(Head head) const { assert(static_cast(head) >= m_start[Dim-1] && static_cast(head) < m_stop[Dim-1]); return (head - m_start[Dim-1]) * m_stride[Dim-1]; } template - uint_t address(Head head, Tail... tail) const { + YATETO_HOSTDEVICE uint_t address(Head head, Tail... 
tail) const { uint_t const d = (Dim-1) - sizeof...(tail); assert(static_cast(head) >= m_start[d] && static_cast(head) < m_stop[d]); return (head - m_start[d]) * m_stride[d] + address(tail...); } template::value, int>::type = 0> - void extractDim(uint_t*& begin, uint_t*&, uint_t*&, uint_t dimNo, T entry) const { + YATETO_HOSTDEVICE void extractDim(uint_t*& begin, uint_t*&, uint_t*&, uint_t dimNo, T entry) const { assert(static_cast(entry) >= m_start[dimNo] && static_cast(entry) < m_stop[dimNo]); *begin++ = entry; } template>::value, int>::type = 0> - void extractDim(uint_t*& begin, uint_t*& size, uint_t*& stride, uint_t dimNo, T dim) const { + YATETO_HOSTDEVICE void extractDim(uint_t*& begin, uint_t*& size, uint_t*& stride, uint_t dimNo, T dim) const { *begin = std::max(m_start[dimNo], dim.start); *size++ = std::min(m_stop[dimNo], dim.stop) - *begin; ++begin; @@ -272,17 +288,18 @@ namespace yateto { } template - void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head) const { + YATETO_HOSTDEVICE void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head) const { extractDim(begin, size, stride, Dim-1, head); } template - void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head, Tail... tail) const { + YATETO_HOSTDEVICE void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head, Tail... 
tail) const { uint_t const d = (Dim-1) - sizeof...(tail); extractDim(begin, size, stride, d, head); extractSubtensor(begin, size, stride, tail...); } +#pragma omp end declare target real_t* m_values; uint_t m_start[Dim]; uint_t m_stop[Dim]; @@ -292,23 +309,24 @@ namespace yateto { template class DenseTensorView<0,real_t,uint_t> : public TensorView<0, real_t, uint_t> { public: - explicit DenseTensorView(real_t* values, std::initializer_list shape, std::initializer_list start, std::initializer_list stop) +#pragma omp declare target + YATETO_HOSTDEVICE explicit DenseTensorView(real_t* values, std::initializer_list shape, std::initializer_list start, std::initializer_list stop) : TensorView<0, real_t, uint_t>(shape), m_values(values) { } - uint_t size() const { + YATETO_HOSTDEVICE uint_t size() const { return 1; } - void setZero() { + YATETO_HOSTDEVICE void setZero() { m_values[0] = 0.0; } template - void copyToView(view_t& other) { + YATETO_HOSTDEVICE void copyToView(view_t& other) { other.m_values[0] = m_values[0]; } - +#pragma omp end declare target protected: real_t* m_values; }; @@ -316,23 +334,24 @@ namespace yateto { template class CSCMatrixView : public TensorView<2, real_t, uint_t> { public: - explicit CSCMatrixView(real_t* values, std::initializer_list shape, uint_t const* rowInd, uint_t const* colPtr) +#pragma omp declare target + YATETO_HOSTDEVICE explicit CSCMatrixView(real_t* values, std::initializer_list shape, uint_t const* rowInd, uint_t const* colPtr) : TensorView<2, real_t, uint_t>(shape), m_values(values), m_rowInd(rowInd), m_colPtr(colPtr) { } - explicit CSCMatrixView(real_t* values, uint_t const shape[], uint_t const* rowInd, uint_t const* colPtr) + YATETO_HOSTDEVICE explicit CSCMatrixView(real_t* values, uint_t const shape[], uint_t const* rowInd, uint_t const* colPtr) : TensorView<2, real_t, uint_t>(shape), m_values(values), m_rowInd(rowInd), m_colPtr(colPtr) { } - uint_t size() const { + YATETO_HOSTDEVICE uint_t size() const { return m_colPtr[ 
this->shape(1) ]; } - void setZero() { + YATETO_HOSTDEVICE void setZero() { memset(m_values, 0, size() * sizeof(real_t)); } - real_t operator()(uint_t row, uint_t col) const { + YATETO_HOSTDEVICE real_t operator()(uint_t row, uint_t col) const { assert(col >= 0 && col < this->shape(1)); uint_t addr = m_colPtr[ col ]; uint_t stop = m_colPtr[ col+1 ]; @@ -347,7 +366,7 @@ namespace yateto { return m_values[addr]; } - real_t& operator()(uint_t row, uint_t col) { + YATETO_HOSTDEVICE real_t& operator()(uint_t row, uint_t col) { assert(col >= 0 && col < this->shape(1)); uint_t addr = m_colPtr[ col ]; uint_t stop = m_colPtr[ col+1 ]; @@ -362,7 +381,7 @@ namespace yateto { return m_values[addr]; } - bool isInRange(uint_t row, uint_t col) const { + YATETO_HOSTDEVICE bool isInRange(uint_t row, uint_t col) const { assert(col >= 0 && col < this->shape(1)); uint_t addr = m_colPtr[ col ]; uint_t stop = m_colPtr[ col+1 ]; @@ -376,16 +395,16 @@ namespace yateto { return false; } - real_t& operator[](const uint_t entry[2]) { + YATETO_HOSTDEVICE real_t& operator[](const uint_t entry[2]) { return operator()(entry[0], entry[1]); } - real_t operator[](const uint_t entry[2]) const { + YATETO_HOSTDEVICE real_t operator[](const uint_t entry[2]) const { return operator()(entry[0], entry[1]); } template - void copyToView(view_t& other) { + YATETO_HOSTDEVICE void copyToView(view_t& other) { assert(2 == other.dim()); assert(this->shape(0) == other.shape(0) && this->shape(1) == other.shape(1)); @@ -399,7 +418,7 @@ namespace yateto { } } } - +#pragma omp end declare target protected: real_t* m_values; uint_t const* m_rowInd; diff --git a/yateto/arch.py b/yateto/arch.py index a19bc58..ffd3be5 100644 --- a/yateto/arch.py +++ b/yateto/arch.py @@ -111,7 +111,35 @@ def onHeap(self, numReals): def __eq__(self, other): return self.name == other.name - + + def indexing(self): + if self.backend in ['cuda', 'hip']: + return ('threadIdx.x', 'blockDim.x') + elif self.backend in ['omptarget']: + return 
('omp_get_thread_num()', 'omp_get_num_threads()') + elif self.backend in ['oneapi', 'hipsycl', 'acpp']: + return ('item->get_local_id(0)', 'item->get_group().get_group_id(0)') + else: + raise NotImplementedError('Inline GPU kernel indexing not yet implemented for this type of backend') + + def barrier(self): + # TODO: maybe also grid-wise syncs? + if self.backend in ['cuda', 'hip']: + return '__syncthreads();' + elif self.backend in ['omptarget']: + return '#pragma omp barrier' + elif self.backend in ['oneapi', 'hipsycl', 'acpp']: + return 'item->barrier(sycl::access::fence_space::local_space);' + else: + raise NotImplementedError('Inline GPU kernel barriers are not yet implemented for this type of backend') + + def headers(self): + if self.backend in ['cpp', 'cuda']: + return [] + elif self.backend in ['hip']: + return ['hip/hip_runtime.h'] + elif self.backend in ['omptarget']: + return ['omp.h'] + elif self.backend in ['oneapi', 'hipsycl', 'acpp']: + return ['sycl/sycl.hpp'] + else: + return [] def _get_name_and_precision(ident): return ident[1:], ident[0].upper() diff --git a/yateto/codegen/common.py b/yateto/codegen/common.py index e267494..2d5803c 100644 --- a/yateto/codegen/common.py +++ b/yateto/codegen/common.py @@ -50,7 +50,7 @@ def fromVar(cls, var, indices): is_const = var.tensor.is_compute_constant() return cls(str(var), indices, var.memoryLayout(), var.eqspp(), is_const, var.is_temporary) -def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', indexNo=None): +def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', indexNo=None, indexer=None): flops = 0 if indexNo == None: indexNo = len(indexNames)-1 @@ -59,10 +59,16 @@ def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', indexNo else: index = indexNames[indexNo] rng = ranges[index] + iterstart = rng.start + increment = 1 if pragmaSimd and indexNo == 0: - cpp('#pragma omp simd') - with cpp.For('int {3}{0} = {1}; {3}{0} < {2}; ++{3}{0}'.format(index, rng.start, rng.stop, prefix)): - flops = forLoops(cpp, indexNames, ranges,
body, pragmaSimd, prefix, indexNo-1) + if indexer is None: + cpp('#pragma omp simd') + else: + iterstart = f'{rng.start} + {indexer[0]}' + increment = f'{indexer[1]}' + with cpp.For('int {3}{0} = {1}; {3}{0} < {2}; {3}{0} += {4}'.format(index, iterstart, rng.stop, prefix, increment)): + flops = forLoops(cpp, indexNames, ranges, body, pragmaSimd, prefix, indexNo-1, indexer) flops = flops * rng.size() return flops diff --git a/yateto/codegen/copyscaleadd/factory.py b/yateto/codegen/copyscaleadd/factory.py index dfbeafb..61f7b72 100644 --- a/yateto/codegen/copyscaleadd/factory.py +++ b/yateto/codegen/copyscaleadd/factory.py @@ -42,4 +42,4 @@ def generator(arch, descr, gemm_cfg, target): return CopyScaleAddGenerator(arch, descr) else: raise NotImplementedError(f'no implementation found for {target} target') - return Generic(arch, descr) + return Generic(arch, descr, target) diff --git a/yateto/codegen/copyscaleadd/generic.py b/yateto/codegen/copyscaleadd/generic.py index 44bf704..91fa35b 100644 --- a/yateto/codegen/copyscaleadd/generic.py +++ b/yateto/codegen/copyscaleadd/generic.py @@ -1,9 +1,10 @@ from ..common import * class Generic(object): - def __init__(self, arch, descr): + def __init__(self, arch, descr, target): self._arch = arch self._descr = descr + self._target = target def _formatTerm(self, alpha, term): prefix = '' @@ -42,4 +43,5 @@ def __call__(s): return flop - return forLoops(cpp, d.result.indices, d.loopRanges, CopyScaleAddBody()) + indexer = self._arch.indexing() if self._target == 'igpu' else None + return forLoops(cpp, d.result.indices, d.loopRanges, CopyScaleAddBody(), indexer=indexer) diff --git a/yateto/codegen/factory.py b/yateto/codegen/factory.py index b07ee8f..8ea3144 100644 --- a/yateto/codegen/factory.py +++ b/yateto/codegen/factory.py @@ -25,7 +25,7 @@ def generic_create(self, node, *args): def simple(self, result, term, add, scalar, routineCache, gemm_cfg): raise NotImplementedError - def temporary(self, bufname, size, iniZero=False, 
memory=list()): + def temporary(self, bufname, size, offset, iniZero=False, memory=list()): assert(iniZero == False or len(memory) == 0) if self._target == 'cpu': @@ -52,6 +52,13 @@ def temporary(self, bufname, size, iniZero=False, memory=list()): elif memory: ini = ' = {{{}}}'.format(', '.join(memory)) self._cpp(f'alignas({self._arch.alignment}) {self._arch.typename} {bufname}[{size}] {ini};') + elif self._target == 'igpu': + self._cpp(f'{self._arch.typename}* {bufname} = &sharedMemory[{offset}];') + if iniZero: + self._cpp.memset(bufname, size, self._arch.typename) + if memory: + for i, data in enumerate(memory): + self._cpp(f'{bufname}[{i}] = {data};') else: declaration = f'{self._arch.typename}* {bufname}' total_size = f'{BatchedOperationsAux.NUM_ELEMENTS_NAME} * {size}' @@ -59,7 +66,7 @@ def temporary(self, bufname, size, iniZero=False, memory=list()): def freeTmp(self): - if self._target == 'cpu': + if self._target in ['cpu', 'igpu']: for free in self._freeList: self._cpp(f'free({free});') elif self._target == 'gpu': @@ -70,7 +77,7 @@ def freeTmp(self): self._freeList = [] def reset_stream(self): - if self._target == 'cpu': + if self._target in ['cpu', 'igpu']: pass elif self._target == 'gpu': self._cpp(f'{BatchedOperationsAux.STREAM_PTR_NAME} = {BatchedOperationsAux.FORBIDDEN_STREAM_PTR};') @@ -78,7 +85,7 @@ def reset_stream(self): raise RuntimeError('unknown compute target') def reset_flags(self): - if self._target == 'cpu': + if self._target in ['cpu', 'igpu']: pass elif self._target == 'gpu': self._cpp(f'{BatchedOperationsAux.FLAGS_NAME} = nullptr;') @@ -250,7 +257,7 @@ def tensor(self, node, resultName, maxValue = 512): spp = node.spp() isDense = spp.count_nonzero() == size if isDense: - self.temporary(resultName, size) + self.temporary(resultName, size, 0) with self._cpp.For('int i = 0; i < {}; ++i'.format(size)): self._cpp('{}[i] = static_cast<{}>((i + {}) % {} + 1);'.format(resultName, self._arch.typename, self._rand, maxValue)) else: @@ -259,7 +266,7 
@@ def tensor(self, node, resultName, maxValue = 512): for entry in zip(*nz): addr = ml.address(entry) memory[addr] = str(float((addr + self._rand) % maxValue)+1.0) - self.temporary(resultName, size, memory=memory) + self.temporary(resultName, size, 0, memory=memory) self._rand += 1 diff --git a/yateto/codegen/gemm/factory.py b/yateto/codegen/gemm/factory.py index a2a98c0..1378622 100644 --- a/yateto/codegen/gemm/factory.py +++ b/yateto/codegen/gemm/factory.py @@ -88,4 +88,4 @@ def generator(arch, descr, gemm_cfg, target): target) if gemmTool: return GemmGen(arch, descr, gemmTool) - return Generic(arch, descr) + return Generic(arch, descr, target) diff --git a/yateto/codegen/gemm/generic.py b/yateto/codegen/gemm/generic.py index 3b8537e..e2b1f06 100644 --- a/yateto/codegen/gemm/generic.py +++ b/yateto/codegen/gemm/generic.py @@ -4,8 +4,10 @@ class Generic(object): OUTER_INDEX = 'o' INNER_INDEX = 'i' - def __init__(self, arch, descr): + def __init__(self, arch, descr, target): + self._arch = arch self._descr = descr + self._target = target def _flopInit(self, beta): return 0 if beta in [0.0, 1.0] else 1 @@ -77,7 +79,15 @@ def _generateSparseDense(self, cpp): sizes = {0: k.size(), 1: n.size(), self.OUTER_INDEX: m.size(), self.INNER_INDEX: n.size()} trans = d.transB - with cpp.For('int {0} = 0; {0} < {1}; ++{0}'.format(self.OUTER_INDEX, sizes[self.OUTER_INDEX])): + if self._target == 'igpu': + indexing = self._arch.indexing() + iterstart = indexing[0] + iterincr = indexing[1] + else: + iterstart = 0 + iterincr = 1 + + with cpp.For(f'int {self.OUTER_INDEX} = {iterstart}; {self.OUTER_INDEX} < {sizes[self.OUTER_INDEX]}; {self.OUTER_INDEX} += {iterincr}'): if d.beta != 1.0: with cpp.For('int {0} = 0; {0} < {1}; ++{0}'.format(self.INNER_INDEX, sizes[self.INNER_INDEX])): CAddr = result([self.INNER_INDEX, self.INNER_INDEX]) @@ -107,10 +117,18 @@ def _generateDenseDense(self, cpp): Aaccess = self._accessFun(d.leftTerm, (m.start, k.start), False, d.transA) Baccess = 
self._accessFun(d.rightTerm, (k.start, n.start), False, d.transB) Caccess = self._accessFun(d.result, (m.start, n.start), False, False) + + if self._target == 'igpu': + indexing = self._arch.indexing() + mstart = indexing[0] + minc = indexing[1] + else: + mstart = 0 + minc = 1 with cpp.For('int n = 0; n < {0}; ++n'.format(n.size())): if d.beta != 1.0: - with cpp.For('int m = 0; m < {0}; ++m'.format(m.size())): + with cpp.For(f'int m = {mstart}; m < {m.size()}; m += {minc}'): cpp('{} = {}{};'.format( Caccess('m', 'n'), d.beta, @@ -118,7 +136,7 @@ def _generateDenseDense(self, cpp): ) ) with cpp.For('int k = 0; k < {0}; ++k'.format(k.size())): - with cpp.For('int m = 0; m < {0}; ++m'.format(m.size())): + with cpp.For(f'int m = {mstart}; m < {m.size()}; m += {minc}'): cpp( '{C} += {alpha} * {A} * {B};'.format( C = Caccess('m', 'n'), alpha = d.alpha, diff --git a/yateto/codegen/indexsum/factory.py b/yateto/codegen/indexsum/factory.py index 48e3423..2ef6311 100644 --- a/yateto/codegen/indexsum/factory.py +++ b/yateto/codegen/indexsum/factory.py @@ -21,7 +21,7 @@ def __init__(self, alpha, add: bool, result: IndexedTensorDescription, term: Ind def generator(arch, descr, target): - if target == 'cpu': + if target in ['cpu', 'igpu']: return Generic(arch, descr) elif target == 'gpu': raise RuntimeError("IndexSum operation has not been implemented for GPU-like architectures") \ No newline at end of file diff --git a/yateto/codegen/indexsum/generic.py b/yateto/codegen/indexsum/generic.py index 36df9f5..eae1f5c 100644 --- a/yateto/codegen/indexsum/generic.py +++ b/yateto/codegen/indexsum/generic.py @@ -1,9 +1,10 @@ from ..common import * class Generic(object): - def __init__(self, arch, descr): + def __init__(self, arch, descr, target): self._arch = arch self._descr = descr + self._target = target def generate(self, cpp, routineCache): d = self._descr @@ -27,4 +28,5 @@ def __call__(s): flop = 1 if d.alpha != 1.0 else 0 return d.sumLoopRange.size() + flop - return forLoops(cpp, 
d.result.indices, d.loopRanges, IndexSumBody()) + indexer = self._arch.indexing() if self._target == 'igpu' else None + return forLoops(cpp, d.result.indices, d.loopRanges, IndexSumBody(), indexer=indexer) diff --git a/yateto/codegen/product/factory.py b/yateto/codegen/product/factory.py index 8a85366..5699d28 100644 --- a/yateto/codegen/product/factory.py +++ b/yateto/codegen/product/factory.py @@ -25,8 +25,8 @@ def __init__(self, alpha, add: bool, result: IndexedTensorDescription, leftTerm: self.loopRanges = rA def generator(arch, descr, target): - if target == 'cpu': - return Generic(arch, descr) + if target in ['cpu', 'igpu']: + return Generic(arch, descr, target) elif target == 'gpu': raise RuntimeError("Product operation has not been implemented for GPU-like architectures") diff --git a/yateto/codegen/product/generic.py b/yateto/codegen/product/generic.py index 5cbba15..205c758 100644 --- a/yateto/codegen/product/generic.py +++ b/yateto/codegen/product/generic.py @@ -1,9 +1,10 @@ from ..common import * class Generic(object): - def __init__(self, arch, descr): + def __init__(self, arch, descr, target): self._arch = arch self._descr = descr + self._target = target def _mult(self, alpha): return '{} * '.format(alpha) if alpha != 1.0 else '' @@ -36,7 +37,8 @@ def __call__(s): ) return self._flop(d.add, d.alpha) - return forLoops(cpp, d.result.indices, d.loopRanges, ProductBody()) + indexer = self._arch.indexing() if self._target == 'igpu' else None + return forLoops(cpp, d.result.indices, d.loopRanges, ProductBody(), indexer=indexer) def _generateSparseDense(self, cpp): raise NotImplementedError diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py index f0e67f5..49ea148 100644 --- a/yateto/codegen/visitor.py +++ b/yateto/codegen/visitor.py @@ -73,19 +73,42 @@ def generate(self, cpp, cfg, factory, routineCache, gemm_cfg): # NOTE: it is required to know in case if the memory is allocated on the heap # an provided by the user required_tmp_mem = 0 + 
memoffset = 0 cfg = DetermineLocalInitialization().visit(cfg) localPtrs = set() for pp in cfg: localPtrs.update(pp.bufferMap.keys()) if localPtrs: cpp( '{}{};'.format(self._arch.typename, ','.join(map(lambda x: ' *' + str(x), localPtrs))) ) + + if len(cfg) > 0: + last = cfg[-1] + first = cfg[0] + else: + first = None + last = None + + used = set() + + # TODO: rework synchronization for pp in cfg: for buf, size in pp.initBuffer.items(): required_tmp_mem += size * self._arch.bytesPerReal bufname = self._bufferName(buf) - factory.temporary(bufname, size) + factory.temporary(bufname, size, memoffset) + memoffset += size + + needsBarrier = False for local, buf in pp.bufferMap.items(): - cpp('{} = {};'.format(local, self._bufferName(buf))) + bufname = self._bufferName(buf) + cpp('{} = {};'.format(local, bufname)) + if bufname in used: + needsBarrier = True + used.add(bufname) + + if needsBarrier or (pp.action and pp.action.result.is_temporary and not first): + cpp(self._arch.barrier()) + action = pp.action if action: scalar = self.deduce_scalar(action) @@ -94,6 +117,9 @@ def generate(self, cpp, cfg, factory, routineCache, gemm_cfg): hwFlops += factory.create(action.term.node, action.result, action.term.variableList(), action.add, scalar, prefetchName, routineCache, gemm_cfg) else: hwFlops += factory.simple(action.result, action.term, action.add, scalar, routineCache, gemm_cfg) + + if action.result.is_temporary and pp is not last: + cpp(self._arch.barrier()) return hwFlops, required_tmp_mem class OptimisedKernelGenerator(KernelGenerator): @@ -324,6 +350,10 @@ def generate_extra_offset_args(base_name_with_namespace, groups): for base_name, groups in tensors.items(): generate_extra_offset_args(base_name, groups) + elif target == 'igpu': + header(f'{self._arch.typename}* sharedMemory = nullptr;') + if self._arch.backend in ['oneapi', 'acpp', 'hipsycl']: + header('sycl::nd_item<1>* item;') header.emptyline() if len(prefetch) > 0: @@ -332,10 +362,14 @@ def 
generate_extra_offset_args(base_name_with_namespace, groups): kernelArgs(baseName, groups, writable=False, is_constant=False, target='any') header('{} {};'.format(self.PREFETCHSTRUCT_NAME, self.PREFETCHVAR_NAME)) header.emptyline() + + hostdevice = 'YATETO_DEVICE' if target == 'igpu' else '' for index, kernelOutline in enumerate(kernelOutlines): if kernelOutline: - header.functionDeclaration(executeName(index)) + header('#pragma omp declare target') + header.functionDeclaration(executeName(index), returnType=f'{hostdevice} void') + header('#pragma omp end declare target') if familyStride is not None: header('using {} = void ({}::*)();'.format(self.MEMBER_FUNCTION_PTR_NAME, name)) @@ -349,8 +383,14 @@ def generate_extra_offset_args(base_name_with_namespace, groups): indexF = indexFun(familyStride) with header.Function(self.FIND_EXECUTE_NAME, args, '{} {}'.format(MODIFIERS, self.MEMBER_FUNCTION_PTR_NAME)): header('return {}[{}];'.format(self.EXECUTE_ARRAY_NAME, indexF)) - with header.Function(self.EXECUTE_NAME, args, '{} void'.format(INLINE)): - header('(this->*{}({}))();'.format(self.FIND_EXECUTE_NAME, ', '.join(ndargs(len(familyStride))))) + with header.Function(self.EXECUTE_NAME, args, f'{hostdevice} {INLINE} void'): + if target == 'igpu': + header(f'const auto indexF = {indexF};') + for index, kernelOutline in enumerate(kernelOutlines): + with header.If(f'(indexF == {index})'): + header(f'{executeName(index)}();') + else: + header('(this->*{}({}))();'.format(self.FIND_EXECUTE_NAME, ', '.join(ndargs(len(familyStride))))) aux_functions = [self.NONZEROFLOPS_NAME, self.HARDWAREFLOPS_NAME, self.TEMP_MEM_REQUIRED_NAME] for function in aux_functions: @@ -379,28 +419,31 @@ def generate_extra_offset_args(base_name_with_namespace, groups): for index, kernelOutline in enumerate(kernelOutlines): if kernelOutline is None: continue + + fileh = header if target == 'igpu' else cpp - with cpp.Function('{}::{}::{}'.format(self.NAMESPACE, name, executeName(index))): - for
base_name_with_namespace, groups in kernelOutline.scalars.items(): - base_name = Tensor.splitBasename(base_name_with_namespace)[-1] - if len(next(iter(groups))) > 0: - for gis in groups: - cpp('assert(!std::isnan({}({})));'.format(base_name, ','.join(str(gi) for gi in gis))) - else: - cpp(f'assert(!std::isnan({base_name}));') - for base_name_with_namespace, groups in kernelOutline.tensors.items(): - base_name = Tensor.splitBasename(base_name_with_namespace)[-1] - if len(next(iter(groups))) > 0: - for gis in groups: - cpp('assert({}({}) != nullptr);'.format(base_name, ','.join(str(gi) for gi in gis))) - else: - cpp(f'assert({base_name} != nullptr);') + with fileh.Function('{}::{}::{}'.format(self.NAMESPACE, name, executeName(index)), returnType=f'{hostdevice} void'): + if target != 'igpu': + for base_name_with_namespace, groups in kernelOutline.scalars.items(): + base_name = Tensor.splitBasename(base_name_with_namespace)[-1] + if len(next(iter(groups))) > 0: + for gis in groups: + fileh('assert(!std::isnan({}({})));'.format(base_name, ','.join(str(gi) for gi in gis))) + else: + fileh(f'assert(!std::isnan({base_name}));') + for base_name_with_namespace, groups in kernelOutline.tensors.items(): + base_name = Tensor.splitBasename(base_name_with_namespace)[-1] + if len(next(iter(groups))) > 0: + for gis in groups: + fileh('assert({}({}) != nullptr);'.format(base_name, ','.join(str(gi) for gi in gis))) + else: + fileh(f'assert({base_name} != nullptr);') - if target == 'gpu': - cpp(f'assert({BatchedOperationsAux.NUM_ELEMENTS_NAME} != 0);') - cpp(f'assert({BatchedOperationsAux.STREAM_PTR_NAME} != {BatchedOperationsAux.FORBIDDEN_STREAM_PTR});') + if target == 'gpu': + fileh(f'assert({BatchedOperationsAux.NUM_ELEMENTS_NAME} != 0);') + fileh(f'assert({BatchedOperationsAux.STREAM_PTR_NAME} != {BatchedOperationsAux.FORBIDDEN_STREAM_PTR});') - cpp(kernelOutline.function) + fileh(kernelOutline.function) class UnitTestGenerator(KernelGenerator): KERNEL_VAR = 'krnl' @@ -490,7 
+533,7 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, target, gemm_cfg, for var in variables: factory.tensor(var.tensor, self._tensorName(var)) - factory.temporary(self._name(var), var.memoryLayout().requiredReals(), iniZero=True) + factory.temporary(self._name(var), var.memoryLayout().requiredReals(), 0, iniZero=True) shape = var.memoryLayout().shape() cpp('{supportNS}::DenseTensorView<{dim},{arch.typename},{arch.uintTypename}> {viewName}({utName}, {{{shape}}}, {{{start}}}, {{{shape}}});'.format( @@ -535,7 +578,6 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, target, gemm_cfg, cpp( f'{self.QUEUE}.memcpy({self._devPtrTensorName(var)}, &{self._devTensorName(var)}, sizeof({self._arch.typename}*)).wait();' ) cpp.emptyline() - cpp( '{}{}::{} {};'.format(kernel_prefix, OptimisedKernelGenerator.NAMESPACE, kernelClass, self.KERNEL_VAR) ) for var in scalars: cpp( '{}.{}{} = {};'.format(self.KERNEL_VAR, var.baseName(), self._groupIndex(var), self._tensorNameS(var)) ) @@ -746,10 +788,10 @@ def generateTensorsH(self, header): header('template') with header.Struct(self.CONTAINER_CLASS_NAME): header('T {}[{}];'.format(self.CONTAINER_DATA_NAME, reduce(operator.mul, groupSize))) - header('{}() : {}{{}} {{}}'.format(self.CONTAINER_CLASS_NAME, self.CONTAINER_DATA_NAME)) - with header.Function('operator()', typedArgs, '{} T&'.format(INLINE)): + header('YATETO_HOSTDEVICE {}() : {}{{}} {{}}'.format(self.CONTAINER_CLASS_NAME, self.CONTAINER_DATA_NAME)) + with header.Function('operator()', typedArgs, 'YATETO_HOSTDEVICE {} T&'.format(INLINE)): header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) - with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True): + with header.Function('operator()', typedArgs, 'YATETO_HOSTDEVICE {} T const&'.format(INLINE), const=True): header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) for namespace, 
scalar_dict in self.iterate_collect_scalar(): with header.Namespace(namespace), header.Namespace(self.TENSOR_NAMESPACE): @@ -850,7 +892,7 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat tv = self._tensorViewGenerator(ml) with cpp.Struct(self.VIEW_STRUCT_NAME): cpp('typedef {} {};'.format(tv.typename(len(ml.shape()), self._arch), self.VIEW_TYPE_NAME)) - with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgs, returnType='{} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME)): + with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgs, returnType='YATETO_HOSTDEVICE {} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME)): tv.generate(cpp, ml, self._arch, None) else: typedArgs = typedNdArgs(len(groupSize), self._arch.uintTypename) @@ -865,7 +907,7 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat cpp('template<>') with cpp.Struct('{}::{}<{}>'.format(baseNameWithoutNamespace, self.VIEW_STRUCT_NAME, special)): cpp('typedef {} {};'.format(typename, self.VIEW_TYPE_NAME)) - with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgs, returnType='{} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME)): + with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgs, returnType='YATETO_HOSTDEVICE {} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME)): tv.generate(cpp, ml, self._arch, index(group)) def _array(self, cpp, typ, name, content, groupSize, declarationOnly=False, alwaysArray=True, constexpr=True, static=True): diff --git a/yateto/generator.py b/yateto/generator.py index 49c2559..e5e1d12 100644 --- a/yateto/generator.py +++ b/yateto/generator.py @@ -24,7 +24,7 @@ class Kernel(object): BASE_NAME = r'[a-zA-Z]\w*' VALID_NAME = r'^{}$'.format(BASE_NAME) - VALID_TARGETS = ['cpu', 'gpu'] + VALID_TARGETS = ['cpu', 'gpu', 'igpu'] def __init__(self, name, ast, prefetch=None, namespace=None, target='cpu'): self.name = name @@ -367,6 +367,10 @@ def unit_test_body(cpp, testFramework): header.includeSys('limits') 
header.include('yateto.h') header.include(fTensors.hName) + + for path in self._arch.headers(): + header.includeSys(path) + cpp.include(fKernels.hName) with cpp.Namespace(namespace), header.Namespace(namespace): # Group kernels by namespace