From 006af5df9286b0060e3861d11356c90ce368d2b6 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 25 Nov 2025 01:53:57 +0100 Subject: [PATCH 01/15] Fix one-element sum accumulations --- yateto/controlflow/visitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yateto/controlflow/visitor.py b/yateto/controlflow/visitor.py index b06d301..0042e80 100644 --- a/yateto/controlflow/visitor.py +++ b/yateto/controlflow/visitor.py @@ -42,7 +42,7 @@ def generic_visit(self, node): def visit_Add(self, node): variables = [self.visit(child) for child in node] - assert len(variables) > 1 + assert len(variables) >= 1 variables.sort(key=lambda var: int(not var.writable) + int(not var.isGlobal())) From 16f76166927a5e5685982751d90b34126846819b Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 25 Nov 2025 01:54:15 +0100 Subject: [PATCH 02/15] Fix action merging if we'd override a global variable --- yateto/controlflow/transformer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yateto/controlflow/transformer.py b/yateto/controlflow/transformer.py index 9195bdc..db937d4 100644 --- a/yateto/controlflow/transformer.py +++ b/yateto/controlflow/transformer.py @@ -99,7 +99,11 @@ def visit(self, cfg): V = ua.variables() for j in range(i+1,n): va = cfg[j].action - if va.isRHSVariable() and ua.result == va.term and va.result not in V and (ua.hasTrivialScalar() or va.hasTrivialScalar()): + if va.isRHSVariable() \ + and ua.result == va.term \ + and va.result not in V \ + and (ua.hasTrivialScalar() or va.hasTrivialScalar()) \ + and ua.result.isLocal(): found = j break elif ua.result in va.variables() or ua.result == va.result: From e1d7f49211bd6f20f41773c03eb6777dba894af1 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 29 Nov 2025 22:32:26 +0100 Subject: [PATCH 03/15] Add sparse tensor support --- include/yateto/TensorView.h | 186 ++++++++++++++++++++- yateto/aspp.py | 12 +- yateto/ast/node.py | 2 +- yateto/codegen/common.py | 32 +++- yateto/codegen/gemm/factory.py | 5 +- yateto/codegen/log/generic.py | 110 +++++++++---- yateto/codegen/product/factory.py | 4 +- yateto/codegen/visitor.py | 25 ++- yateto/memory.py | 257 +++++++++++++++++++++++++++--- 9 files changed, 557 insertions(+), 76 deletions(-) diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h index 0e243dd..f15c438 100644 --- a/include/yateto/TensorView.h +++ b/include/yateto/TensorView.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -109,6 +110,52 @@ namespace yateto { uint_t size() const { return (m_stop[Dim-1]-m_start[Dim-1]) * m_stride[Dim-1]; } + + template + void forall(F&& function) { + uint_t entry[Dim]; + std::copy(m_start, m_start + Dim, entry); + while (entry[Dim-1] != m_stop[Dim-1]) { + auto values = &operator[](entry); + for (uint_t i = 0; i < m_stop[0]-m_start[0]; ++i) { + entry[0] = i + m_start[0]; + std::invoke(std::forward(function), entry, values[i*m_stride[0]]); + } + if (Dim == 1) { + break; + } + + uint_t d = 0; + do { + entry[d] = m_start[d]; + d++; + ++entry[d]; + } while (entry[d] == m_stop[d] && d < Dim-1); + } + } + + template + void forall(F&& function) const { + uint_t entry[Dim]; + std::copy(m_start, m_start + Dim, entry); + while (entry[Dim-1] != m_stop[Dim-1]) { + auto values = &operator[](entry); + for (uint_t i = 0; i < m_stop[0]-m_start[0]; ++i) { + entry[0] = i + m_start[0]; + std::invoke(std::forward(function), entry, values[i*m_stride[0]]); + } + if (Dim == 1) { + break; + } + + uint_t d = 0; + do { + entry[d] = m_start[d]; + d++; + ++entry[d]; + } while (entry[d] == m_stop[d] && d < Dim-1); + } + } void setZero() { uint_t entry[Dim]; @@ -400,11 +447,148 @@ namespace yateto { } } + template + void forall(F&& function) { + uint_t entry[2]; + uint_t ncols = this->shape(1); + for (uint_t col = 0; col < ncols; ++col) { + entry[1] = col; + for (uint_t i = m_colPtr[col]; i < m_colPtr[col+1]; ++i) { + entry[0] = m_rowInd[i]; + std::invoke(std::forward(function), entry, m_values[i]); + } + } + } + + template + void forall(F&& function) const { + uint_t entry[2]; + uint_t ncols = this->shape(1); + for (uint_t col = 0; col < ncols; ++col) { + entry[1] = col; + for (uint_t i = m_colPtr[col]; i < m_colPtr[col+1]; ++i) { + entry[0] = m_rowInd[i]; + std::invoke(std::forward(function), entry, m_values[i]); + } + } + } + protected: real_t* m_values; uint_t const* m_rowInd; uint_t const* m_colPtr; }; -} + + template + class PatternMatrixView : public TensorView { + public: + // TODO: remove const_cast(pattern) + + explicit PatternMatrixView(real_t* values, std::initializer_list shape, uint_t const* pattern) + : TensorView(shape), m_values(values), m_pattern(const_cast(pattern), shape) { + + m_pattern.forall([&](const auto& index, const auto& idxval) { + if (idxval > 0) { + ++m_size; + } + }); + } + + explicit PatternMatrixView(real_t* values, uint_t const shape[], uint_t const* pattern) + : TensorView(shape), m_values(values), m_pattern(const_cast(pattern), shape) { + + m_pattern.forall([&](const auto& index, const auto& idxval) { + if (idxval > 0) { + ++m_size; + } + }); + } + + uint_t size() const { + return m_size; + } + + void setZero() { + m_pattern.forall([&](const auto& index, const auto& idxval) { + if (idxval > 0) { + m_values[idxval - 1] = 0; + } + }); + } + + template + real_t operator()(Args... index) const { + static_assert((std::is_integral_v && ...)); + const auto idx = m_pattern(index...); + return m_values[idx - 1]; + } + + template + real_t& operator()(Args... index) { + static_assert((std::is_integral_v && ...)); + const auto idx = m_pattern(index...); + return m_values[idx - 1]; + } + + template + bool isInRange(Args... index) const { + static_assert((std::is_integral_v && ...)); + const auto idx = m_pattern(index...); + return idx > 0; + } + + real_t& operator[](const uint_t entry[Dim]) { + const auto idx = m_pattern[entry]; + return m_values[idx - 1]; + } + + real_t operator[](const uint_t entry[Dim]) const { + const auto idx = m_pattern[entry]; + return m_values[idx - 1]; + } + + template + void forall(F&& function) { + m_pattern.forall([&, function = std::forward(function)](const auto& index, const auto& idxval) { + if (idxval > 0) { + std::invoke(function, index, m_values[idxval - 1]); + } + }); + } + + template + void forall(F&& function) const { + m_pattern.forall([&, function = std::forward(function)](const auto& index, const auto& idxval) { + if (idxval > 0) { + std::invoke(function, index, m_values[idxval - 1]); + } + }); + } + + template + void copyToView(view_t& other) { + m_pattern.forall([&](const auto& index, const auto& idxval) { + if (idxval > 0) { + other[index] = m_values[idxval - 1]; + } + else { + other[index] = 0; + } + }); + } + + template + auto subtensor(Entry... entry) const { + static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); + const auto patternSubtensor = m_pattern.subtensor(entry...); + return PatternMatrixView::value, real_t, uint_t>(m_values, this->m_shape, patternSubtensor); + } + + protected: + real_t* m_values{nullptr}; + DenseTensorView m_pattern; + std::size_t m_size{0}; + }; +} // namespace yateto #endif diff --git a/yateto/aspp.py b/yateto/aspp.py index c3b0b66..5ccc5b7 100644 --- a/yateto/aspp.py +++ b/yateto/aspp.py @@ -43,7 +43,7 @@ def transposed(self, shape): pass @abstractmethod - def indexSum(self, sourceIndices, targetIndices): + def indexSum(self, sourceIndices, targetIndices, fixedIndices): pass @abstractmethod @@ -74,7 +74,8 @@ def reshape(self, shape): def transposed(self, perm): return type(self)(tuple(self.shape[p] for p in perm)) - def indexSum(self, sourceIndices, targetIndices): + def indexSum(self, sourceIndices, targetIndices, fixedIndices): + # silently assume that targetIndices and fixedIndices don't overlap return type(self)(tuple(self.shape[sourceIndices.find(targetIndex)] for targetIndex in targetIndices)) @staticmethod @@ -160,8 +161,11 @@ def reshape(self, shape): def transposed(self, perm): return type(self)(self.pattern.transpose(perm).copy(order=self.NUMPY_DEFAULT_ORDER)) - def indexSum(self, sourceIndices, targetIndices): - return general(np.einsum('{}->{}'.format(sourceIndices, targetIndices), self.pattern)) + def indexSum(self, sourceIndices, targetIndices, fixedIndices): + # silently assume that targetIndices and fixedIndices don't overlap + selector = tuple(fixedIndices[idx] if idx in fixedIndices else slice(None) for idx in sourceIndices) + einsumsource = ''.join('' if idx in fixedIndices else idx for idx in sourceIndices) + return general(np.einsum('{}->{}'.format(einsumsource, targetIndices), self.pattern[selector])) @staticmethod def add(a1, a2): diff --git a/yateto/ast/node.py b/yateto/ast/node.py index 93da011..78fc58b 100644 --- a/yateto/ast/node.py +++ b/yateto/ast/node.py @@ -335,7 +335,7 @@ def sumIndex(self): def computeSparsityPattern(self, *spps): assert len(spps) <= 1 spp = spps[0] if len(spps) == 1 else self.term().eqspp() - return spp.indexSum(self.term().indices, self.indices) + return spp.indexSum(self.term().indices, self.indices, {}) class Contraction(BinOp): def __init__(self, indices, lTerm, rTerm, sumIndices): diff --git a/yateto/codegen/common.py b/yateto/codegen/common.py index e2b8966..0b80e75 100644 --- a/yateto/codegen/common.py +++ b/yateto/codegen/common.py @@ -66,20 +66,36 @@ def fromVar(cls, var, indices): addressing = None # var.tensor.addressing return cls(str(var), indices, var.memoryLayout(), var.eqspp(), is_const, var.is_temporary, values, datatype, addressing) -def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', indexNo=None): +def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', fixed={}, indexNo=None): flops = 0 + firstLoop = False if indexNo == None: indexNo = len(indexNames)-1 + firstLoop = True if indexNo < 0: - flops = body() + if firstLoop: + with cpp.AnonymousScope(): + flops = body() + else: + flops = body() else: index = indexNames[indexNo] rng = ranges[index] - if pragmaSimd and indexNo == 0: + if pragmaSimd: cpp('#pragma omp simd') - with cpp.For('int {3}{0} = {1}; {3}{0} < {2}; ++{3}{0}'.format(index, rng.start, rng.stop, prefix)): - flops = forLoops(cpp, indexNames, ranges, body, pragmaSimd, prefix, indexNo-1) - flops = flops * rng.size() + if index in fixed: + value = fixed[index] + if value >= rng.start and value < rng.stop: + with cpp.AnonymousScope(): + cpp(f'constexpr int {prefix}{index} = {value};') + flops = forLoops(cpp, indexNames, ranges, body, pragmaSimd, prefix, fixed, indexNo-1) + else: + # out of range + flops = 0 + else: + with cpp.For('int {3}{0} = {1}; {3}{0} < {2}; ++{3}{0}'.format(index, rng.start, rng.stop, prefix)): + flops = forLoops(cpp, indexNames, ranges, body, False, prefix, fixed, indexNo-1) + flops = flops * rng.size() return flops def loopRanges(term: IndexedTensorDescription, loopIndices): @@ -98,8 +114,8 @@ def testLoopRangesAContainedInB(A, B): def boundingBoxFromLoopRanges(indices, loopRanges): return BoundingBox([loopRanges[index] for index in indices]) -def reduceSpp(spp, sourceIndices, targetIndices): - return spp.indexSum(sourceIndices, targetIndices) +def reduceSpp(spp, sourceIndices, targetIndices, fixedIndices): + return spp.indexSum(sourceIndices, targetIndices, fixedIndices) def initializeWithZero(cpp, arch, result: TensorDescription, writeBB = None): if writeBB: diff --git a/yateto/codegen/gemm/factory.py b/yateto/codegen/gemm/factory.py index 595e0c0..50fc3e5 100644 --- a/yateto/codegen/gemm/factory.py +++ b/yateto/codegen/gemm/factory.py @@ -1,5 +1,4 @@ from ...ast.indices import BoundingBox, Range -from ...memory import CSCMemoryLayout from ..common import TensorDescription from .generic import Generic from .gemmgen import GemmGen @@ -28,8 +27,8 @@ def __init__(self, self.beta = beta self.prefetchName = prefetchName - self.isACsc = isinstance(self.leftTerm.memoryLayout, CSCMemoryLayout) - self.isBCsc = isinstance(self.rightTerm.memoryLayout, CSCMemoryLayout) + self.isACsc = self.leftTerm.memoryLayout.isSparse() + self.isBCsc = self.rightTerm.memoryLayout.isSparse() bbA = BoundingBox.fromSpp(self.leftTerm.eqspp) bbB = BoundingBox.fromSpp(self.rightTerm.eqspp) diff --git a/yateto/codegen/log/generic.py b/yateto/codegen/log/generic.py index 68c2e50..5948980 100644 --- a/yateto/codegen/log/generic.py +++ b/yateto/codegen/log/generic.py @@ -9,40 +9,46 @@ def __init__(self, arch, descr, target): self._descr = descr self._target = target - def _pointer(self, cpp, targetName, baseName, term, loopIndices, const=True): + def _pointer(self, cpp, targetName, baseName, term, loopIndices, fixed, const=True): indices = term.indices & loopIndices - addressStr = term.memoryLayout.addressString(term.indices, indices) if len(indices) > 0 else '' + addressStr = term.memoryLayout.addressString(term.indices, indices, fixed) if len(indices) > 0 else '' if len(addressStr) > 0: addressStr = ' + ' + addressStr cpp('{} {}* {} = {}{};'.format(self._arch.typename, 'const' if const else '', targetName, baseName, addressStr)) - def _alignedStart(self, term, loopIndices): + def _alignedStart(self, term, loopIndices, fixed): if len(loopIndices) == 0: return True - return term.memoryLayout.isAlignedAddressString(term.indices, term.indices & loopIndices) + return term.memoryLayout.isAlignedAddressString(term.indices, term.indices & loopIndices, fixed) - def _memLayout(self, term, I, J): + def _memLayout(self, term, I, J, fixed): if len(I) == 0 and len(J) == 0: return DenseMemoryLayout((1,1)) elif len(I) == 0: - ml = term.memoryLayout.vec(term.indices, J) + ml = term.memoryLayout.vec(term.indices, J, fixed) return ml.withDummyDimension() elif len(J) == 0: - ml = term.memoryLayout.vec(term.indices, I) + ml = term.memoryLayout.vec(term.indices, I, fixed) return ml.withDummyDimension() elif len(term.indices) == 2: return term.memoryLayout - return term.memoryLayout.unfold(term.indices, I, J) + return term.memoryLayout.unfold(term.indices, I, J, fixed) + + def _unroll(self, term, I): + if not term.memoryLayout.isSparse(): + return set() + + return I - def _reduce(self, term, subset, memLayout): - return reduceSpp(term.eqspp, term.indices, subset).reshape(memLayout.shape()) + def _reduce(self, term, subset, memLayout, fixed): + return reduceSpp(term.eqspp, term.indices, subset, fixed).reshape(memLayout.shape()) def _defuse(self, fusedRange, term, I): if len(I) == 1: return {next(iter(I)): fusedRange} return term.memoryLayout.defuse(fusedRange, term.indices, I) - def generate(self, cpp, routineCache, gemm_cfg): + def _generateSingle(self, cpp, routineCache, gemm_cfg, fixed = {}): d = self._descr A = d.leftTerm.indices - d.loopIndices @@ -69,15 +75,15 @@ def generate(self, cpp, routineCache, gemm_cfg): innerCname = '_Cin' if hasInnerLoops else outerCname innerPrefetchName = '_Cprefetchin' if hasInnerLoops and outerPrefetchName is not None else outerPrefetchName - alignedStartA = not hasOuterLoops or self._alignedStart(d.leftTerm, d.outerLoopIndices) + alignedStartA = not hasOuterLoops or self._alignedStart(d.leftTerm, d.outerLoopIndices, fixed) - AmemLayout = self._memLayout(d.leftTerm, Im, Ik) - BmemLayout = self._memLayout(d.rightTerm, Ik, In) - CmemLayout = self._memLayout(d.result, Im, In) + AmemLayout = self._memLayout(d.leftTerm, Im, Ik, fixed) + BmemLayout = self._memLayout(d.rightTerm, Ik, In, fixed) + CmemLayout = self._memLayout(d.result, Im, In, fixed) - Aeqspp = self._reduce(d.leftTerm, A, AmemLayout) - Beqspp = self._reduce(d.rightTerm, B, BmemLayout) - Ceqspp = self._reduce(d.result, C, CmemLayout) + Aeqspp = self._reduce(d.leftTerm, A, AmemLayout, fixed) + Beqspp = self._reduce(d.rightTerm, B, BmemLayout, fixed) + Ceqspp = self._reduce(d.result, C, CmemLayout, fixed) gemmDescr = gemm.Description( leftTerm = TensorDescription(innerAname, AmemLayout, Aeqspp, d.leftTerm.is_compute_constant, d.leftTerm.is_temporary), @@ -88,8 +94,8 @@ def generate(self, cpp, routineCache, gemm_cfg): alpha = d.alpha, beta = 1.0 if d.add else 0.0, arch = self._arch, - alignedStartA = self._alignedStart(d.leftTerm, d.outerLoopIndices) and self._alignedStart(d.leftTerm, d.innerLoopIndices), - alignedStartC = self._alignedStart(d.result, d.outerLoopIndices) and self._alignedStart(d.result, d.innerLoopIndices), + alignedStartA = self._alignedStart(d.leftTerm, d.outerLoopIndices, fixed) and self._alignedStart(d.leftTerm, d.innerLoopIndices, fixed), + alignedStartC = self._alignedStart(d.result, d.outerLoopIndices, fixed) and self._alignedStart(d.result, d.innerLoopIndices, fixed), prefetchName = innerPrefetchName ) @@ -105,11 +111,11 @@ def generate(self, cpp, routineCache, gemm_cfg): class LoGBody(object): def __call__(s): if hasInnerLoops: - self._pointer(cpp, innerAname, outerAname, d.leftTerm, d.innerLoopIndices) - self._pointer(cpp, innerBname, outerBname, d.rightTerm, d.innerLoopIndices) - self._pointer(cpp, innerCname, outerCname, d.result, d.innerLoopIndices, const=False) + self._pointer(cpp, innerAname, outerAname, d.leftTerm, d.innerLoopIndices, fixed) + self._pointer(cpp, innerBname, outerBname, d.rightTerm, d.innerLoopIndices, fixed) + self._pointer(cpp, innerCname, outerCname, d.result, d.innerLoopIndices, fixed, const=False) if outerPrefetchName is not None: - self._pointer(cpp, innerPrefetchName, outerPrefetchName, d.result, d.innerLoopIndices) + self._pointer(cpp, innerPrefetchName, outerPrefetchName, d.result, d.innerLoopIndices, fixed) generator = gemm.generator(self._arch, gemmDescr, gemm_cfg, self._target) return generator.generate(cpp, routineCache) @@ -117,18 +123,60 @@ class InnerLoopBody(object): def __call__(s): flops = 0 if hasOuterLoops: - self._pointer(cpp, outerAname, d.leftTerm.name, d.leftTerm, d.outerLoopIndices) - self._pointer(cpp, outerBname, d.rightTerm.name, d.rightTerm, d.outerLoopIndices) - self._pointer(cpp, outerCname, d.result.name, d.result, d.outerLoopIndices, const=False) + self._pointer(cpp, outerAname, d.leftTerm.name, d.leftTerm, d.outerLoopIndices, fixed) + self._pointer(cpp, outerBname, d.rightTerm.name, d.rightTerm, d.outerLoopIndices, fixed) + self._pointer(cpp, outerCname, d.result.name, d.result, d.outerLoopIndices, fixed, const=False) if d.prefetchName is not None: - self._pointer(cpp, outerPrefetchName, d.prefetchName, d.result, d.outerLoopIndices) + self._pointer(cpp, outerPrefetchName, d.prefetchName, d.result, d.outerLoopIndices, fixed) + if d.assignLoopRanges is not None: gemmDescr.setBeta(0.0) - flops += forLoops(cpp, d.innerLoopIndices, d.assignLoopRanges, LoGBody(), pragmaSimd=False) + flops += forLoops(cpp, d.innerLoopIndices, d.assignLoopRanges, LoGBody(), pragmaSimd=False, fixed=fixed) if d.addLoopRanges is not None: gemmDescr.setBeta(1.0) - flops += forLoops(cpp, d.innerLoopIndices, d.addLoopRanges, LoGBody(), pragmaSimd=False) + flops += forLoops(cpp, d.innerLoopIndices, d.addLoopRanges, LoGBody(), pragmaSimd=False, fixed=fixed) return flops - return forLoops(cpp, d.outerLoopIndices, d.loopRanges, InnerLoopBody(), pragmaSimd=False) + return forLoops(cpp, d.outerLoopIndices, d.loopRanges, InnerLoopBody(), pragmaSimd=False, fixed=fixed) + + def _generateUnroll(self, cpp, routineCache, gemm_cfg, fixed, unroll): + d = self._descr + + if len(unroll) == 0: + return self._generateSingle(cpp, routineCache, gemm_cfg, fixed) + + unrollNow = unroll[0] + + rngNow = d.loopRanges[unrollNow] + + result = 0 + for i in range(rngNow.start, rngNow.stop): + fixedNow = dict(fixed) + fixedNow[unrollNow] = i + result += self._generateUnroll(cpp, routineCache, gemm_cfg, fixedNow, unroll[1:]) + + return result + + def generate(self, cpp, routineCache, gemm_cfg): + d = self._descr + + A = d.leftTerm.indices - d.loopIndices + B = d.rightTerm.indices - d.loopIndices + C = d.result.indices - d.loopIndices + Im = set(A) & set(C) + In = set(B) & set(C) + Ik = set(A) & set(B) + + toBeUnrolled = d.loopRanges.keys() + + unrollNeeded = set() + if d.leftTerm.memoryLayout.isSparse(): + unrollNeeded |= set(d.leftTerm.indices) + if d.rightTerm.memoryLayout.isSparse(): + unrollNeeded |= set(d.rightTerm.indices) + if d.result.memoryLayout.isSparse(): + unrollNeeded |= set(d.result.indices) + + toBeUnrolled &= unrollNeeded + return self._generateUnroll(cpp, routineCache, gemm_cfg, {}, list(toBeUnrolled)) diff --git a/yateto/codegen/product/factory.py b/yateto/codegen/product/factory.py index 8a85366..3871277 100644 --- a/yateto/codegen/product/factory.py +++ b/yateto/codegen/product/factory.py @@ -10,8 +10,8 @@ def __init__(self, alpha, add: bool, result: IndexedTensorDescription, leftTerm: self.leftTerm = leftTerm self.rightTerm = rightTerm - self.isACsc = isinstance(self.leftTerm.memoryLayout, CSCMemoryLayout) - self.isBCsc = isinstance(self.rightTerm.memoryLayout, CSCMemoryLayout) + self.isACsc = self.leftTerm.memoryLayout.isSparse() + self.isBCsc = self.rightTerm.memoryLayout.isSparse() rA = loopRanges(self.leftTerm, self.result.indices) rB = loopRanges(self.rightTerm, self.result.indices) diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py index d322bd3..2cf384e 100644 --- a/yateto/codegen/visitor.py +++ b/yateto/codegen/visitor.py @@ -12,6 +12,8 @@ from .common import BatchedOperationsAux from ..type import Scalar +import numpy as np + SUPPORT_LIBRARY_NAMESPACE = 'yateto' CONSTEXPR = 'constexpr' STATIC = 'static' @@ -646,6 +648,8 @@ def generate(cpp, group, memLayout): raise NotImplementedError def listToInitializerList(self, lst): + if isinstance(lst, np.ndarray): + lst = lst.flatten() return '{{{}}}'.format(', '.join([str(l) for l in lst])) def formatArray(self, numberType, name, values, declarationOnly): @@ -692,6 +696,24 @@ def arrays(self, cpp, memLayout, arch, namespace, index, numberType, declaration cpp(self.formatArray(numberType, namespace + self.ROWIND_NAME + index, memLayout.rowIndex(), declarationOnly)) cpp(self.formatArray(numberType, namespace + self.COLPTR_NAME + index, memLayout.colPointer(), declarationOnly)) + class PatternMatrixView(TensorView): + PATTERN_NAME = 'Pattern' + + def typename(self, dim, arch): + return f'::{SUPPORT_LIBRARY_NAMESPACE}::{type(self).__name__}<{dim}, {arch.typename},{arch.uintTypename}>' + + def generate(self, cpp, memLayout, arch, index): + cpp( 'return {}({}, {}, {});'.format( + self.typename(len(memLayout.shape()), arch), + self.ARGUMENT_NAME, + self.listToInitializerList(memLayout.shape()), + self.PATTERN_NAME + (index if index is not None else '') + ) + ) + + def arrays(self, cpp, memLayout, arch, namespace, index, numberType, declarationOnly): + cpp(self.formatArray(numberType, namespace + self.PATTERN_NAME + index, memLayout.pattern(), declarationOnly)) + def __init__(self, arch, tensors, scalars): self._arch = arch self._numberType = '{} const'.format(self._arch.uintTypename) @@ -734,7 +756,8 @@ def __init__(self, arch, tensors, scalars): def _tensorViewGenerator(self, memoryLayout): memLayoutMap = { 'DenseMemoryLayout': self.DenseTensorView, - 'CSCMemoryLayout': self.CSCMatrixView + 'CSCMemoryLayout': self.CSCMatrixView, + 'PatternMemoryLayout': self.PatternMatrixView } return memLayoutMap[type(memoryLayout).__name__]() diff --git a/yateto/memory.py b/yateto/memory.py index 9a689c8..dbefe19 100644 --- a/yateto/memory.py +++ b/yateto/memory.py @@ -5,6 +5,8 @@ import numpy as np from abc import ABC, abstractmethod +from . import aspp + class MemoryLayout(ABC): def __init__(self, shape): self._shape = shape @@ -48,6 +50,36 @@ def __eq__(self, other): def isCompatible(self, spp): pass + def _subShape(self, positions): + sub = 1 + for p in positions: + sub *= self._shape[p] + return sub + + def defuse(self, fusedRange, indices, I): + positions = indices.positions(I) + s = self._subShape(positions) + ranges = dict() + start = fusedRange.start + stop = fusedRange.stop-1 + for p in reversed(positions): + s //= self._shape[p] + b = start // s + B = stop // s + ranges[ indices[p] ] = Range(b, B+1) + start -= b*s + stop -= B*s + return ranges + + def notWrittenAddresses(self, writeBB): + if writeBB == self._bbox: + return [] + + assert writeBB in self._bbox + re = [range(r.start, r.stop) for r in self._bbox] + we = [range(w.start, w.stop) for w in writeBB] + return [self.address(e) for e in set(itertools.product(*re)) - set(itertools.product(*we)) if self.hasValue(e)] + class DenseMemoryLayout(MemoryLayout): ALIGNMENT_ARCH = None @@ -147,7 +179,7 @@ def requiredReals(self): size = self._bbox[-1].size() * self._stride[-1] return size - def addressString(self, indices, I = None, prefix='_'): + def addressString(self, indices, I = None, Z = None, prefix='_'): if len(self._bbox) == 0: return '0' if I is None: @@ -161,7 +193,7 @@ def addressString(self, indices, I = None, prefix='_'): a.append('{}*{}{}'.format(self._stride[p], prefix, indices[p])) return ' + '.join(a) - def isAlignedAddressString(self, indices, I = None): + def isAlignedAddressString(self, indices, I = None, Z = None): if I is None: I = set(indices) positions = indices.positions(I) @@ -173,12 +205,6 @@ def isAlignedAddressString(self, indices, I = None): def mayFuse(self, positions): return all( [self._stride[j] == self._shape[i]*self._stride[i] for i,j in zip(positions[:-1], positions[1:])] ) - def _subShape(self, positions): - sub = 1 - for p in positions: - sub *= self._shape[p] - return sub - def _subRange(self, positions): start = 0 stop = 0 @@ -192,7 +218,7 @@ def _subRange(self, positions): def _firstStride(self, positions): return self._stride[ positions[0] ] - def vec(self, indices, I): + def vec(self, indices, I, Z): positionsI = indices.positions(I) assert self.mayFuse( indices.positions(I) ) @@ -208,7 +234,7 @@ def withDummyDimension(self): stride = self._stride + (self._bbox[-1].size() * self._stride[-1],) return DenseMemoryLayout(shape, bbox, stride) - def unfold(self, indices, I, J): + def unfold(self, indices, I, J, Z): positionsI = indices.positions(I) positionsJ = indices.positions(J) assert self.mayFuse( indices.positions(I) ) and self.mayFuse( indices.positions(J) ) @@ -221,21 +247,6 @@ def unfold(self, indices, I, J): stride = (self._firstStride(positionsI), self._firstStride(positionsJ)) return DenseMemoryLayout(shape, bbox, stride) - - def defuse(self, fusedRange, indices, I): - positions = indices.positions(I) - s = self._subShape(positions) - ranges = dict() - start = fusedRange.start - stop = fusedRange.stop-1 - for p in reversed(positions): - s //= self._shape[p] - b = start // s - B = stop // s - ranges[ indices[p] ] = Range(b, B+1) - start -= b*s - stop -= B*s - return ranges def isCompatible(self, spp): return BoundingBox.fromSpp(spp) in self.bbox() @@ -245,6 +256,13 @@ def __eq__(self, other): def __str__(self): return '{}(shape: {}, bounding box: {}, stride: {})'.format(type(self).__name__, self._shape, self._bbox, self._stride) + + def isSparse(self): + return False + + def hasValue(self, entry): + assert entry in self._bbox + return True class CSCMemoryLayout(MemoryLayout): def __init__(self, spp, alignStride=False): @@ -318,6 +336,16 @@ def address(self, entry): return start + find[0] + def hasValue(self, entry): + assert entry in self._bbox + + start = self._colPtr[ entry[1] ] + stop = self._colPtr[ entry[1]+1 ] + subRowInd = self._rowIndex[start:stop] + + find = np.where(subRowInd == entry[0])[0] + return len(find) == 1 + def subtensorOffset(self, topLeftEntry): assert topLeftEntry in self._bbox assert topLeftEntry[0] <= self._bbox[0].start @@ -358,7 +386,186 @@ def isCompatible(self, spp): def __eq__(self, other): return self._bbox == other._bbox and np.array_equal(self._rowIndex, other._rowIndex) and np.array_equal(self._colPtr, other._colPtr) + def isSparse(self): + return True + + +class PatternMemoryLayout(MemoryLayout): + def __init__(self, spp, alignStride=False, pattern=None): + super().__init__(spp.shape if spp is not None else pattern.shape) + + if spp is None: + spp = aspp.general(pattern != 0) + + self.aligned = alignStride + + self._bbox = BoundingBox.fromSpp(spp) + if self.aligned: + range0 = self._bbox[0] + rnew = Range( DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(range0.start), DenseMemoryLayout.ALIGNMENT_ARCH.alignedUpper(range0.stop) ) + self._bbox = BoundingBox([rnew] + self._bbox[1:]) + + nonzeros = spp.nonzero() + nonzeros = sorted(zip(*nonzeros), key=lambda x: x[::-1]) + + if self.aligned: + nonzeros_pre = set(nonzeros) + for nonzero in nonzeros: + lower = DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(nonzero[0]) + # no alignedUpper call here: avoid reduction to a single element when on alignment boundaries + upper = lower + DenseMemoryLayout.ALIGNMENT_ARCH.alignedReals + + for i in range(lower, upper): + nonzeros_pre.add(tuple([np.int64(i)] + list(nonzero[1:]))) + + nonzeros = list(nonzeros_pre) + nonzeros = sorted(zip(*[[nonzero[i] for nonzero in nonzeros] for i in range(len(self._shape))]), key=lambda x: x[::-1]) + + self._pattern = np.zeros(self._shape, dtype=int, order='F') + + for i, nonzero in enumerate(nonzeros): + self._pattern[tuple(nonzero)] = i + 1 if pattern is None else pattern[tuple(nonzero)] + + self._nonzeros = nonzeros + + # TODO: self._next = np.zeros(self._shape, dtype=int, order='F') + # point to the top-left entry + + def requiredReals(self): + return len(self._nonzeros) + + def isSparse(self): + return True + + def bbox(self): + return self._bbox + + def bboxi(self, dim): + return self._bbox[dim] + + def hasValue(self, entry): + return self._pattern[tuple(entry)] > 0 + + def address(self, entry): + assert entry in self._bbox + assert self._pattern[tuple(entry)] > 0 + + return self._pattern[tuple(entry)] - 1 + + def subtensorOffset(self, topLeftEntry): + assert topLeftEntry in self._bbox + + return 0 + + #assert self._next[tuple(topLeftEntry)] > 0 + + #return self._next[tuple(topLeftEntry)] - 1 + + def entries(self, *rng): + return [tuple(e - r.start for e,r in zip(ex, rng)) for ex in self._nonzeros if + all(e >= r.start and e < r.stop for e,r in zip(ex, rng))] + + def alignedStride(self): + return self.aligned + + def mayVectorizeDim(self, dim): + return dim == 0 and self.aligned + + def pattern(self): + return self._pattern + + @classmethod + def fromSpp(cls, spp, **kwargs): + return PatternMemoryLayout(spp, **kwargs) + + def __contains__(self, entry): + return entry in self._bbox + + def isCompatible(self, spp): + comp = self.fromSpp(spp, alignStride=self.aligned) + + bboxOk = comp._bbox in self._bbox + sppOk = set(comp.entries(*comp._bbox)).issubset(set(self.entries(*comp._bbox))) + + return bboxOk and sppOk + + def vec(self, indices, I, Z): + positionsI = indices.positions(I) + positionsZ = indices.positions(Z) + + # I and Z need to partition perfectly + + error = lambda: None + selector = [error for _ in range(len(self._shape))] + + for p, z in zip(positionsZ, Z.values()): + selector[p] = z + for p in positionsI: + selector[p] = slice(None) + + pattern = self._pattern[tuple(selector)].transpose(positionsI).flatten() + + return PatternMemoryLayout(None, alignStride=self.aligned, pattern=pattern) + + def withDummyDimension(self): + pattern = np.expand_dims(self._pattern, -1) + return PatternMemoryLayout(None, alignStride=self.aligned, pattern=pattern) + + def unfold(self, indices, I, J, Z): + positionsI = indices.positions(I) + positionsJ = indices.positions(J) + positionsZ = indices.positions(Z) + + if positionsI[0] > positionsJ[0]: + positionsJ, positionsI = positionsI, positionsJ + + error = lambda: None + selector = [error for _ in range(len(self._shape))] + dimmap = [error for _ in range(len(self._shape))] + + i = 0 + sizeI = 1 + sizeJ = 1 + for p in positionsI: + selector[i] = slice(None) + dimmap[p] = i + sizeI *= self._pattern.shape[p] + i += 1 + for p in positionsJ: + selector[i] = slice(None) + dimmap[p] = i + sizeJ *= self._pattern.shape[p] + i += 1 + for p, z in zip(positionsZ, Z.values()): + selector[i] = z + dimmap[p] = i + i += 1 + + pattern = self._pattern.transpose(dimmap)[tuple(selector)].reshape((sizeI, sizeJ)) + + return PatternMemoryLayout(None, alignStride=self.aligned, pattern=pattern) + + def addressString(self, indices, I = None, Z = None, prefix='_'): + # handled differently; via unrolling + return '' + + def isAlignedAddressString(self, indices, I = None, Z = None): + # TODO + return self.aligned + + def mayFuse(self, positions): + # we can always generate a new pattern + return True + + def __eq__(self, other): + return self._bbox == other._bbox and np.array_equal(self._pattern, other._pattern) + class AlignedCSCMemoryLayout: @classmethod def fromSpp(cls, spp, **kwargs): return CSCMemoryLayout(spp, alignStride=True) + +class AlignedPatternMemoryLayout: + @classmethod + def fromSpp(cls, spp, **kwargs): + return PatternMemoryLayout(spp, alignStride=True) From 28f5854414c8fccf4b9569b420ca07e2cc58cfdc Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 2 Dec 2025 16:59:08 +0100 Subject: [PATCH 04/15] Default-initialize some arrays --- include/yateto/TensorView.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h index f15c438..5948699 100644 --- a/include/yateto/TensorView.h +++ b/include/yateto/TensorView.h @@ -47,7 +47,7 @@ namespace yateto { } protected: - uint_t m_shape[Dim]; + uint_t m_shape[Dim]{}; }; template @@ -113,7 +113,7 @@ namespace yateto { template void forall(F&& function) { - uint_t entry[Dim]; + uint_t entry[Dim]{}; std::copy(m_start, m_start + Dim, entry); while (entry[Dim-1] != m_stop[Dim-1]) { auto values = &operator[](entry); @@ -136,7 +136,7 @@ namespace yateto { template void forall(F&& function) const { - uint_t entry[Dim]; + uint_t entry[Dim]{}; std::copy(m_start, m_start + Dim, entry); while (entry[Dim-1] != m_stop[Dim-1]) { auto values = &operator[](entry); @@ -158,7 +158,7 @@ namespace yateto { } void setZero() { - uint_t entry[Dim]; + uint_t entry[Dim]{}; std::copy(m_start, m_start + Dim, entry); while (entry[Dim-1] != m_stop[Dim-1]) { auto values = &operator[](entry); @@ -235,7 +235,7 @@ namespace yateto { void copyToView(view_t& other) const { assert(Dim == other.dim()); - uint_t entry[Dim]; + uint_t entry[Dim]{}; for (uint_t d = 0; d < Dim; ++d) { assert(this->shape(d) == other.shape(d)); entry[d] = m_start[d]; @@ -267,9 +267,9 @@ namespace yateto { auto subtensor(Entry... entry) -> DenseTensorView::value, real_t, uint_t> const { static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); constexpr auto nSlices = count_slices::value; - uint_t begin[Dim]; - uint_t size[nSlices]; - uint_t stride[nSlices]; + uint_t begin[Dim]{}; + uint_t size[nSlices]{}; + uint_t stride[nSlices]{}; extractSubtensor(begin, size, stride, entry...); DenseTensorView subtensor(&operator[](begin), size, stride); return subtensor; @@ -331,9 +331,9 @@ namespace yateto { } real_t* m_values; - uint_t m_start[Dim]; - uint_t m_stop[Dim]; - uint_t m_stride[Dim]; + uint_t m_start[Dim]{}; + uint_t m_stop[Dim]{}; + uint_t m_stride[Dim]{}; }; template From e78a05e3bf166c8f8393a3f5e72e40e8bd5e01fc Mon Sep 17 00:00:00 2001 From: David Schneller Date: Wed, 3 Dec 2025 05:04:57 +0100 Subject: [PATCH 05/15] Adjust subtensor offset --- yateto/memory.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yateto/memory.py b/yateto/memory.py index dbefe19..7093af0 100644 --- a/yateto/memory.py +++ b/yateto/memory.py @@ -453,9 +453,13 @@ def address(self, entry): return self._pattern[tuple(entry)] - 1 def subtensorOffset(self, topLeftEntry): + tle = topLeftEntry assert topLeftEntry in self._bbox + + subpat = [self._pattern[tle] for ex in self._nonzeros if + all(e >= tle[i] for i,e in enumerate(ex))] - return 0 + return subpat[0] - 1 if len(subpat) > 0 else 0 #assert self._next[tuple(topLeftEntry)] > 0 From b5d10a500f797acb9a8f4124e043e14504c099af Mon Sep 17 00:00:00 2001 From: David Schneller Date: Mon, 22 Dec 2025 20:58:56 +0100 Subject: [PATCH 06/15] Sparse CSA; initial tests --- .github/workflows/yateto-ci.yml | 2 +- tests/code-gen/sparsity.py | 37 ++++++++++++++++++++++ yateto/codegen/copyscaleadd/generic.py | 44 ++++++++++++++++++++++---- 3 files changed, 75 insertions(+), 8 deletions(-) create mode 100644 tests/code-gen/sparsity.py diff --git a/.github/workflows/yateto-ci.yml b/.github/workflows/yateto-ci.yml index b637aba..dfe0bbc 100644 --- a/.github/workflows/yateto-ci.yml +++ b/.github/workflows/yateto-ci.yml @@ -73,7 +73,7 @@ jobs: - name: Codegen Tests run: | cd ./tests/code-gen - for example in matmul minimal indices slicing; do + for example in matmul minimal indices slicing sparsity; do for build_type in Debug Release; do for precision in single double; do echo " ====== Test Config: ======" diff --git a/tests/code-gen/sparsity.py b/tests/code-gen/sparsity.py new file mode 100644 index 0000000..4e998c5 --- /dev/null +++ b/tests/code-gen/sparsity.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +from yateto import * + +from yateto.memory import * + +import numpy as np + +def add(g): + N = 4 + A = Tensor('A', (N,), spp=np.array([1,0,0,1]), memoryLayoutClass=PatternMemoryLayout) + B = Tensor('B', (N,), spp=np.array([1,0,1,0]), memoryLayoutClass=PatternMemoryLayout) + D = Tensor('D', (N,), spp=np.ones((N,)), memoryLayoutClass=PatternMemoryLayout) + B2 = Tensor('B2', (N, N), spp=np.ones((N, N)), memoryLayoutClass=PatternMemoryLayout) + B3 = Tensor('B3', (N, N), spp=np.ones((N, N)), memoryLayoutClass=CSCMemoryLayout) + Z = Tensor('Z', (N, N, N), spp=np.ones((N, N, N)), memoryLayoutClass=PatternMemoryLayout) + C = Tensor('C', (N, N)) + C2 = Tensor('C2', (N, N, N)) + C3 = Tensor('C3', (N, N, N, N, N, N)) + + class Counter: + def __init__(self): + self.counter = 0 + + counter = Counter() + + def _(kernel): + counter.counter += 1 + g.add(f'kernel{counter.counter}', kernel) + + _(C['ab'] <= A['a']) + _(C['ab'] <= A['a'] + B['a']) + _(C['ab'] <= A['a'] + B['b']) + _(C['ab'] <= A['a'] * B['b']) + _(C2['abc'] <= A['a'] + B['b']) + _(C['ij'] <= B2['ik'] * B3['kj']) + _(C2['zij'] <= B2['ik'] * Z['kjz']) diff --git a/yateto/codegen/copyscaleadd/generic.py b/yateto/codegen/copyscaleadd/generic.py index 44bf704..a857cd3 100644 --- a/yateto/codegen/copyscaleadd/generic.py +++ b/yateto/codegen/copyscaleadd/generic.py @@ -5,24 +5,40 @@ def __init__(self, arch, descr): self._arch = arch self._descr = descr - def _formatTerm(self, alpha, term): + def _formatTerm(self, alpha, term, entry): prefix = '' if alpha == 0.0: return '' if alpha == 1.0: prefix = term.name else: - prefix = '{} * {}'.format(alpha, term.name) - return '{}[{}]'.format(prefix, term.memoryLayout.addressString(term.indices)) + prefix = f'{alpha} * {term.name}' + + if entry is None: + return f'{prefix}[{term.memoryLayout.addressString(term.indices)}]' + else: + if term.memoryLayout.hasValue(entry): + return f'{prefix}[{term.memoryLayout.address(entry)}]' + else: + # needed for some temporaries + return self._arch.formatConstant(0.0) def generate(self, cpp, routineCache): d = self._descr if d.beta == 0.0: - writeBB = boundingBoxFromLoopRanges(d.result.indices, d.loopRanges) - initializeWithZero(cpp, self._arch, d.result, writeBB) + if d.term.memoryLayout.isSparse(): + initializeWithZero(cpp, self._arch, d.result) + else: + writeBB = boundingBoxFromLoopRanges(d.result.indices, d.loopRanges) + initializeWithZero(cpp, self._arch, d.result, writeBB) + class CopyScaleAddBody(object): + def __init__(self, resultEntry, termEntry): + self.resultEntry = resultEntry + self.termEntry = termEntry + def __call__(s): op = '=' flop = 0 @@ -38,8 +54,22 @@ def __call__(s): flop += 1 elif d.beta != 0.0: raise NotImplementedError - cpp( '{} {} {};'.format(self._formatTerm(1.0, d.result), op, self._formatTerm(alpha, d.term)) ) + cpp( f'{self._formatTerm(1.0, d.result, s.resultEntry)} {op} {self._formatTerm(alpha, d.term, s.termEntry)};' ) return flop - return forLoops(cpp, d.result.indices, d.loopRanges, CopyScaleAddBody()) + if d.term.memoryLayout.isSparse(): + + indexmap = d.result.indices.positions(d.term.indices, sort=False) + + flops = 0 + nonzeros = d.result.eqspp.nonzero() + for entryR in sorted(zip(*nonzeros), key=lambda x: x[::-1]): + entry = tuple(entryR[ pos ] for pos in indexmap) + flops += CopyScaleAddBody(entryR, entry)() + + return flops + + else: + + return forLoops(cpp, d.result.indices, d.loopRanges, CopyScaleAddBody(None, None)) From 68997fa5713e0a46e944e28a5de8f8897f5c4184 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 23 Dec 2025 00:31:06 +0100 Subject: [PATCH 07/15] Fix argument count --- yateto/memory.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/yateto/memory.py b/yateto/memory.py index d095f68..84e91ec 100644 --- a/yateto/memory.py +++ b/yateto/memory.py @@ -290,6 +290,9 @@ def storage(self): def alignmentOffset(self, dim): return 0 + + def equalStride(self, dim): + return True class CSCMemoryLayout(MemoryLayout): def __init__(self, spp, alignStride=False): @@ -352,7 +355,7 @@ def rowIndex(self): def colPointer(self): return self._colPtr - def isAlignedAddressString(self, indices, I = None): + def isAlignedAddressString(self, indices, I = None, Z = None): if I is None: I = set(indices) positions = indices.positions(I) @@ -434,6 +437,9 @@ def alignmentOffset(self, dim): def isSparse(self): return True + + def equalStride(self, dim): + return False class PatternMemoryLayout(MemoryLayout): @@ -609,6 +615,9 @@ def mayFuse(self, positions): def __eq__(self, other): return self._bbox == other._bbox and np.array_equal(self._pattern, other._pattern) + + def equalStride(self, dim): + return False class AlignedCSCMemoryLayout: @classmethod @@ -678,14 +687,14 @@ def isCompatible(self, spp): def mayVectorizeDim(self, dim): return self.base.mayVectorizeDim(dim) - def isAlignedAddressString(self, indices, I = None): - return self.base.isAlignedAddressString(indices, I) + def isAlignedAddressString(self, indices, I = None, Z = None): + return self.base.isAlignedAddressString(indices, I, Z) - def addressString(self, indices, I = None, prefix='_', offsets=()): + def addressString(self, indices, I = None, Z = None, prefix='_', offsets=()): if len(offsets) == 0: offsets = [0] * len(self._shape) newOffsets = tuple(offsets[i] if self.index != i else offsets[i] + self.start for i in range(len(self._shape))) - return self.base.addressString(indices, I, prefix, newOffsets) + return self.base.addressString(indices, I, Z, prefix, newOffsets) def subslice(self, index, start, end): return MemoryLayoutView(self, index, start, end) From d5925906cfb0b392d50f7c12e3bce55ac36c27fb Mon Sep 17 00:00:00 2001 From: David Schneller Date: Mon, 5 Jan 2026 14:57:42 +0100 Subject: [PATCH 08/15] Update tests --- tests/code-gen/sparsity.py | 16 +++++++++++++++- yateto/memory.py | 20 +++++++++++--------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/tests/code-gen/sparsity.py b/tests/code-gen/sparsity.py index 4e998c5..8366e14 100644 --- a/tests/code-gen/sparsity.py +++ b/tests/code-gen/sparsity.py @@ -6,14 +6,25 @@ import numpy as np +def checkerboard(shape): + # cf. https://stackoverflow.com/a/51715491 + return np.indices(tuple(shape)).sum(axis=0) % 2 + +def invert(pattern): + return 1 - pattern + def add(g): N = 4 A = Tensor('A', (N,), spp=np.array([1,0,0,1]), memoryLayoutClass=PatternMemoryLayout) B = Tensor('B', (N,), spp=np.array([1,0,1,0]), memoryLayoutClass=PatternMemoryLayout) D = Tensor('D', (N,), spp=np.ones((N,)), memoryLayoutClass=PatternMemoryLayout) + E = Tensor('E', (N,), spp=checkerboard((N,)), memoryLayoutClass=PatternMemoryLayout) B2 = Tensor('B2', (N, N), spp=np.ones((N, N)), memoryLayoutClass=PatternMemoryLayout) B3 = Tensor('B3', (N, N), spp=np.ones((N, N)), memoryLayoutClass=CSCMemoryLayout) + B4 = Tensor('B4', (N, N), spp=checkerboard((N, N)), memoryLayoutClass=PatternMemoryLayout) + B5 = Tensor('B5', (N, N), spp=invert(checkerboard((N, N))), memoryLayoutClass=PatternMemoryLayout) Z = Tensor('Z', (N, N, N), spp=np.ones((N, N, N)), memoryLayoutClass=PatternMemoryLayout) + C0 = Tensor('C0', (N,)) C = Tensor('C', (N, N)) C2 = Tensor('C2', (N, N, N)) C3 = Tensor('C3', (N, N, N, N, N, N)) @@ -29,9 +40,12 @@ def _(kernel): g.add(f'kernel{counter.counter}', kernel) _(C['ab'] <= A['a']) - _(C['ab'] <= A['a'] + B['a']) _(C['ab'] <= A['a'] + B['b']) _(C['ab'] <= A['a'] * B['b']) _(C2['abc'] <= A['a'] + B['b']) + _(C2['abc'] <= B4['ac']) _(C['ij'] <= B2['ik'] * B3['kj']) + _(C['ij'] <= B4['ik'] * B3['kj']) + _(C['ij'] <= B4['ik'] * B3['kj'] * B4['ij']) _(C2['zij'] <= B2['ik'] * Z['kjz']) + _(C0['k'] <= B4['ik'] * A['i']) diff --git a/yateto/memory.py b/yateto/memory.py index 84e91ec..1145e35 100644 --- a/yateto/memory.py +++ b/yateto/memory.py @@ -480,9 +480,6 @@ def __init__(self, spp, alignStride=False, pattern=None): self._nonzeros = nonzeros - # TODO: self._next = np.zeros(self._shape, dtype=int, order='F') - # point to the top-left entry - def requiredReals(self): return len(self._nonzeros) @@ -508,14 +505,14 @@ def subtensorOffset(self, topLeftEntry): tle = topLeftEntry assert topLeftEntry in self._bbox - subpat = [self._pattern[tle] for ex in self._nonzeros if + subpat = [self._pattern[ex] for ex in self._nonzeros if all(e >= tle[i] for i,e in enumerate(ex))] - return subpat[0] - 1 if len(subpat) > 0 else 0 + result = subpat[0] - 1 if len(subpat) > 0 else 0 - #assert self._next[tuple(topLeftEntry)] > 0 + assert result >= 0 - #return self._next[tuple(topLeftEntry)] - 1 + return result def entries(self, *rng): return [tuple(e - r.start for e,r in zip(ex, rng)) for ex in self._nonzeros if @@ -615,9 +612,11 @@ def mayFuse(self, positions): def __eq__(self, other): return self._bbox == other._bbox and np.array_equal(self._pattern, other._pattern) - + def equalStride(self, dim): - return False + # search for: all zeros + nzp = (self._pattern != 0) + return nzp.sum(axis=dim) == nzp.max(axis=dim) * nzp.shape[dim] class AlignedCSCMemoryLayout: @classmethod @@ -775,3 +774,6 @@ def alignmentOffset(self, dim): newval = val + self.start val = newval - DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(newval) return val + + def equalStride(self, dim): + return self.base.equalStride(dim) From d379bea8b426d0086a09551151dd01ebca41fa70 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 10 Jan 2026 22:40:05 +0100 Subject: [PATCH 09/15] Fix ordering --- yateto/codegen/visitor.py | 2 +- yateto/memory.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py index 1b0c364..ad4c0d0 100644 --- a/yateto/codegen/visitor.py +++ b/yateto/codegen/visitor.py @@ -649,7 +649,7 @@ def generate(cpp, group, memLayout): def listToInitializerList(self, lst): if isinstance(lst, np.ndarray): - lst = lst.flatten() + lst = lst.flatten(order='K') return '{{{}}}'.format(', '.join([str(l) for l in lst])) def formatArray(self, numberType, name, values, declarationOnly): diff --git a/yateto/memory.py b/yateto/memory.py index 1145e35..ad88972 100644 --- a/yateto/memory.py +++ b/yateto/memory.py @@ -452,7 +452,7 @@ def __init__(self, spp, alignStride=False, pattern=None): self.aligned = alignStride self._bbox = BoundingBox.fromSpp(spp) - if self.aligned: + if alignStride: range0 = self._bbox[0] rnew = Range( DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(range0.start), DenseMemoryLayout.ALIGNMENT_ARCH.alignedUpper(range0.stop) ) self._bbox = BoundingBox([rnew] + self._bbox[1:]) @@ -460,7 +460,7 @@ def __init__(self, spp, alignStride=False, pattern=None): nonzeros = spp.nonzero() nonzeros = sorted(zip(*nonzeros), key=lambda x: x[::-1]) - if self.aligned: + if alignStride: nonzeros_pre = set(nonzeros) for nonzero in nonzeros: lower = DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(nonzero[0]) @@ -473,6 +473,7 @@ def __init__(self, spp, alignStride=False, pattern=None): nonzeros = list(nonzeros_pre) nonzeros = sorted(zip(*[[nonzero[i] for nonzero in nonzeros] for i in range(len(self._shape))]), key=lambda x: x[::-1]) + # keep everything in F order self._pattern = np.zeros(self._shape, dtype=int, order='F') for i, nonzero in enumerate(nonzeros): @@ -556,7 +557,7 @@ def vec(self, indices, I, Z): for p in positionsI: selector[p] = slice(None) - pattern = self._pattern[tuple(selector)].transpose(positionsI).flatten() + pattern = self._pattern[tuple(selector)].transpose(positionsI).flatten(order='F') return PatternMemoryLayout(None, alignStride=self.aligned, pattern=pattern) From 3af8c4568cd0482c434c69eaf70eacb9effa8994 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Mon, 1 Dec 2025 04:00:52 +0100 Subject: [PATCH 10/15] Update CMake standard for interface tests --- tests/interface/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/interface/CMakeLists.txt b/tests/interface/CMakeLists.txt index eb604d7..acadc58 100644 --- a/tests/interface/CMakeLists.txt +++ b/tests/interface/CMakeLists.txt @@ -3,6 +3,10 @@ project(inteface-test) find_package(CxxTest REQUIRED) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + enable_testing() # generate and add an interface test add_custom_command(COMMAND ${CXXTEST_PYTHON_TESTGEN_EXECUTABLE} @@ -12,5 +16,4 @@ enable_testing() add_executable(tensor-view-target TensorView.t.cpp) target_include_directories(tensor-view-target PUBLIC ${CMAKE_SOURCE_DIR}/../../include ${CXXTEST_INCLUDE_DIRS}) -target_compile_options(tensor-view-target PUBLIC "-std=c++11") -add_test(tensor-view tensor-view-target) \ No newline at end of file +add_test(tensor-view tensor-view-target) From d785aadcc16336867049b57652bad64aaa1b889f Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 10 Jan 2026 22:48:02 +0100 Subject: [PATCH 11/15] Extend interface tests --- include/yateto/TensorView.h | 30 +++++++++---- tests/interface/TensorView.t.h | 79 ++++++++++++++++++++++++++++++---- yateto/codegen/visitor.py | 4 +- 3 files changed, 94 insertions(+), 19 deletions(-) diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h index 5948699..f1be097 100644 --- a/include/yateto/TensorView.h +++ b/include/yateto/TensorView.h @@ -480,23 +480,35 @@ namespace yateto { }; template - class PatternMatrixView : public TensorView { + class PatternTensorView : public TensorView { public: // TODO: remove const_cast(pattern) - explicit PatternMatrixView(real_t* values, std::initializer_list shape, uint_t const* pattern) + explicit PatternTensorView(real_t* values, std::initializer_list shape, uint_t const* pattern) : TensorView(shape), m_values(values), m_pattern(const_cast(pattern), shape) { - m_pattern.forall([&](const auto& index, const auto& idxval) { - if (idxval > 0) { - ++m_size; - } - }); + computeSize(); } - explicit PatternMatrixView(real_t* values, uint_t const shape[], uint_t const* pattern) + explicit PatternTensorView(real_t* values, uint_t const shape[], uint_t const* pattern) : TensorView(shape), m_values(values), m_pattern(const_cast(pattern), shape) { + computeSize(); + } + + explicit PatternTensorView(real_t* values, std::initializer_list shape, DenseTensorView pattern) + : TensorView(shape), m_values(values), m_pattern(std::move(pattern)) { + + computeSize(); + } + + explicit PatternTensorView(real_t* values, uint_t const shape[], DenseTensorView pattern) + : TensorView(shape), m_values(values), m_pattern(std::move(pattern)) { + + computeSize(); + } + + void computeSize() { m_pattern.forall([&](const auto& index, const auto& idxval) { if (idxval > 0) { ++m_size; @@ -581,7 +593,7 @@ namespace yateto { auto subtensor(Entry... entry) const { static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); const auto patternSubtensor = m_pattern.subtensor(entry...); - return PatternMatrixView::value, real_t, uint_t>(m_values, this->m_shape, patternSubtensor); + return PatternTensorView::value, real_t, uint_t>(m_values, this->m_shape, patternSubtensor); } protected: diff --git a/tests/interface/TensorView.t.h b/tests/interface/TensorView.t.h index 6487c93..c5f1159 100644 --- a/tests/interface/TensorView.t.h +++ b/tests/interface/TensorView.t.h @@ -1,24 +1,26 @@ #include #include +#include + using namespace yateto; -class TensorViewTestSuite : public CxxTest::TestSuite +class DenseTensorViewTestSuite : public CxxTest::TestSuite { private: - double m_data[12]; + double data_[12]; public: void setUp() { for (int i = 0; i < 12; ++i) { - m_data[i] = static_cast(i+1); + data_[i] = static_cast(i+1); } } void testAccess() { - DenseTensorView<3, double> tensor(m_data, {3,2,2}); + DenseTensorView<3, double> tensor(data_, {3,2,2}); TS_ASSERT_EQUALS(tensor(0,0,0), 1.0); TS_ASSERT_EQUALS(tensor(1,1,0), 5.0); TS_ASSERT_EQUALS(tensor(2,1,1), 12.0); @@ -26,7 +28,7 @@ class TensorViewTestSuite : public CxxTest::TestSuite void testSubtensor() { - DenseTensorView<3, double> tensor(m_data, {3,2,2}); + DenseTensorView<3, double> tensor(data_, {3,2,2}); auto sub = tensor.subtensor(1, slice<>(), slice<>()); TS_ASSERT_EQUALS(sub(0,0), 2.0); TS_ASSERT_EQUALS(sub(1,0), 5.0); @@ -46,14 +48,75 @@ class TensorViewTestSuite : public CxxTest::TestSuite void testSetZero() { - DenseTensorView<3, double> tensor(m_data, {3,2,2}); + DenseTensorView<3, double> tensor(data_, {3,2,2}); auto sub = tensor.subtensor(1, slice<>(), slice<>()); sub.setZero(); for (int i = 0; i < 12; ++i) { if ((i-1) % 3 == 0) { - TS_ASSERT_EQUALS(m_data[i], 0.0); + TS_ASSERT_EQUALS(data_[i], 0.0); + } else { + TS_ASSERT_EQUALS(data_[i], static_cast(i+1)); + } + } + } +}; + + +class PatternTensorViewTestSuite : public CxxTest::TestSuite +{ +private: + double data_[6]; + uint32_t pattern_[12]; + +public: + void setUp() + { + for (int i = 0; i < 6; ++i) { + data_[i] = static_cast(2*i+1); + } + for (int i = 0; i < 12; ++i) { + pattern_[i] = (i % 2 == 0) ? (i + 1) : 0; + } + } + + void testAccess() + { + PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_); + TS_ASSERT_EQUALS(tensor(0,0,0), 0.0); + TS_ASSERT_EQUALS(tensor(1,1,0), 0.0); + TS_ASSERT_EQUALS(tensor(2,1,1), 12.0); + } + + void testSubtensor() + { + PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_); + auto sub = tensor.subtensor(1, slice<>(), slice<>()); + TS_ASSERT_EQUALS(sub(0,0), 2.0); + TS_ASSERT_EQUALS(sub(1,0), 0.0); + TS_ASSERT_EQUALS(sub(0,1), 8.0); + TS_ASSERT_EQUALS(sub(1,1), 0.0); + + auto sub2 = sub.subtensor(1, slice<>()); + TS_ASSERT_EQUALS(sub2(0), 0.0); + TS_ASSERT_EQUALS(sub2(1), 0.0); + + auto sub3 = tensor.subtensor(slice<>(1,3), slice<>(), slice<>()); + TS_ASSERT_EQUALS(sub3(0,0,0), 2.0); + TS_ASSERT_EQUALS(sub3(0,1,0), 0.0); + TS_ASSERT_EQUALS(sub3(1,0,1), 0.0); + TS_ASSERT_EQUALS(sub3(1,1,1), 12.0); + } + + void testSetZero() + { + PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_); + auto sub = tensor.subtensor(1, slice<>(), slice<>()); + sub.setZero(); + for (int i = 0; i < 12; ++i) { + if ((i-1) % 3 == 0 || i % 2 == 1) { + TS_ASSERT_EQUALS(data_[i], 0.0); } else { - TS_ASSERT_EQUALS(m_data[i], static_cast(i+1)); + TS_ASSERT_EQUALS(data_[i], static_cast(i+1)); } } } diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py index ad4c0d0..c64d19d 100644 --- a/yateto/codegen/visitor.py +++ b/yateto/codegen/visitor.py @@ -696,7 +696,7 @@ def arrays(self, cpp, memLayout, arch, namespace, index, numberType, declaration cpp(self.formatArray(numberType, namespace + self.ROWIND_NAME + index, memLayout.rowIndex(), declarationOnly)) cpp(self.formatArray(numberType, namespace + self.COLPTR_NAME + index, memLayout.colPointer(), declarationOnly)) - class PatternMatrixView(TensorView): + class PatternTensorView(TensorView): PATTERN_NAME = 'Pattern' def typename(self, dim, arch): @@ -757,7 +757,7 @@ def _tensorViewGenerator(self, memoryLayout): memLayoutMap = { 'DenseMemoryLayout': self.DenseTensorView, 'CSCMemoryLayout': self.CSCMatrixView, - 'PatternMemoryLayout': self.PatternMatrixView + 'PatternMemoryLayout': self.PatternTensorView } return memLayoutMap[type(memoryLayout).__name__]() From 4a47223ac5e031d66caa46056b0146716499bca9 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 10 Jan 2026 23:07:36 +0100 Subject: [PATCH 12/15] Un-const `subtensor` function --- include/yateto/TensorView.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h index f1be097..a07546b 100644 --- a/include/yateto/TensorView.h +++ b/include/yateto/TensorView.h @@ -206,14 +206,14 @@ namespace yateto { } template - real_t operator()(Entry... entry) const { + const real_t& operator()(Entry... entry) const { static_assert(sizeof...(entry) == Dim, "Number of arguments to operator() const does not match the tensor dimension."); assert(isInRange(entry...)); return m_values[address(entry...)]; } - real_t operator[](uint_t const entry[Dim]) const { + const real_t& operator[](uint_t const entry[Dim]) const { uint_t addr = 0; for (uint_t d = 0; d < Dim; ++d) { assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]); @@ -264,7 +264,7 @@ namespace yateto { } template - auto subtensor(Entry... entry) -> DenseTensorView::value, real_t, uint_t> const { + auto subtensor(Entry... entry) -> DenseTensorView::value, real_t, uint_t> { static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); constexpr auto nSlices = count_slices::value; uint_t begin[Dim]{}; @@ -590,7 +590,7 @@ namespace yateto { } template - auto subtensor(Entry... entry) const { + auto subtensor(Entry... entry) { static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); const auto patternSubtensor = m_pattern.subtensor(entry...); return PatternTensorView::value, real_t, uint_t>(m_values, this->m_shape, patternSubtensor); From 41812b1fe21b2d4e40e4ee1477cd8722e2d0b0bd Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 10 Jan 2026 23:20:50 +0100 Subject: [PATCH 13/15] Fix interface tests --- .github/workflows/yateto-ci.yml | 2 +- include/yateto/TensorView.h | 4 +-- tests/interface/TensorView.t.h | 55 ++++++++++++++++++++++----------- 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/.github/workflows/yateto-ci.yml b/.github/workflows/yateto-ci.yml index dfe0bbc..d2b0736 100644 --- a/.github/workflows/yateto-ci.yml +++ b/.github/workflows/yateto-ci.yml @@ -33,7 +33,7 @@ jobs: mkdir -p ./build-${build_type} && cd ./build-${build_type} cmake .. -GNinja -DCMAKE_BUILD_TYPE=${build_type} ninja - ninja test + ctest -j --rerun-failed --output-on-failure cd .. done diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h index a07546b..990df6c 100644 --- a/include/yateto/TensorView.h +++ b/include/yateto/TensorView.h @@ -529,7 +529,7 @@ namespace yateto { } template - real_t operator()(Args... index) const { + const real_t& operator()(Args... index) const { static_assert((std::is_integral_v && ...)); const auto idx = m_pattern(index...); return m_values[idx - 1]; @@ -554,7 +554,7 @@ namespace yateto { return m_values[idx - 1]; } - real_t operator[](const uint_t entry[Dim]) const { + const real_t& operator[](const uint_t entry[Dim]) const { const auto idx = m_pattern[entry]; return m_values[idx - 1]; } diff --git a/tests/interface/TensorView.t.h b/tests/interface/TensorView.t.h index c5f1159..4eb0778 100644 --- a/tests/interface/TensorView.t.h +++ b/tests/interface/TensorView.t.h @@ -75,36 +75,55 @@ class PatternTensorViewTestSuite : public CxxTest::TestSuite data_[i] = static_cast(2*i+1); } for (int i = 0; i < 12; ++i) { - pattern_[i] = (i % 2 == 0) ? (i + 1) : 0; + pattern_[i] = (i % 2 == 0) ? (i / 2 + 1) : 0; } } + void testBasic() + { + PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_); + TS_ASSERT_EQUALS(tensor.size(), 6); + } + void testAccess() { PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_); - TS_ASSERT_EQUALS(tensor(0,0,0), 0.0); - TS_ASSERT_EQUALS(tensor(1,1,0), 0.0); - TS_ASSERT_EQUALS(tensor(2,1,1), 12.0); + TS_ASSERT(tensor.isInRange(0,0,0)); + TS_ASSERT_EQUALS(tensor(0,0,0), 1.0); + TS_ASSERT(tensor.isInRange(1,1,0)); + TS_ASSERT_EQUALS(tensor(1,1,0), 5.0); + TS_ASSERT(!tensor.isInRange(2,1,1)); + // TS_ASSERT_EQUALS(tensor(2,1,1), 0.0); } void testSubtensor() { PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_); auto sub = tensor.subtensor(1, slice<>(), slice<>()); - TS_ASSERT_EQUALS(sub(0,0), 2.0); - TS_ASSERT_EQUALS(sub(1,0), 0.0); - TS_ASSERT_EQUALS(sub(0,1), 8.0); - TS_ASSERT_EQUALS(sub(1,1), 0.0); + TS_ASSERT(!sub.isInRange(0,0)); + // TS_ASSERT_EQUALS(sub(0,0), 0.0); + TS_ASSERT(sub.isInRange(1,0)); + TS_ASSERT_EQUALS(sub(1,0), 5.0); + TS_ASSERT(!sub.isInRange(0,1)); + // TS_ASSERT_EQUALS(sub(0,1), 0.0); + TS_ASSERT(sub.isInRange(1,1)); + TS_ASSERT_EQUALS(sub(1,1), 11.0); auto sub2 = sub.subtensor(1, slice<>()); - TS_ASSERT_EQUALS(sub2(0), 0.0); - TS_ASSERT_EQUALS(sub2(1), 0.0); + TS_ASSERT(sub2.isInRange(0)); + TS_ASSERT_EQUALS(sub2(0), 5.0); + TS_ASSERT(sub2.isInRange(1)); + TS_ASSERT_EQUALS(sub2(1), 11.0); - auto sub3 = tensor.subtensor(slice<>(1,3), slice<>(), slice<>()); - TS_ASSERT_EQUALS(sub3(0,0,0), 2.0); - TS_ASSERT_EQUALS(sub3(0,1,0), 0.0); - TS_ASSERT_EQUALS(sub3(1,0,1), 0.0); - TS_ASSERT_EQUALS(sub3(1,1,1), 12.0); + auto sub3 = tensor.subtensor(slice<>(1,3), slice<>(), slice<>()); + TS_ASSERT(!sub3.isInRange(0,0,0)); + // TS_ASSERT_EQUALS(sub3(0,0,0), 0.0); + TS_ASSERT(sub3.isInRange(0,1,0)); + TS_ASSERT_EQUALS(sub3(0,1,0), 5.0); + TS_ASSERT(sub3.isInRange(1,0,1)); + TS_ASSERT_EQUALS(sub3(1,0,1), 9.0); + TS_ASSERT(!sub3.isInRange(1,1,1)); + // TS_ASSERT_EQUALS(sub3(1,1,1), 0.0); } void testSetZero() @@ -112,11 +131,11 @@ class PatternTensorViewTestSuite : public CxxTest::TestSuite PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_); auto sub = tensor.subtensor(1, slice<>(), slice<>()); sub.setZero(); - for (int i = 0; i < 12; ++i) { - if ((i-1) % 3 == 0 || i % 2 == 1) { + for (int i = 0; i < 6; ++i) { + if ((2*i-1) % 3 == 0) { TS_ASSERT_EQUALS(data_[i], 0.0); } else { - TS_ASSERT_EQUALS(data_[i], static_cast(i+1)); + TS_ASSERT_EQUALS(data_[i], static_cast(2*i+1)); } } } From 12877e7391dc87af26da700af0040d4bce325df1 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 10 Jan 2026 23:48:26 +0100 Subject: [PATCH 14/15] Test and (mostly) fix unrolling --- tests/code-gen/sparsity.py | 15 ++++++++++++++- yateto/ast/indices.py | 3 +++ yateto/codegen/gemm/gemmgen.py | 2 +- yateto/codegen/gemm/generic.py | 14 +++++++------- yateto/codegen/log/generic.py | 6 ------ yateto/memory.py | 32 +++++++++++++++++++++----------- 6 files changed, 46 insertions(+), 26 deletions(-) diff --git a/tests/code-gen/sparsity.py b/tests/code-gen/sparsity.py index 8366e14..dec5a68 100644 --- a/tests/code-gen/sparsity.py +++ b/tests/code-gen/sparsity.py @@ -15,6 +15,7 @@ def invert(pattern): def add(g): N = 4 + M = 2 A = Tensor('A', (N,), spp=np.array([1,0,0,1]), memoryLayoutClass=PatternMemoryLayout) B = Tensor('B', (N,), spp=np.array([1,0,1,0]), memoryLayoutClass=PatternMemoryLayout) D = Tensor('D', (N,), spp=np.ones((N,)), memoryLayoutClass=PatternMemoryLayout) @@ -27,7 +28,15 @@ def add(g): C0 = Tensor('C0', (N,)) C = Tensor('C', (N, N)) C2 = Tensor('C2', (N, N, N)) - C3 = Tensor('C3', (N, N, N, N, N, N)) + + YA = Tensor('YA', (M, M, M), spp=checkerboard((M, M, M)), memoryLayoutClass=PatternMemoryLayout) + YB = Tensor('YB', (M, M, M), spp=checkerboard((M, M, M)), memoryLayoutClass=PatternMemoryLayout) + YC = Tensor('YC', (M, M, M)) + C4 = Tensor('C4', (M, M, M, M)) + + ZA = Tensor('ZA', (M, M, M, M, M, M), spp=checkerboard((M, M, M, M, M, M)), memoryLayoutClass=PatternMemoryLayout) + ZB = Tensor('ZB', (M, M, M, M, M, M), spp=invert(checkerboard((M, M, M, M, M, M))), memoryLayoutClass=PatternMemoryLayout) + ZC = Tensor('ZC', (M, M, M, M, M, M)) class Counter: def __init__(self): @@ -49,3 +58,7 @@ def _(kernel): _(C['ij'] <= B4['ik'] * B3['kj'] * B4['ij']) _(C2['zij'] <= B2['ik'] * Z['kjz']) _(C0['k'] <= B4['ik'] * A['i']) + + _(C4['ijXY'] <= YA['Zik'] * YB['kXY'] * YC['Zkj']) + _(ZC['abcxyz'] <= ZA['abcijk'] * ZB['ijkxyz']) + _(ZC['abcxyz'] <= ZA['aibjck'] * ZB['zkyjxi']) diff --git a/yateto/ast/indices.py b/yateto/ast/indices.py index 6f2ab02..80c6e00 100644 --- a/yateto/ast/indices.py +++ b/yateto/ast/indices.py @@ -44,6 +44,9 @@ def positions(self, I, sort=True): return sorted(pos) return pos + def positionsIncomplete(self, I): + return [self.find(i) for i in I if i in self] + def __eq__(self, other): return other != None and self._indices == other._indices and self._size == other._size diff --git a/yateto/codegen/gemm/gemmgen.py b/yateto/codegen/gemm/gemmgen.py index 35b0b95..3eb5a96 100644 --- a/yateto/codegen/gemm/gemmgen.py +++ b/yateto/codegen/gemm/gemmgen.py @@ -109,7 +109,7 @@ def generate(self, cpp, routineCache): if d.isBCsc: sppB = d.rightTerm.memoryLayout.entries(k, n) sppBRows = d.rightTerm.memoryLayout.shape()[0] - + if d.isACsc and d.isBCsc: # count the flops by splitting into outer products (i.e. partition by k) # for each outer product, we need to compute all-by-all nonzero entries for m and n diff --git a/yateto/codegen/gemm/generic.py b/yateto/codegen/gemm/generic.py index 7d9d92d..7b6f0f7 100644 --- a/yateto/codegen/gemm/generic.py +++ b/yateto/codegen/gemm/generic.py @@ -61,10 +61,10 @@ def _generateSparseSparse(self, cpp): Caccess = self._accessFun(d.result, (m.start, n.start), False, False) rows, cols = (k, m) if d.transA else (m, k) - sppA = d.leftTerm.memoryLayout.entries(rows, cols) + sppA = d.leftTerm.memoryLayout.entriesRel(rows, cols) rows, cols = (n, k) if d.transB else (k, n) - sppB = d.rightTerm.memoryLayout.entries(rows, cols) + sppB = d.rightTerm.memoryLayout.entriesRel(rows, cols) if d.beta != 1.0: with cpp.For(f'int n = 0; n < {n.size()}; ++n'): @@ -80,9 +80,9 @@ def _generateSparseSparse(self, cpp): # note that we explicitly count all nonzero operations nzcount = 0 - for idxA, entry in enumerate(sppA): + for idxA, entry in sppA: eA = entry[::-1] if d.transA else entry - for idxB, entry in enumerate(sppB): + for idxB, entry in sppB: eB = entry[::-1] if d.transB else entry if eA[1] == eB[0]: @@ -112,7 +112,7 @@ def _generateSparseDense(self, cpp): if d.isACsc: rows, cols = (k, m) if d.transA else (m, k) - spp = d.leftTerm.memoryLayout.entries(rows, cols) + spp = d.leftTerm.memoryLayout.entriesRel(rows, cols) sparse = Aaccess result = lambda e: Caccess(e[0], self.OUTER_INDEX) dense = lambda e: Baccess(e[1], self.OUTER_INDEX) @@ -120,7 +120,7 @@ def _generateSparseDense(self, cpp): trans = d.transA elif d.isBCsc: rows, cols = (n, k) if d.transB else (k, n) - spp = d.rightTerm.memoryLayout.entries(rows, cols) + spp = d.rightTerm.memoryLayout.entriesRel(rows, cols) sparse = Baccess result = lambda e: Caccess(self.OUTER_INDEX, e[1]) dense = lambda e: Aaccess(self.OUTER_INDEX, e[0]) @@ -137,7 +137,7 @@ def _generateSparseDense(self, cpp): ' * ' + CAddr if d.beta != 0.0 else '' ) ) - for idx, entry in enumerate(spp): + for idx, entry in spp: e = entry[::-1] if trans else entry if e[0] < sizes[0] and e[1] < sizes[1]: cpp( '{result} += {alpha} * {dense} * {sparse};'.format( diff --git a/yateto/codegen/log/generic.py b/yateto/codegen/log/generic.py index 8d0d527..8e81078 100644 --- a/yateto/codegen/log/generic.py +++ b/yateto/codegen/log/generic.py @@ -31,12 +31,6 @@ def _memLayout(self, term, I, J, fixed): elif len(term.indices) == 2: return term.memoryLayout return term.memoryLayout.unfold(term.indices, I, J, fixed) - - def _unroll(self, term, I): - if not term.memoryLayout.isSparse(): - return set() - - return I def _reduce(self, term, subset, memLayout, fixed): return reduceSpp(term.eqspp, term.indices, subset, fixed).reshape(memLayout.shape()) diff --git a/yateto/memory.py b/yateto/memory.py index ad88972..8d7feda 100644 --- a/yateto/memory.py +++ b/yateto/memory.py @@ -394,6 +394,10 @@ def entries(self, rowRange, colRange): for col in colRange: e.extend([(self._rowIndex[i]-rowRange.start, col-colRange.start) for i in range(self._colPtr[col], self._colPtr[col+1])]) return e + + def entriesRel(self, *rng): + entries = self.entries(*rng) + return list(enumerate(entries)) def alignedStride(self): return self.aligned @@ -479,7 +483,7 @@ def __init__(self, spp, alignStride=False, pattern=None): for i, nonzero in enumerate(nonzeros): self._pattern[tuple(nonzero)] = i + 1 if pattern is None else pattern[tuple(nonzero)] - self._nonzeros = nonzeros + self._nonzeros = list(nonzeros) def requiredReals(self): return len(self._nonzeros) @@ -519,6 +523,11 @@ def entries(self, *rng): return [tuple(e - r.start for e,r in zip(ex, rng)) for ex in self._nonzeros if all(e >= r.start and e < r.stop for e,r in zip(ex, rng))] + def entriesRel(self, *rng): + offset = self.subtensorOffset(tuple(r.start for r in rng)) + return [(self._pattern[ex] - 1 - offset, tuple(e - r.start for e,r in zip(ex, rng))) for ex in self._nonzeros if + all(e >= r.start and e < r.stop for e,r in zip(ex, rng))] + def alignedStride(self): return self.aligned @@ -545,7 +554,7 @@ def isCompatible(self, spp): def vec(self, indices, I, Z): positionsI = indices.positions(I) - positionsZ = indices.positions(Z) + positionsZ = indices.positionsIncomplete(Z) # I and Z need to partition perfectly @@ -568,7 +577,7 @@ def withDummyDimension(self): def unfold(self, indices, I, J, Z): positionsI = indices.positions(I) positionsJ = indices.positions(J) - positionsZ = indices.positions(Z) + positionsZ = indices.positionsIncomplete(Z) if positionsI[0] > positionsJ[0]: positionsJ, positionsI = positionsI, positionsJ @@ -595,7 +604,7 @@ def unfold(self, indices, I, J, Z): dimmap[p] = i i += 1 - pattern = self._pattern.transpose(dimmap)[tuple(selector)].reshape((sizeI, sizeJ)) + pattern = self._pattern.transpose(dimmap)[tuple(selector)].reshape((sizeI, sizeJ), order='F') return PatternMemoryLayout(None, alignStride=self.aligned, pattern=pattern) @@ -618,6 +627,9 @@ def equalStride(self, dim): # search for: all zeros nzp = (self._pattern != 0) return nzp.sum(axis=dim) == nzp.max(axis=dim) * nzp.shape[dim] + + def alignmentOffset(self, dim): + return 0 class AlignedCSCMemoryLayout: @classmethod @@ -755,13 +767,11 @@ def storage(self): def permuted(self, permutation): return MemoryLayoutView(self.base.permuted(permutation), permutation[self.index], self.start, self.end) - def entries(self, rowRange, colRange): - if self.index == 0: - return self.base.entries(Range(rowRange.start + self.start, rowRange.stop + self.start), colRange) - elif self.index == 1: - return self.base.entries(rowRange, Range(colRange.start + self.start, colRange.stop + self.start)) - else: - raise NotImplementedError() + def entries(self, *rng): + return self.base.entries([Range(r.start + self.start, r.stop + self.start) if self.index == i else r for i,r in enumerate(rng)]) + + def entriesRel(self, *rng): + return self.base.entriesRel([Range(r.start + self.start, r.stop + self.start) if self.index == i else r for i,r in enumerate(rng)]) def mayFuse(self, positions): return (self.index not in positions or positions[-1] == self.index) and self.base.mayFuse(positions) From 90b4ce98d077c078c6a00407eff1421dad6902c5 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Fri, 13 Feb 2026 19:01:34 +0100 Subject: [PATCH 15/15] Adjust the `PatternTensorView` to support const --- include/yateto/TensorView.h | 40 ++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h index 002a4f1..431b23e 100644 --- a/include/yateto/TensorView.h +++ b/include/yateto/TensorView.h @@ -500,37 +500,38 @@ namespace yateto { uint_t const* m_colPtr; }; - template + template class PatternTensorView : public TensorView { public: - // TODO: remove const_cast(pattern) + using data_t = std::conditional_t; + using dataref_t = std::conditional_t; - explicit PatternTensorView(real_t* values, std::initializer_list shape, uint_t const* pattern) - : TensorView(shape), m_values(values), m_pattern(const_cast(pattern), shape) { + explicit PatternTensorView(data_t values, std::initializer_list shape, uint_t const* pattern) + : TensorView(shape), m_values(values), m_pattern(pattern, shape) { computeSize(); } - explicit PatternTensorView(real_t* values, uint_t const shape[], uint_t const* pattern) - : TensorView(shape), m_values(values), m_pattern(const_cast(pattern), shape) { + explicit PatternTensorView(data_t values, uint_t const shape[], uint_t const* pattern) + : TensorView(shape), m_values(values), m_pattern(pattern, shape) { computeSize(); } - explicit PatternTensorView(real_t* values, std::initializer_list shape, DenseTensorView pattern) + explicit PatternTensorView(data_t values, std::initializer_list shape, DenseTensorView pattern) : TensorView(shape), m_values(values), m_pattern(std::move(pattern)) { computeSize(); } - explicit PatternTensorView(real_t* values, uint_t const shape[], DenseTensorView pattern) + explicit PatternTensorView(data_t values, uint_t const shape[], DenseTensorView pattern) : TensorView(shape), m_values(values), m_pattern(std::move(pattern)) { computeSize(); } void computeSize() { - m_pattern.forall([&](const auto& index, const auto& idxval) { + m_pattern.forall([&](const auto& /*index*/, const auto& idxval) { if (idxval > 0) { ++m_size; } @@ -542,7 +543,7 @@ namespace yateto { } void setZero() { - m_pattern.forall([&](const auto& index, const auto& idxval) { + m_pattern.forall([&](const auto& /*index*/, const auto& idxval) { if (idxval > 0) { m_values[idxval - 1] = 0; } @@ -557,7 +558,7 @@ namespace yateto { } template - real_t& operator()(Args... index) { + dataref_t operator()(Args... index) { static_assert((std::is_integral_v && ...)); const auto idx = m_pattern(index...); return m_values[idx - 1]; @@ -570,7 +571,7 @@ namespace yateto { return idx > 0; } - real_t& operator[](const uint_t entry[Dim]) { + dataref_t operator[](const uint_t entry[Dim]) { const auto idx = m_pattern[entry]; return m_values[idx - 1]; } @@ -599,7 +600,7 @@ namespace yateto { } template - void copyToView(view_t& other) { + void copyToView(view_t& other) const { m_pattern.forall([&](const auto& index, const auto& idxval) { if (idxval > 0) { other[index] = m_values[idxval - 1]; @@ -614,12 +615,19 @@ namespace yateto { auto subtensor(Entry... entry) { static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); const auto patternSubtensor = m_pattern.subtensor(entry...); - return PatternTensorView::value, real_t, uint_t>(m_values, this->m_shape, patternSubtensor); + return PatternTensorView::value, real_t, uint_t, Const>(m_values, this->m_shape, patternSubtensor); + } + + template + auto subtensor(Entry... entry) const { + static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); + const auto patternSubtensor = m_pattern.subtensor(entry...); + return PatternTensorView::value, real_t, uint_t, true>(m_values, this->m_shape, patternSubtensor); } protected: - real_t* m_values{nullptr}; - DenseTensorView m_pattern; + data_t m_values{nullptr}; + DenseTensorView m_pattern; std::size_t m_size{0}; }; } // namespace yateto