From 006af5df9286b0060e3861d11356c90ce368d2b6 Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Tue, 25 Nov 2025 01:53:57 +0100
Subject: [PATCH 01/15] Fix one-element sum accumulations

---
 yateto/controlflow/visitor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/yateto/controlflow/visitor.py b/yateto/controlflow/visitor.py
index b06d301..0042e80 100644
--- a/yateto/controlflow/visitor.py
+++ b/yateto/controlflow/visitor.py
@@ -42,7 +42,7 @@ def generic_visit(self, node):
   
   def visit_Add(self, node):
     variables = [self.visit(child) for child in node]
-    assert len(variables) > 1
+    assert len(variables) >= 1
 
     variables.sort(key=lambda var: int(not var.writable) + int(not var.isGlobal()))
 

From 16f76166927a5e5685982751d90b34126846819b Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Tue, 25 Nov 2025 01:54:15 +0100
Subject: [PATCH 02/15] Fix action merging if we'd override a global variable

---
 yateto/controlflow/transformer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/yateto/controlflow/transformer.py b/yateto/controlflow/transformer.py
index 9195bdc..db937d4 100644
--- a/yateto/controlflow/transformer.py
+++ b/yateto/controlflow/transformer.py
@@ -99,7 +99,11 @@ def visit(self, cfg):
         V = ua.variables()
         for j in range(i+1,n):
           va = cfg[j].action
-          if va.isRHSVariable() and ua.result == va.term and va.result not in V and (ua.hasTrivialScalar() or va.hasTrivialScalar()):
+          if va.isRHSVariable() \
+              and ua.result == va.term \
+              and va.result not in V \
+              and (ua.hasTrivialScalar() or va.hasTrivialScalar()) \
+              and ua.result.isLocal():
             found = j
             break
           elif ua.result in va.variables() or ua.result == va.result:

From e1d7f49211bd6f20f41773c03eb6777dba894af1 Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Sat, 29 Nov 2025 22:32:26 +0100
Subject: [PATCH 03/15] Add sparse tensor support

---
 include/yateto/TensorView.h       | 186 ++++++++++++++++++++-
 yateto/aspp.py                    |  12 +-
 yateto/ast/node.py                |   2 +-
 yateto/codegen/common.py          |  32 +++-
 yateto/codegen/gemm/factory.py    |   5 +-
 yateto/codegen/log/generic.py     | 110 +++++++++----
 yateto/codegen/product/factory.py |   4 +-
 yateto/codegen/visitor.py         |  25 ++-
 yateto/memory.py                  | 257 +++++++++++++++++++++++++++---
 9 files changed, 557 insertions(+), 76 deletions(-)

diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h
index 0e243dd..f15c438 100644
--- a/include/yateto/TensorView.h
+++ b/include/yateto/TensorView.h
@@ -4,6 +4,7 @@
 #include <cassert>
 #include <cstring>
 #include <algorithm>
+#include <functional>
 #include <limits>
 #include <type_traits>
 
@@ -109,6 +110,52 @@ namespace yateto {
     uint_t size() const {
       return (m_stop[Dim-1]-m_start[Dim-1]) * m_stride[Dim-1];
     }
+
+    template<typename F>
+    void forall(F&& function) {
+      uint_t entry[Dim];
+      std::copy(m_start, m_start + Dim, entry);
+      while (entry[Dim-1] != m_stop[Dim-1]) {
+        auto values = &operator[](entry);
+        for (uint_t i = 0; i < m_stop[0]-m_start[0]; ++i) {
+          entry[0] = i + m_start[0];
+          std::invoke(std::forward<F>(function), entry, values[i*m_stride[0]]);
+        }
+        if (Dim == 1) {
+          break;
+        }
+
+        uint_t d = 0;
+        do {
+          entry[d] = m_start[d];
+          d++;
+          ++entry[d];
+        } while (entry[d] == m_stop[d] && d < Dim-1);
+      }
+    }
+
+    template<typename F>
+    void forall(F&& function) const {
+      uint_t entry[Dim];
+      std::copy(m_start, m_start + Dim, entry);
+      while (entry[Dim-1] != m_stop[Dim-1]) {
+        auto values = &operator[](entry);
+        for (uint_t i = 0; i < m_stop[0]-m_start[0]; ++i) {
+          entry[0] = i + m_start[0];
+          std::invoke(std::forward<F>(function), entry, values[i*m_stride[0]]);
+        }
+        if (Dim == 1) {
+          break;
+        }
+
+        uint_t d = 0;
+        do {
+          entry[d] = m_start[d];
+          d++;
+          ++entry[d];
+        } while (entry[d] == m_stop[d] && d < Dim-1);
+      }
+    }
     
     void setZero() {
       uint_t entry[Dim];
@@ -400,11 +447,148 @@ namespace yateto {
       }
     }
 
+    template<typename F>
+    void forall(F&& function) {
+      uint_t entry[2];
+      uint_t ncols = this->shape(1);
+      for (uint_t col = 0; col < ncols; ++col) {
+        entry[1] = col;
+        for (uint_t i = m_colPtr[col]; i < m_colPtr[col+1]; ++i) {
+          entry[0] = m_rowInd[i];
+          std::invoke(std::forward<F>(function), entry, m_values[i]);
+        }
+      }
+    }
+
+    template<typename F>
+    void forall(F&& function) const {
+      uint_t entry[2];
+      uint_t ncols = this->shape(1);
+      for (uint_t col = 0; col < ncols; ++col) {
+        entry[1] = col;
+        for (uint_t i = m_colPtr[col]; i < m_colPtr[col+1]; ++i) {
+          entry[0] = m_rowInd[i];
+          std::invoke(std::forward<F>(function), entry, m_values[i]);
+        }
+      }
+    }
+
   protected:
     real_t* m_values;
     uint_t const* m_rowInd;
     uint_t const* m_colPtr;
   };
-}
+
+  template<unsigned Dim, typename real_t, typename uint_t>
+  class PatternMatrixView : public TensorView<Dim, real_t, uint_t> {
+  public:
+    // TODO: remove const_cast<uint_t*>(pattern)
+
+    explicit PatternMatrixView(real_t* values, std::initializer_list<uint_t> shape, uint_t const* pattern)
+      : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(const_cast<uint_t*>(pattern), shape) {
+
+      m_pattern.forall([&](const auto& index, const auto& idxval) {
+        if (idxval > 0) {
+          ++m_size;
+        }
+      });
+    }
+
+    explicit PatternMatrixView(real_t* values, uint_t const shape[], uint_t const* pattern)
+      : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(const_cast<uint_t*>(pattern), shape) {
+
+      m_pattern.forall([&](const auto& index, const auto& idxval) {
+        if (idxval > 0) {
+          ++m_size;
+        }
+      });
+    }
+
+    uint_t size() const {
+      return m_size;
+    }
+
+    void setZero() {
+      m_pattern.forall([&](const auto& index, const auto& idxval) {
+        if (idxval > 0) {
+          m_values[idxval - 1] = 0;
+        }
+      });
+    }
+
+    template<typename ...Args>
+    real_t operator()(Args... index) const {
+      static_assert((std::is_integral_v<Args> && ...));
+      const auto idx = m_pattern(index...);
+      return m_values[idx - 1];
+    }
+
+    template<typename ...Args>
+    real_t& operator()(Args... index) {
+      static_assert((std::is_integral_v<Args> && ...));
+      const auto idx = m_pattern(index...);
+      return m_values[idx - 1];
+    }
+
+    template<typename ...Args>
+    bool isInRange(Args... index) const {
+      static_assert((std::is_integral_v<Args> && ...));
+      const auto idx = m_pattern(index...);
+      return idx > 0;
+    }
+
+    real_t& operator[](const uint_t entry[Dim]) {
+      const auto idx = m_pattern[entry];
+      return m_values[idx - 1];
+    }
+
+    real_t operator[](const uint_t entry[Dim]) const {
+      const auto idx = m_pattern[entry];
+      return m_values[idx - 1];
+    }
+
+    template<typename F>
+    void forall(F&& function) {
+      m_pattern.forall([&, function = std::forward<F>(function)](const auto& index, const auto& idxval) {
+        if (idxval > 0) {
+          std::invoke(function, index, m_values[idxval - 1]);
+        }
+      });
+    }
+
+    template<typename F>
+    void forall(F&& function) const {
+      m_pattern.forall([&, function = std::forward<F>(function)](const auto& index, const auto& idxval) {
+        if (idxval > 0) {
+          std::invoke(function, index, m_values[idxval - 1]);
+        }
+      });
+    }
+
+    template<class view_t>
+    void copyToView(view_t& other) {
+      m_pattern.forall([&](const auto& index, const auto& idxval) {
+        if (idxval > 0) {
+          other[index] = m_values[idxval - 1];
+        }
+        else {
+          other[index] = 0;
+        }
+      });
+    }
+
+    template<typename... Entry>
+    auto subtensor(Entry... entry) const {
+      static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension.");
+      const auto patternSubtensor = m_pattern.subtensor(entry...);
+      return PatternMatrixView<count_slices<uint_t, Entry...>::value, real_t, uint_t>(m_values, this->m_shape, patternSubtensor);
+    }
+
+  protected:
+    real_t* m_values{nullptr};
+    DenseTensorView<Dim, uint_t, uint_t> m_pattern;
+    std::size_t m_size{0};
+  };
+} // namespace yateto
 
 #endif
diff --git a/yateto/aspp.py b/yateto/aspp.py
index c3b0b66..5ccc5b7 100644
--- a/yateto/aspp.py
+++ b/yateto/aspp.py
@@ -43,7 +43,7 @@ def transposed(self, shape):
     pass
 
   @abstractmethod
-  def indexSum(self, sourceIndices, targetIndices):
+  def indexSum(self, sourceIndices, targetIndices, fixedIndices):
     pass
 
   @abstractmethod
@@ -74,7 +74,8 @@ def reshape(self, shape):
   def transposed(self, perm):
     return type(self)(tuple(self.shape[p] for p in perm))
 
-  def indexSum(self, sourceIndices, targetIndices):
+  def indexSum(self, sourceIndices, targetIndices, fixedIndices):
+    # silently assume that targetIndices and fixedIndices don't overlap
     return type(self)(tuple(self.shape[sourceIndices.find(targetIndex)] for targetIndex in targetIndices))
 
   @staticmethod
@@ -160,8 +161,11 @@ def reshape(self, shape):
   def transposed(self, perm):
     return type(self)(self.pattern.transpose(perm).copy(order=self.NUMPY_DEFAULT_ORDER))
 
-  def indexSum(self, sourceIndices, targetIndices):
-    return general(np.einsum('{}->{}'.format(sourceIndices, targetIndices), self.pattern))
+  def indexSum(self, sourceIndices, targetIndices, fixedIndices):
+    # silently assume that targetIndices and fixedIndices don't overlap
+    selector = tuple(fixedIndices[idx] if idx in fixedIndices else slice(None) for idx in sourceIndices)
+    einsumsource = ''.join('' if idx in fixedIndices else idx for idx in sourceIndices)
+    return general(np.einsum('{}->{}'.format(einsumsource, targetIndices), self.pattern[selector]))
 
   @staticmethod
   def add(a1, a2):
diff --git a/yateto/ast/node.py b/yateto/ast/node.py
index 93da011..78fc58b 100644
--- a/yateto/ast/node.py
+++ b/yateto/ast/node.py
@@ -335,7 +335,7 @@ def sumIndex(self):
   def computeSparsityPattern(self, *spps):
     assert len(spps) <= 1
     spp = spps[0] if len(spps) == 1 else self.term().eqspp()
-    return spp.indexSum(self.term().indices, self.indices)
+    return spp.indexSum(self.term().indices, self.indices, {})
 
 class Contraction(BinOp):
   def __init__(self, indices, lTerm, rTerm, sumIndices):
diff --git a/yateto/codegen/common.py b/yateto/codegen/common.py
index e2b8966..0b80e75 100644
--- a/yateto/codegen/common.py
+++ b/yateto/codegen/common.py
@@ -66,20 +66,36 @@ def fromVar(cls, var, indices):
         addressing = None # var.tensor.addressing
     return cls(str(var), indices, var.memoryLayout(), var.eqspp(), is_const, var.is_temporary, values, datatype, addressing)
 
-def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', indexNo=None):
+def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', fixed={}, indexNo=None):
   flops = 0
+  firstLoop = False
   if indexNo == None:
     indexNo = len(indexNames)-1
+    firstLoop = True
   if indexNo < 0:
-    flops = body()
+    if firstLoop:
+      with cpp.AnonymousScope():
+        flops = body()
+    else:
+      flops = body()
   else:
     index = indexNames[indexNo]
     rng = ranges[index]
-    if pragmaSimd and indexNo == 0:
+    if pragmaSimd:
       cpp('#pragma omp simd')
-    with cpp.For('int {3}{0} = {1}; {3}{0} < {2}; ++{3}{0}'.format(index, rng.start, rng.stop, prefix)):
-      flops = forLoops(cpp, indexNames, ranges, body, pragmaSimd, prefix, indexNo-1)
-    flops = flops * rng.size()
+    if index in fixed:
+      value = fixed[index]
+      if value >= rng.start and value < rng.stop:
+        with cpp.AnonymousScope():
+          cpp(f'constexpr int {prefix}{index} = {value};')
+          flops = forLoops(cpp, indexNames, ranges, body, pragmaSimd, prefix, fixed, indexNo-1)
+      else:
+        # out of range
+        flops = 0
+    else:
+      with cpp.For('int {3}{0} = {1}; {3}{0} < {2}; ++{3}{0}'.format(index, rng.start, rng.stop, prefix)):
+        flops = forLoops(cpp, indexNames, ranges, body, False, prefix, fixed, indexNo-1)
+      flops = flops * rng.size()
   return flops
   
 def loopRanges(term: IndexedTensorDescription, loopIndices):
@@ -98,8 +114,8 @@ def testLoopRangesAContainedInB(A, B):
 def boundingBoxFromLoopRanges(indices, loopRanges):
   return BoundingBox([loopRanges[index] for index in indices])
 
-def reduceSpp(spp, sourceIndices, targetIndices):
-  return spp.indexSum(sourceIndices, targetIndices)
+def reduceSpp(spp, sourceIndices, targetIndices, fixedIndices):
+  return spp.indexSum(sourceIndices, targetIndices, fixedIndices)
 
 def initializeWithZero(cpp, arch, result: TensorDescription, writeBB = None):
   if writeBB:
diff --git a/yateto/codegen/gemm/factory.py b/yateto/codegen/gemm/factory.py
index 595e0c0..50fc3e5 100644
--- a/yateto/codegen/gemm/factory.py
+++ b/yateto/codegen/gemm/factory.py
@@ -1,5 +1,4 @@
 from ...ast.indices import BoundingBox, Range
-from ...memory import CSCMemoryLayout
 from ..common import TensorDescription
 from .generic import Generic
 from .gemmgen import GemmGen
@@ -28,8 +27,8 @@ def __init__(self,
     self.beta = beta
     self.prefetchName = prefetchName
     
-    self.isACsc = isinstance(self.leftTerm.memoryLayout, CSCMemoryLayout)
-    self.isBCsc = isinstance(self.rightTerm.memoryLayout, CSCMemoryLayout)
+    self.isACsc = self.leftTerm.memoryLayout.isSparse()
+    self.isBCsc = self.rightTerm.memoryLayout.isSparse()
     
     bbA = BoundingBox.fromSpp(self.leftTerm.eqspp)
     bbB = BoundingBox.fromSpp(self.rightTerm.eqspp)
diff --git a/yateto/codegen/log/generic.py b/yateto/codegen/log/generic.py
index 68c2e50..5948980 100644
--- a/yateto/codegen/log/generic.py
+++ b/yateto/codegen/log/generic.py
@@ -9,40 +9,46 @@ def __init__(self, arch, descr, target):
     self._descr = descr
     self._target = target
   
-  def _pointer(self, cpp, targetName, baseName, term, loopIndices, const=True):
+  def _pointer(self, cpp, targetName, baseName, term, loopIndices, fixed, const=True):
     indices = term.indices & loopIndices
-    addressStr = term.memoryLayout.addressString(term.indices, indices) if len(indices) > 0 else ''
+    addressStr = term.memoryLayout.addressString(term.indices, indices, fixed) if len(indices) > 0 else ''
     if len(addressStr) > 0:
       addressStr = ' + ' + addressStr
     cpp('{} {}* {} = {}{};'.format(self._arch.typename, 'const' if const else '', targetName, baseName, addressStr))
 
-  def _alignedStart(self, term, loopIndices):
+  def _alignedStart(self, term, loopIndices, fixed):
     if len(loopIndices) == 0:
       return True
-    return term.memoryLayout.isAlignedAddressString(term.indices, term.indices & loopIndices)
+    return term.memoryLayout.isAlignedAddressString(term.indices, term.indices & loopIndices, fixed)
     
-  def _memLayout(self, term, I, J):
+  def _memLayout(self, term, I, J, fixed):
     if len(I) == 0 and len(J) == 0:
       return DenseMemoryLayout((1,1))
     elif len(I) == 0:
-      ml = term.memoryLayout.vec(term.indices, J)
+      ml = term.memoryLayout.vec(term.indices, J, fixed)
       return ml.withDummyDimension()
     elif len(J) == 0:
-      ml = term.memoryLayout.vec(term.indices, I)
+      ml = term.memoryLayout.vec(term.indices, I, fixed)
       return ml.withDummyDimension()
     elif len(term.indices) == 2:
       return term.memoryLayout
-    return term.memoryLayout.unfold(term.indices, I, J)
+    return term.memoryLayout.unfold(term.indices, I, J, fixed)
+  
+  def _unroll(self, term, I):
+    if not term.memoryLayout.isSparse():
+      return set()
+
+    return I
 
-  def _reduce(self, term, subset, memLayout):
-    return reduceSpp(term.eqspp, term.indices, subset).reshape(memLayout.shape())
+  def _reduce(self, term, subset, memLayout, fixed):
+    return reduceSpp(term.eqspp, term.indices, subset, fixed).reshape(memLayout.shape())
   
   def _defuse(self, fusedRange, term, I):
     if len(I) == 1:
       return  {next(iter(I)): fusedRange}
     return term.memoryLayout.defuse(fusedRange, term.indices, I)
 
-  def generate(self, cpp, routineCache, gemm_cfg):
+  def _generateSingle(self, cpp, routineCache, gemm_cfg, fixed = {}):
     d = self._descr
     
     A = d.leftTerm.indices - d.loopIndices
@@ -69,15 +75,15 @@ def generate(self, cpp, routineCache, gemm_cfg):
     innerCname = '_Cin' if hasInnerLoops else outerCname
     innerPrefetchName = '_Cprefetchin' if hasInnerLoops and outerPrefetchName is not None else outerPrefetchName
 
-    alignedStartA = not hasOuterLoops or self._alignedStart(d.leftTerm, d.outerLoopIndices)
+    alignedStartA = not hasOuterLoops or self._alignedStart(d.leftTerm, d.outerLoopIndices, fixed)
     
-    AmemLayout = self._memLayout(d.leftTerm, Im, Ik)
-    BmemLayout = self._memLayout(d.rightTerm, Ik, In)
-    CmemLayout = self._memLayout(d.result, Im, In)
+    AmemLayout = self._memLayout(d.leftTerm, Im, Ik, fixed)
+    BmemLayout = self._memLayout(d.rightTerm, Ik, In, fixed)
+    CmemLayout = self._memLayout(d.result, Im, In, fixed)
 
-    Aeqspp = self._reduce(d.leftTerm, A, AmemLayout)
-    Beqspp = self._reduce(d.rightTerm, B, BmemLayout)
-    Ceqspp = self._reduce(d.result, C, CmemLayout)
+    Aeqspp = self._reduce(d.leftTerm, A, AmemLayout, fixed)
+    Beqspp = self._reduce(d.rightTerm, B, BmemLayout, fixed)
+    Ceqspp = self._reduce(d.result, C, CmemLayout, fixed)
 
     gemmDescr = gemm.Description(
       leftTerm = TensorDescription(innerAname, AmemLayout, Aeqspp, d.leftTerm.is_compute_constant, d.leftTerm.is_temporary),
@@ -88,8 +94,8 @@ def generate(self, cpp, routineCache, gemm_cfg):
       alpha = d.alpha,
       beta = 1.0 if d.add else 0.0,
       arch = self._arch,
-      alignedStartA = self._alignedStart(d.leftTerm, d.outerLoopIndices) and self._alignedStart(d.leftTerm, d.innerLoopIndices),
-      alignedStartC = self._alignedStart(d.result, d.outerLoopIndices) and self._alignedStart(d.result, d.innerLoopIndices),
+      alignedStartA = self._alignedStart(d.leftTerm, d.outerLoopIndices, fixed) and self._alignedStart(d.leftTerm, d.innerLoopIndices, fixed),
+      alignedStartC = self._alignedStart(d.result, d.outerLoopIndices, fixed) and self._alignedStart(d.result, d.innerLoopIndices, fixed),
       prefetchName = innerPrefetchName
     )
     
@@ -105,11 +111,11 @@ def generate(self, cpp, routineCache, gemm_cfg):
     class LoGBody(object):
       def __call__(s):
         if hasInnerLoops:
-          self._pointer(cpp, innerAname, outerAname, d.leftTerm, d.innerLoopIndices)
-          self._pointer(cpp, innerBname, outerBname, d.rightTerm, d.innerLoopIndices)
-          self._pointer(cpp, innerCname, outerCname, d.result, d.innerLoopIndices, const=False)
+          self._pointer(cpp, innerAname, outerAname, d.leftTerm, d.innerLoopIndices, fixed)
+          self._pointer(cpp, innerBname, outerBname, d.rightTerm, d.innerLoopIndices, fixed)
+          self._pointer(cpp, innerCname, outerCname, d.result, d.innerLoopIndices, fixed, const=False)
           if outerPrefetchName is not None:
-            self._pointer(cpp, innerPrefetchName, outerPrefetchName, d.result, d.innerLoopIndices)
+            self._pointer(cpp, innerPrefetchName, outerPrefetchName, d.result, d.innerLoopIndices, fixed)
         generator = gemm.generator(self._arch, gemmDescr, gemm_cfg, self._target)
         return generator.generate(cpp, routineCache)
 
@@ -117,18 +123,60 @@ class InnerLoopBody(object):
       def __call__(s):
         flops = 0
         if hasOuterLoops:
-          self._pointer(cpp, outerAname, d.leftTerm.name, d.leftTerm, d.outerLoopIndices)
-          self._pointer(cpp, outerBname, d.rightTerm.name, d.rightTerm, d.outerLoopIndices)
-          self._pointer(cpp, outerCname, d.result.name, d.result, d.outerLoopIndices, const=False)
+          self._pointer(cpp, outerAname, d.leftTerm.name, d.leftTerm, d.outerLoopIndices, fixed)
+          self._pointer(cpp, outerBname, d.rightTerm.name, d.rightTerm, d.outerLoopIndices, fixed)
+          self._pointer(cpp, outerCname, d.result.name, d.result, d.outerLoopIndices, fixed, const=False)
           if d.prefetchName is not None:
-            self._pointer(cpp, outerPrefetchName, d.prefetchName, d.result, d.outerLoopIndices)
+            self._pointer(cpp, outerPrefetchName, d.prefetchName, d.result, d.outerLoopIndices, fixed)
+
         if d.assignLoopRanges is not None:
           gemmDescr.setBeta(0.0)
-          flops += forLoops(cpp, d.innerLoopIndices, d.assignLoopRanges, LoGBody(), pragmaSimd=False)
+          flops += forLoops(cpp, d.innerLoopIndices, d.assignLoopRanges, LoGBody(), pragmaSimd=False, fixed=fixed)
         if d.addLoopRanges is not None:
           gemmDescr.setBeta(1.0)
-          flops += forLoops(cpp, d.innerLoopIndices, d.addLoopRanges, LoGBody(), pragmaSimd=False)
+          flops += forLoops(cpp, d.innerLoopIndices, d.addLoopRanges, LoGBody(), pragmaSimd=False, fixed=fixed)
         return flops
 
-    return forLoops(cpp, d.outerLoopIndices, d.loopRanges, InnerLoopBody(), pragmaSimd=False)
+    return forLoops(cpp, d.outerLoopIndices, d.loopRanges, InnerLoopBody(), pragmaSimd=False, fixed=fixed)
+
+  def _generateUnroll(self, cpp, routineCache, gemm_cfg, fixed, unroll):
+    d = self._descr
+
+    if len(unroll) == 0:
+      return self._generateSingle(cpp, routineCache, gemm_cfg, fixed)
+    
+    unrollNow = unroll[0]
+
+    rngNow = d.loopRanges[unrollNow]
+
+    result = 0
+    for i in range(rngNow.start, rngNow.stop):
+      fixedNow = dict(fixed)
+      fixedNow[unrollNow] = i
+      result += self._generateUnroll(cpp, routineCache, gemm_cfg, fixedNow, unroll[1:])
+    
+    return result
+
+  def generate(self, cpp, routineCache, gemm_cfg):
+    d = self._descr
+    
+    A = d.leftTerm.indices - d.loopIndices
+    B = d.rightTerm.indices - d.loopIndices
+    C = d.result.indices - d.loopIndices
+    Im = set(A) & set(C)
+    In = set(B) & set(C)
+    Ik = set(A) & set(B)
+
+    toBeUnrolled = d.loopRanges.keys()
+
+    unrollNeeded = set()
+    if d.leftTerm.memoryLayout.isSparse():
+      unrollNeeded |= set(d.leftTerm.indices)
+    if d.rightTerm.memoryLayout.isSparse():
+      unrollNeeded |= set(d.rightTerm.indices)
+    if d.result.memoryLayout.isSparse():
+      unrollNeeded |= set(d.result.indices)
+    
+    toBeUnrolled &= unrollNeeded
 
+    return self._generateUnroll(cpp, routineCache, gemm_cfg, {}, list(toBeUnrolled))
diff --git a/yateto/codegen/product/factory.py b/yateto/codegen/product/factory.py
index 8a85366..3871277 100644
--- a/yateto/codegen/product/factory.py
+++ b/yateto/codegen/product/factory.py
@@ -10,8 +10,8 @@ def __init__(self, alpha, add: bool, result: IndexedTensorDescription, leftTerm:
     self.leftTerm = leftTerm
     self.rightTerm = rightTerm
 
-    self.isACsc = isinstance(self.leftTerm.memoryLayout, CSCMemoryLayout)
-    self.isBCsc = isinstance(self.rightTerm.memoryLayout, CSCMemoryLayout)
+    self.isACsc = self.leftTerm.memoryLayout.isSparse()
+    self.isBCsc = self.rightTerm.memoryLayout.isSparse()
     
     rA = loopRanges(self.leftTerm, self.result.indices)
     rB = loopRanges(self.rightTerm, self.result.indices)
diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py
index d322bd3..2cf384e 100644
--- a/yateto/codegen/visitor.py
+++ b/yateto/codegen/visitor.py
@@ -12,6 +12,8 @@
 from .common import BatchedOperationsAux
 from ..type import Scalar
 
+import numpy as np
+
 SUPPORT_LIBRARY_NAMESPACE = 'yateto'
 CONSTEXPR = 'constexpr'
 STATIC = 'static'
@@ -646,6 +648,8 @@ def generate(cpp, group, memLayout):
       raise NotImplementedError
     
     def listToInitializerList(self, lst):
+      if isinstance(lst, np.ndarray):
+        lst = lst.flatten()
       return '{{{}}}'.format(', '.join([str(l) for l in lst]))
     
     def formatArray(self, numberType, name, values, declarationOnly):
@@ -692,6 +696,24 @@ def arrays(self, cpp, memLayout, arch, namespace, index, numberType, declaration
       cpp(self.formatArray(numberType, namespace + self.ROWIND_NAME + index, memLayout.rowIndex(), declarationOnly))
       cpp(self.formatArray(numberType, namespace + self.COLPTR_NAME + index, memLayout.colPointer(), declarationOnly))
 
+  class PatternMatrixView(TensorView):
+    PATTERN_NAME = 'Pattern'
+    
+    def typename(self, dim, arch):
+      return f'::{SUPPORT_LIBRARY_NAMESPACE}::{type(self).__name__}<{dim}, {arch.typename},{arch.uintTypename}>'
+
+    def generate(self, cpp, memLayout, arch, index):
+      cpp( 'return {}({}, {}, {});'.format(
+          self.typename(len(memLayout.shape()), arch),
+          self.ARGUMENT_NAME,
+          self.listToInitializerList(memLayout.shape()),
+          self.PATTERN_NAME + (index if index is not None else '')
+        )
+      )
+
+    def arrays(self, cpp, memLayout, arch, namespace, index, numberType, declarationOnly):
+      cpp(self.formatArray(numberType, namespace + self.PATTERN_NAME + index, memLayout.pattern(), declarationOnly))
+
   def __init__(self, arch, tensors, scalars):
     self._arch = arch
     self._numberType = '{} const'.format(self._arch.uintTypename)
@@ -734,7 +756,8 @@ def __init__(self, arch, tensors, scalars):
   def _tensorViewGenerator(self, memoryLayout):
     memLayoutMap = {
       'DenseMemoryLayout': self.DenseTensorView,
-      'CSCMemoryLayout': self.CSCMatrixView
+      'CSCMemoryLayout': self.CSCMatrixView,
+      'PatternMemoryLayout': self.PatternMatrixView
     }
     return memLayoutMap[type(memoryLayout).__name__]()
   
diff --git a/yateto/memory.py b/yateto/memory.py
index 9a689c8..dbefe19 100644
--- a/yateto/memory.py
+++ b/yateto/memory.py
@@ -5,6 +5,8 @@
 import numpy as np
 from abc import ABC, abstractmethod
 
+from . import aspp
+
 class MemoryLayout(ABC):
   def __init__(self, shape):
     self._shape = shape
@@ -48,6 +50,36 @@ def __eq__(self, other):
   def isCompatible(self, spp):
     pass
 
+  def _subShape(self, positions):
+    sub = 1
+    for p in positions:
+      sub *= self._shape[p]
+    return sub
+
+  def defuse(self, fusedRange, indices, I):
+    positions = indices.positions(I)
+    s = self._subShape(positions)
+    ranges = dict()
+    start = fusedRange.start
+    stop = fusedRange.stop-1
+    for p in reversed(positions):
+      s //= self._shape[p]
+      b = start // s
+      B = stop // s
+      ranges[ indices[p] ] = Range(b, B+1)
+      start -= b*s
+      stop -= B*s
+    return ranges
+  
+  def notWrittenAddresses(self, writeBB):
+    if writeBB == self._bbox:
+      return []
+
+    assert writeBB in self._bbox
+    re = [range(r.start, r.stop) for r in self._bbox]
+    we = [range(w.start, w.stop) for w in writeBB]
+    return [self.address(e) for e in set(itertools.product(*re)) - set(itertools.product(*we)) if self.hasValue(e)]
+
 class DenseMemoryLayout(MemoryLayout):
   ALIGNMENT_ARCH = None
 
@@ -147,7 +179,7 @@ def requiredReals(self):
     size = self._bbox[-1].size() * self._stride[-1]
     return size
   
-  def addressString(self, indices, I = None, prefix='_'):
+  def addressString(self, indices, I = None, Z = None, prefix='_'):
     if len(self._bbox) == 0:
       return '0'
     if I is None:
@@ -161,7 +193,7 @@ def addressString(self, indices, I = None, prefix='_'):
         a.append('{}*{}{}'.format(self._stride[p], prefix, indices[p]))
     return ' + '.join(a)
 
-  def isAlignedAddressString(self, indices, I = None):
+  def isAlignedAddressString(self, indices, I = None, Z = None):
     if I is None:
       I = set(indices)
     positions = indices.positions(I)
@@ -173,12 +205,6 @@ def isAlignedAddressString(self, indices, I = None):
   def mayFuse(self, positions):
     return all( [self._stride[j] == self._shape[i]*self._stride[i] for i,j in zip(positions[:-1], positions[1:])] )
   
-  def _subShape(self, positions):
-    sub = 1
-    for p in positions:
-      sub *= self._shape[p]
-    return sub
-  
   def _subRange(self, positions):
     start = 0
     stop = 0
@@ -192,7 +218,7 @@ def _subRange(self, positions):
   def _firstStride(self, positions):
     return self._stride[ positions[0] ]
 
-  def vec(self, indices, I):
+  def vec(self, indices, I, Z):
     positionsI = indices.positions(I)
     assert self.mayFuse( indices.positions(I) )
 
@@ -208,7 +234,7 @@ def withDummyDimension(self):
     stride = self._stride + (self._bbox[-1].size() * self._stride[-1],)
     return DenseMemoryLayout(shape, bbox, stride)
 
-  def unfold(self, indices, I, J):
+  def unfold(self, indices, I, J, Z):
     positionsI = indices.positions(I)
     positionsJ = indices.positions(J)
     assert self.mayFuse( indices.positions(I) ) and self.mayFuse( indices.positions(J) )
@@ -221,21 +247,6 @@ def unfold(self, indices, I, J):
     stride = (self._firstStride(positionsI), self._firstStride(positionsJ))
 
     return DenseMemoryLayout(shape, bbox, stride)
-  
-  def defuse(self, fusedRange, indices, I):
-    positions = indices.positions(I)
-    s = self._subShape(positions)
-    ranges = dict()
-    start = fusedRange.start
-    stop = fusedRange.stop-1
-    for p in reversed(positions):
-      s //= self._shape[p]
-      b = start // s
-      B = stop // s
-      ranges[ indices[p] ] = Range(b, B+1)
-      start -= b*s
-      stop -= B*s
-    return ranges
 
   def isCompatible(self, spp):
     return BoundingBox.fromSpp(spp) in self.bbox()
@@ -245,6 +256,13 @@ def __eq__(self, other):
 
   def __str__(self):
     return '{}(shape: {}, bounding box: {}, stride: {})'.format(type(self).__name__, self._shape, self._bbox, self._stride)
+  
+  def isSparse(self):
+    return False
+  
+  def hasValue(self, entry):
+    assert entry in self._bbox
+    return True
 
 class CSCMemoryLayout(MemoryLayout):
   def __init__(self, spp, alignStride=False):
@@ -318,6 +336,16 @@ def address(self, entry):
 
     return start + find[0]
   
+  def hasValue(self, entry):
+    assert entry in self._bbox
+
+    start = self._colPtr[ entry[1] ]
+    stop = self._colPtr[ entry[1]+1 ]
+    subRowInd = self._rowIndex[start:stop]
+ 
+    find = np.where(subRowInd == entry[0])[0]
+    return len(find) == 1
+  
   def subtensorOffset(self, topLeftEntry):
     assert topLeftEntry in self._bbox
     assert topLeftEntry[0] <= self._bbox[0].start
@@ -358,7 +386,186 @@ def isCompatible(self, spp):
   def __eq__(self, other):
     return self._bbox == other._bbox and np.array_equal(self._rowIndex, other._rowIndex) and np.array_equal(self._colPtr, other._colPtr)
 
+  def isSparse(self):
+    return True
+
+
+class PatternMemoryLayout(MemoryLayout):
+  def __init__(self, spp, alignStride=False, pattern=None):
+    super().__init__(spp.shape if spp is not None else pattern.shape)
+
+    if spp is None:
+      spp = aspp.general(pattern != 0)
+
+    self.aligned = alignStride
+
+    self._bbox = BoundingBox.fromSpp(spp)
+    if self.aligned:
+      range0 = self._bbox[0]
+      rnew = Range( DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(range0.start), DenseMemoryLayout.ALIGNMENT_ARCH.alignedUpper(range0.stop) )
+      self._bbox = BoundingBox([rnew] + self._bbox[1:])
+    
+    nonzeros = spp.nonzero()
+    nonzeros = sorted(zip(*nonzeros), key=lambda x: x[::-1])
+
+    if self.aligned:
+      nonzeros_pre = set(nonzeros)
+      for nonzero in nonzeros:
+        lower = DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(nonzero[0])
+        # no alignedUpper call here: avoid reduction to a single element when on alignment boundaries
+        upper = lower + DenseMemoryLayout.ALIGNMENT_ARCH.alignedReals
+
+        for i in range(lower, upper):
+          nonzeros_pre.add(tuple([np.int64(i)] + list(nonzero[1:])))
+      
+      nonzeros = list(nonzeros_pre)
+      nonzeros = sorted(zip(*[[nonzero[i] for nonzero in nonzeros] for i in range(len(self._shape))]), key=lambda x: x[::-1])
+    
+    self._pattern = np.zeros(self._shape, dtype=int, order='F')
+
+    for i, nonzero in enumerate(nonzeros):
+      self._pattern[tuple(nonzero)] = i + 1 if pattern is None else pattern[tuple(nonzero)]
+
+    self._nonzeros = nonzeros
+
+    # TODO: self._next = np.zeros(self._shape, dtype=int, order='F')
+    # point to the top-left entry
+
+  def requiredReals(self):
+    return len(self._nonzeros)
+  
+  def isSparse(self):
+    return True
+
+  def bbox(self):
+    return self._bbox
+
+  def bboxi(self, dim):
+    return self._bbox[dim]
+  
+  def hasValue(self, entry):
+    return self._pattern[tuple(entry)] > 0
+  
+  def address(self, entry):
+    assert entry in self._bbox
+    assert self._pattern[tuple(entry)] > 0
+
+    return self._pattern[tuple(entry)] - 1
+  
+  def subtensorOffset(self, topLeftEntry):
+    assert topLeftEntry in self._bbox
+    
+    return 0
+
+    #assert self._next[tuple(topLeftEntry)] > 0
+
+    #return self._next[tuple(topLeftEntry)] - 1
+
+  def entries(self, *rng):
+    return [tuple(e - r.start for e,r in zip(ex, rng)) for ex in self._nonzeros if
+      all(e >= r.start and e < r.stop for e,r in zip(ex, rng))]
+
+  def alignedStride(self):
+    return self.aligned
+
+  def mayVectorizeDim(self, dim):
+    return dim == 0 and self.aligned
+  
+  def pattern(self):
+    return self._pattern
+
+  @classmethod
+  def fromSpp(cls, spp, **kwargs):
+    return PatternMemoryLayout(spp, **kwargs)
+
+  def __contains__(self, entry):
+    return entry in self._bbox
+
+  def isCompatible(self, spp):
+    comp = self.fromSpp(spp, alignStride=self.aligned)
+
+    bboxOk = comp._bbox in self._bbox
+    sppOk = set(comp.entries(*comp._bbox)).issubset(set(self.entries(*comp._bbox)))
+
+    return bboxOk and sppOk
+  
+  def vec(self, indices, I, Z):
+    positionsI = indices.positions(I)
+    positionsZ = indices.positions(Z)
+
+    # I and Z need to partition perfectly
+
+    error = lambda: None
+    selector = [error for _ in range(len(self._shape))]
+
+    for p, z in zip(positionsZ, Z.values()):
+      selector[p] = z
+    for p in positionsI:
+      selector[p] = slice(None)
+    
+    pattern = self._pattern[tuple(selector)].transpose(positionsI).flatten()
+
+    return PatternMemoryLayout(None, alignStride=self.aligned, pattern=pattern)
+
+  def withDummyDimension(self):
+    pattern = np.expand_dims(self._pattern, -1)
+    return PatternMemoryLayout(None, alignStride=self.aligned, pattern=pattern)
+
+  def unfold(self, indices, I, J, Z):
+    positionsI = indices.positions(I)
+    positionsJ = indices.positions(J)
+    positionsZ = indices.positions(Z)
+
+    if positionsI[0] > positionsJ[0]:
+      positionsJ, positionsI = positionsI, positionsJ
+    
+    error = lambda: None
+    selector = [error for _ in range(len(self._shape))]
+    dimmap = [error for _ in range(len(self._shape))]
+
+    i = 0
+    sizeI = 1
+    sizeJ = 1
+    for p in positionsI:
+      selector[i] = slice(None)
+      dimmap[p] = i
+      sizeI *= self._pattern.shape[p]
+      i += 1
+    for p in positionsJ:
+      selector[i] = slice(None)
+      dimmap[p] = i
+      sizeJ *= self._pattern.shape[p]
+      i += 1
+    for p, z in zip(positionsZ, Z.values()):
+      selector[i] = z
+      dimmap[p] = i
+      i += 1
+
+    pattern = self._pattern.transpose(dimmap)[tuple(selector)].reshape((sizeI, sizeJ))
+
+    return PatternMemoryLayout(None, alignStride=self.aligned, pattern=pattern)
+  
+  def addressString(self, indices, I = None, Z = None, prefix='_'):
+    # handled differently; via unrolling
+    return ''
+
+  def isAlignedAddressString(self, indices, I = None, Z = None):
+    # TODO
+    return self.aligned
+  
+  def mayFuse(self, positions):
+    # we can always generate a new pattern
+    return True
+
+  def __eq__(self, other):
+    return self._bbox == other._bbox and np.array_equal(self._pattern, other._pattern)
+
 class AlignedCSCMemoryLayout:
   @classmethod
   def fromSpp(cls, spp, **kwargs):
     return CSCMemoryLayout(spp, alignStride=True)
+
+class AlignedPatternMemoryLayout:
+  @classmethod
+  def fromSpp(cls, spp, **kwargs):
+    return PatternMemoryLayout(spp, alignStride=True)

From 28f5854414c8fccf4b9569b420ca07e2cc58cfdc Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Tue, 2 Dec 2025 16:59:08 +0100
Subject: [PATCH 04/15] Default-initialize some arrays

---
 include/yateto/TensorView.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h
index f15c438..5948699 100644
--- a/include/yateto/TensorView.h
+++ b/include/yateto/TensorView.h
@@ -47,7 +47,7 @@ namespace yateto {
     }
 
   protected:
-    uint_t m_shape[Dim];
+    uint_t m_shape[Dim]{};
   };
 
   template<typename real_t, typename uint_t>
@@ -113,7 +113,7 @@ namespace yateto {
 
     template<typename F>
     void forall(F&& function) {
-      uint_t entry[Dim];
+      uint_t entry[Dim]{};
       std::copy(m_start, m_start + Dim, entry);
       while (entry[Dim-1] != m_stop[Dim-1]) {
         auto values = &operator[](entry);
@@ -136,7 +136,7 @@ namespace yateto {
 
     template<typename F>
     void forall(F&& function) const {
-      uint_t entry[Dim];
+      uint_t entry[Dim]{};
       std::copy(m_start, m_start + Dim, entry);
       while (entry[Dim-1] != m_stop[Dim-1]) {
         auto values = &operator[](entry);
@@ -158,7 +158,7 @@ namespace yateto {
     }
     
     void setZero() {
-      uint_t entry[Dim];
+      uint_t entry[Dim]{};
       std::copy(m_start, m_start + Dim, entry);
       while (entry[Dim-1] != m_stop[Dim-1]) {
         auto values = &operator[](entry);
@@ -235,7 +235,7 @@ namespace yateto {
     void copyToView(view_t& other) const {
       assert(Dim == other.dim());
       
-      uint_t entry[Dim];
+      uint_t entry[Dim]{};
       for (uint_t d = 0; d < Dim; ++d) {
         assert(this->shape(d) == other.shape(d));
         entry[d] = m_start[d];
@@ -267,9 +267,9 @@ namespace yateto {
     auto subtensor(Entry... entry) -> DenseTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t> const {
       static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension.");
       constexpr auto nSlices = count_slices<uint_t, Entry...>::value;
-      uint_t begin[Dim];
-      uint_t size[nSlices];
-      uint_t stride[nSlices];
+      uint_t begin[Dim]{};
+      uint_t size[nSlices]{};
+      uint_t stride[nSlices]{};
       extractSubtensor(begin, size, stride, entry...);
       DenseTensorView<nSlices, real_t, uint_t> subtensor(&operator[](begin), size, stride);
       return subtensor;
@@ -331,9 +331,9 @@ namespace yateto {
     }
 
     real_t* m_values;
-    uint_t m_start[Dim];
-    uint_t m_stop[Dim];
-    uint_t m_stride[Dim];
+    uint_t m_start[Dim]{};
+    uint_t m_stop[Dim]{};
+    uint_t m_stride[Dim]{};
   };
 
   template<typename real_t, typename uint_t>

From e78a05e3bf166c8f8393a3f5e72e40e8bd5e01fc Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Wed, 3 Dec 2025 05:04:57 +0100
Subject: [PATCH 05/15] Adjust subtensor offset

---
 yateto/memory.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/yateto/memory.py b/yateto/memory.py
index dbefe19..7093af0 100644
--- a/yateto/memory.py
+++ b/yateto/memory.py
@@ -453,9 +453,13 @@ def address(self, entry):
     return self._pattern[tuple(entry)] - 1
   
   def subtensorOffset(self, topLeftEntry):
+    tle = topLeftEntry
     assert topLeftEntry in self._bbox
+
+    subpat = [self._pattern[tle] for ex in self._nonzeros if
+      all(e >= tle[i] for i,e in enumerate(ex))]
     
-    return 0
+    return subpat[0] - 1 if len(subpat) > 0 else 0
 
     #assert self._next[tuple(topLeftEntry)] > 0
 

From b5d10a500f797acb9a8f4124e043e14504c099af Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Mon, 22 Dec 2025 20:58:56 +0100
Subject: [PATCH 06/15] Sparse CSA; initial tests

---
 .github/workflows/yateto-ci.yml        |  2 +-
 tests/code-gen/sparsity.py             | 37 ++++++++++++++++++++++
 yateto/codegen/copyscaleadd/generic.py | 44 ++++++++++++++++++++++----
 3 files changed, 75 insertions(+), 8 deletions(-)
 create mode 100644 tests/code-gen/sparsity.py

diff --git a/.github/workflows/yateto-ci.yml b/.github/workflows/yateto-ci.yml
index b637aba..dfe0bbc 100644
--- a/.github/workflows/yateto-ci.yml
+++ b/.github/workflows/yateto-ci.yml
@@ -73,7 +73,7 @@ jobs:
     - name: Codegen Tests
       run: |
         cd ./tests/code-gen
-        for example in matmul minimal indices slicing; do
+        for example in matmul minimal indices slicing sparsity; do
           for build_type in Debug Release; do
             for precision in single double; do
               echo " ====== Test Config: ======"
diff --git a/tests/code-gen/sparsity.py b/tests/code-gen/sparsity.py
new file mode 100644
index 0000000..4e998c5
--- /dev/null
+++ b/tests/code-gen/sparsity.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+
+from yateto import *
+
+from yateto.memory import *
+
+import numpy as np
+
+def add(g):
+  N = 4
+  A = Tensor('A', (N,), spp=np.array([1,0,0,1]), memoryLayoutClass=PatternMemoryLayout)
+  B = Tensor('B', (N,), spp=np.array([1,0,1,0]), memoryLayoutClass=PatternMemoryLayout)
+  D = Tensor('D', (N,), spp=np.ones((N,)), memoryLayoutClass=PatternMemoryLayout)
+  B2 = Tensor('B2', (N, N), spp=np.ones((N, N)), memoryLayoutClass=PatternMemoryLayout)
+  B3 = Tensor('B3', (N, N), spp=np.ones((N, N)), memoryLayoutClass=CSCMemoryLayout)
+  Z = Tensor('Z', (N, N, N), spp=np.ones((N, N, N)), memoryLayoutClass=PatternMemoryLayout)
+  C = Tensor('C', (N, N))
+  C2 = Tensor('C2', (N, N, N))
+  C3 = Tensor('C3', (N, N, N, N, N, N))
+
+  class Counter:
+    def __init__(self):
+      self.counter = 0
+
+  counter = Counter()
+
+  def _(kernel):
+    counter.counter += 1
+    g.add(f'kernel{counter.counter}', kernel)
+
+  _(C['ab'] <= A['a'])
+  _(C['ab'] <= A['a'] + B['a'])
+  _(C['ab'] <= A['a'] + B['b'])
+  _(C['ab'] <= A['a'] * B['b'])
+  _(C2['abc'] <= A['a'] + B['b'])
+  _(C['ij'] <= B2['ik'] * B3['kj'])
+  _(C2['zij'] <= B2['ik'] * Z['kjz'])
diff --git a/yateto/codegen/copyscaleadd/generic.py b/yateto/codegen/copyscaleadd/generic.py
index 44bf704..a857cd3 100644
--- a/yateto/codegen/copyscaleadd/generic.py
+++ b/yateto/codegen/copyscaleadd/generic.py
@@ -5,24 +5,40 @@ def __init__(self, arch, descr):
     self._arch = arch
     self._descr = descr
   
-  def _formatTerm(self, alpha, term):
+  def _formatTerm(self, alpha, term, entry):
     prefix = ''
     if alpha == 0.0:
       return ''
     if alpha == 1.0:
       prefix = term.name
     else:
-      prefix = '{} * {}'.format(alpha, term.name)
-    return '{}[{}]'.format(prefix, term.memoryLayout.addressString(term.indices))
+      prefix = f'{alpha} * {term.name}'
+
+    if entry is None:
+      return f'{prefix}[{term.memoryLayout.addressString(term.indices)}]'
+    else:
+      if term.memoryLayout.hasValue(entry):
+        return f'{prefix}[{term.memoryLayout.address(entry)}]'
+      else:
+        # needed for some temporaries
+        return self._arch.formatConstant(0.0)
 
   def generate(self, cpp, routineCache):
     d = self._descr
 
     if d.beta == 0.0:
-      writeBB = boundingBoxFromLoopRanges(d.result.indices, d.loopRanges)
-      initializeWithZero(cpp, self._arch, d.result, writeBB)
+      if d.term.memoryLayout.isSparse():
+        initializeWithZero(cpp, self._arch, d.result)
+      else:
+        writeBB = boundingBoxFromLoopRanges(d.result.indices, d.loopRanges)
+        initializeWithZero(cpp, self._arch, d.result, writeBB)
+    
 
     class CopyScaleAddBody(object):
+      def __init__(self, resultEntry, termEntry):
+        self.resultEntry = resultEntry
+        self.termEntry = termEntry
+
       def __call__(s):
         op = '='
         flop = 0
@@ -38,8 +54,22 @@ def __call__(s):
           flop += 1
         elif d.beta != 0.0:
           raise NotImplementedError
-        cpp( '{} {} {};'.format(self._formatTerm(1.0, d.result), op, self._formatTerm(alpha, d.term)) )
+        cpp( f'{self._formatTerm(1.0, d.result, s.resultEntry)} {op} {self._formatTerm(alpha, d.term, s.termEntry)};' )
 
         return flop
 
-    return forLoops(cpp, d.result.indices, d.loopRanges, CopyScaleAddBody())
+    if d.term.memoryLayout.isSparse():
+
+      indexmap = d.result.indices.positions(d.term.indices, sort=False)
+
+      flops = 0
+      nonzeros = d.result.eqspp.nonzero()
+      for entryR in sorted(zip(*nonzeros), key=lambda x: x[::-1]):
+        entry = tuple(entryR[ pos ] for pos in indexmap)
+        flops += CopyScaleAddBody(entryR, entry)()
+      
+      return flops
+
+    else:
+
+      return forLoops(cpp, d.result.indices, d.loopRanges, CopyScaleAddBody(None, None))

From 68997fa5713e0a46e944e28a5de8f8897f5c4184 Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Tue, 23 Dec 2025 00:31:06 +0100
Subject: [PATCH 07/15] Fix argument count

---
 yateto/memory.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/yateto/memory.py b/yateto/memory.py
index d095f68..84e91ec 100644
--- a/yateto/memory.py
+++ b/yateto/memory.py
@@ -290,6 +290,9 @@ def storage(self):
   
   def alignmentOffset(self, dim):
     return 0
+  
+  def equalStride(self, dim):
+    return True
 
 class CSCMemoryLayout(MemoryLayout):
   def __init__(self, spp, alignStride=False):
@@ -352,7 +355,7 @@ def rowIndex(self):
   def colPointer(self):
     return self._colPtr
   
-  def isAlignedAddressString(self, indices, I = None):
+  def isAlignedAddressString(self, indices, I = None, Z = None):
     if I is None:
       I = set(indices)
     positions = indices.positions(I)
@@ -434,6 +437,9 @@ def alignmentOffset(self, dim):
 
   def isSparse(self):
     return True
+  
+  def equalStride(self, dim):
+    return False
 
 
 class PatternMemoryLayout(MemoryLayout):
@@ -609,6 +615,9 @@ def mayFuse(self, positions):
 
   def __eq__(self, other):
     return self._bbox == other._bbox and np.array_equal(self._pattern, other._pattern)
+  
+  def equalStride(self, dim):
+    return False
 
 class AlignedCSCMemoryLayout:
   @classmethod
@@ -678,14 +687,14 @@ def isCompatible(self, spp):
   def mayVectorizeDim(self, dim):
     return self.base.mayVectorizeDim(dim)
   
-  def isAlignedAddressString(self, indices, I = None):
-    return self.base.isAlignedAddressString(indices, I)
+  def isAlignedAddressString(self, indices, I = None, Z = None):
+    return self.base.isAlignedAddressString(indices, I, Z)
   
-  def addressString(self, indices, I = None, prefix='_', offsets=()):
+  def addressString(self, indices, I = None, Z = None, prefix='_', offsets=()):
     if len(offsets) == 0:
       offsets = [0] * len(self._shape)
     newOffsets = tuple(offsets[i] if self.index != i else offsets[i] + self.start for i in range(len(self._shape)))
-    return self.base.addressString(indices, I, prefix, newOffsets)
+    return self.base.addressString(indices, I, Z, prefix, newOffsets)
   
   def subslice(self, index, start, end):
     return MemoryLayoutView(self, index, start, end)

From d5925906cfb0b392d50f7c12e3bce55ac36c27fb Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Mon, 5 Jan 2026 14:57:42 +0100
Subject: [PATCH 08/15] Update tests

---
 tests/code-gen/sparsity.py | 16 +++++++++++++++-
 yateto/memory.py           | 20 +++++++++++---------
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/tests/code-gen/sparsity.py b/tests/code-gen/sparsity.py
index 4e998c5..8366e14 100644
--- a/tests/code-gen/sparsity.py
+++ b/tests/code-gen/sparsity.py
@@ -6,14 +6,25 @@
 
 import numpy as np
 
+def checkerboard(shape):
+  # cf. https://stackoverflow.com/a/51715491
+  return np.indices(tuple(shape)).sum(axis=0) % 2
+
+def invert(pattern):
+  return 1 - pattern
+
 def add(g):
   N = 4
   A = Tensor('A', (N,), spp=np.array([1,0,0,1]), memoryLayoutClass=PatternMemoryLayout)
   B = Tensor('B', (N,), spp=np.array([1,0,1,0]), memoryLayoutClass=PatternMemoryLayout)
   D = Tensor('D', (N,), spp=np.ones((N,)), memoryLayoutClass=PatternMemoryLayout)
+  E = Tensor('E', (N,), spp=checkerboard((N,)), memoryLayoutClass=PatternMemoryLayout)
   B2 = Tensor('B2', (N, N), spp=np.ones((N, N)), memoryLayoutClass=PatternMemoryLayout)
   B3 = Tensor('B3', (N, N), spp=np.ones((N, N)), memoryLayoutClass=CSCMemoryLayout)
+  B4 = Tensor('B4', (N, N), spp=checkerboard((N, N)), memoryLayoutClass=PatternMemoryLayout)
+  B5 = Tensor('B5', (N, N), spp=invert(checkerboard((N, N))), memoryLayoutClass=PatternMemoryLayout)
   Z = Tensor('Z', (N, N, N), spp=np.ones((N, N, N)), memoryLayoutClass=PatternMemoryLayout)
+  C0 = Tensor('C0', (N,))
   C = Tensor('C', (N, N))
   C2 = Tensor('C2', (N, N, N))
   C3 = Tensor('C3', (N, N, N, N, N, N))
@@ -29,9 +40,12 @@ def _(kernel):
     g.add(f'kernel{counter.counter}', kernel)
 
   _(C['ab'] <= A['a'])
-  _(C['ab'] <= A['a'] + B['a'])
   _(C['ab'] <= A['a'] + B['b'])
   _(C['ab'] <= A['a'] * B['b'])
   _(C2['abc'] <= A['a'] + B['b'])
+  _(C2['abc'] <= B4['ac'])
   _(C['ij'] <= B2['ik'] * B3['kj'])
+  _(C['ij'] <= B4['ik'] * B3['kj'])
+  _(C['ij'] <= B4['ik'] * B3['kj'] * B4['ij'])
   _(C2['zij'] <= B2['ik'] * Z['kjz'])
+  _(C0['k'] <= B4['ik'] * A['i'])
diff --git a/yateto/memory.py b/yateto/memory.py
index 84e91ec..1145e35 100644
--- a/yateto/memory.py
+++ b/yateto/memory.py
@@ -480,9 +480,6 @@ def __init__(self, spp, alignStride=False, pattern=None):
 
     self._nonzeros = nonzeros
 
-    # TODO: self._next = np.zeros(self._shape, dtype=int, order='F')
-    # point to the top-left entry
-
   def requiredReals(self):
     return len(self._nonzeros)
   
@@ -508,14 +505,14 @@ def subtensorOffset(self, topLeftEntry):
     tle = topLeftEntry
     assert topLeftEntry in self._bbox
 
-    subpat = [self._pattern[tle] for ex in self._nonzeros if
+    subpat = [self._pattern[ex] for ex in self._nonzeros if
       all(e >= tle[i] for i,e in enumerate(ex))]
     
-    return subpat[0] - 1 if len(subpat) > 0 else 0
+    result = subpat[0] - 1 if len(subpat) > 0 else 0
 
-    #assert self._next[tuple(topLeftEntry)] > 0
+    assert result >= 0
 
-    #return self._next[tuple(topLeftEntry)] - 1
+    return result
 
   def entries(self, *rng):
     return [tuple(e - r.start for e,r in zip(ex, rng)) for ex in self._nonzeros if
@@ -615,9 +612,11 @@ def mayFuse(self, positions):
 
   def __eq__(self, other):
     return self._bbox == other._bbox and np.array_equal(self._pattern, other._pattern)
-  
+
   def equalStride(self, dim):
-    return False
+    # search for: all zeros
+    nzp = (self._pattern != 0)
+    return nzp.sum(axis=dim) == nzp.max(axis=dim) * nzp.shape[dim]
 
 class AlignedCSCMemoryLayout:
   @classmethod
@@ -775,3 +774,6 @@ def alignmentOffset(self, dim):
       newval = val + self.start
       val = newval - DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(newval)
     return val
+  
+  def equalStride(self, dim):
+    return self.base.equalStride(dim)

From d379bea8b426d0086a09551151dd01ebca41fa70 Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Sat, 10 Jan 2026 22:40:05 +0100
Subject: [PATCH 09/15] Fix ordering

---
 yateto/codegen/visitor.py | 2 +-
 yateto/memory.py          | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py
index 1b0c364..ad4c0d0 100644
--- a/yateto/codegen/visitor.py
+++ b/yateto/codegen/visitor.py
@@ -649,7 +649,7 @@ def generate(cpp, group, memLayout):
     
     def listToInitializerList(self, lst):
       if isinstance(lst, np.ndarray):
-        lst = lst.flatten()
+        lst = lst.flatten(order='K')
       return '{{{}}}'.format(', '.join([str(l) for l in lst]))
     
     def formatArray(self, numberType, name, values, declarationOnly):
diff --git a/yateto/memory.py b/yateto/memory.py
index 1145e35..ad88972 100644
--- a/yateto/memory.py
+++ b/yateto/memory.py
@@ -452,7 +452,7 @@ def __init__(self, spp, alignStride=False, pattern=None):
     self.aligned = alignStride
 
     self._bbox = BoundingBox.fromSpp(spp)
-    if self.aligned:
+    if alignStride:
       range0 = self._bbox[0]
       rnew = Range( DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(range0.start), DenseMemoryLayout.ALIGNMENT_ARCH.alignedUpper(range0.stop) )
       self._bbox = BoundingBox([rnew] + self._bbox[1:])
@@ -460,7 +460,7 @@ def __init__(self, spp, alignStride=False, pattern=None):
     nonzeros = spp.nonzero()
     nonzeros = sorted(zip(*nonzeros), key=lambda x: x[::-1])
 
-    if self.aligned:
+    if alignStride:
       nonzeros_pre = set(nonzeros)
       for nonzero in nonzeros:
         lower = DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(nonzero[0])
@@ -473,6 +473,7 @@ def __init__(self, spp, alignStride=False, pattern=None):
       nonzeros = list(nonzeros_pre)
       nonzeros = sorted(zip(*[[nonzero[i] for nonzero in nonzeros] for i in range(len(self._shape))]), key=lambda x: x[::-1])
     
+    # keep everything in F order
     self._pattern = np.zeros(self._shape, dtype=int, order='F')
 
     for i, nonzero in enumerate(nonzeros):
@@ -556,7 +557,7 @@ def vec(self, indices, I, Z):
     for p in positionsI:
       selector[p] = slice(None)
     
-    pattern = self._pattern[tuple(selector)].transpose(positionsI).flatten()
+    pattern = self._pattern[tuple(selector)].transpose(positionsI).flatten(order='F')
 
     return PatternMemoryLayout(None, alignStride=self.aligned, pattern=pattern)
 

From 3af8c4568cd0482c434c69eaf70eacb9effa8994 Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Mon, 1 Dec 2025 04:00:52 +0100
Subject: [PATCH 10/15] Update CMake standard for interface tests

---
 tests/interface/CMakeLists.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/interface/CMakeLists.txt b/tests/interface/CMakeLists.txt
index eb604d7..acadc58 100644
--- a/tests/interface/CMakeLists.txt
+++ b/tests/interface/CMakeLists.txt
@@ -3,6 +3,10 @@ project(inteface-test)
 
 find_package(CxxTest REQUIRED)
 
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
 enable_testing()
 # generate and add an interface test
  add_custom_command(COMMAND ${CXXTEST_PYTHON_TESTGEN_EXECUTABLE} 
@@ -12,5 +16,4 @@ enable_testing()
 
 add_executable(tensor-view-target TensorView.t.cpp)
 target_include_directories(tensor-view-target PUBLIC ${CMAKE_SOURCE_DIR}/../../include ${CXXTEST_INCLUDE_DIRS})
-target_compile_options(tensor-view-target PUBLIC "-std=c++11")
-add_test(tensor-view tensor-view-target)
\ No newline at end of file
+add_test(tensor-view tensor-view-target)

From d785aadcc16336867049b57652bad64aaa1b889f Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Sat, 10 Jan 2026 22:48:02 +0100
Subject: [PATCH 11/15] Extend interface tests

---
 include/yateto/TensorView.h    | 30 +++++++++----
 tests/interface/TensorView.t.h | 79 ++++++++++++++++++++++++++++++----
 yateto/codegen/visitor.py      |  4 +-
 3 files changed, 94 insertions(+), 19 deletions(-)

diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h
index 5948699..f1be097 100644
--- a/include/yateto/TensorView.h
+++ b/include/yateto/TensorView.h
@@ -480,23 +480,35 @@ namespace yateto {
   };
 
   template<unsigned Dim, typename real_t, typename uint_t>
-  class PatternMatrixView : public TensorView<Dim, real_t, uint_t> {
+  class PatternTensorView : public TensorView<Dim, real_t, uint_t> {
   public:
     // TODO: remove const_cast<uint_t*>(pattern)
 
-    explicit PatternMatrixView(real_t* values, std::initializer_list<uint_t> shape, uint_t const* pattern)
+    explicit PatternTensorView(real_t* values, std::initializer_list<uint_t> shape, uint_t const* pattern)
       : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(const_cast<uint_t*>(pattern), shape) {
 
-      m_pattern.forall([&](const auto& index, const auto& idxval) {
-        if (idxval > 0) {
-          ++m_size;
-        }
-      });
+      computeSize();
     }
 
-    explicit PatternMatrixView(real_t* values, uint_t const shape[], uint_t const* pattern)
+    explicit PatternTensorView(real_t* values, uint_t const shape[], uint_t const* pattern)
       : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(const_cast<uint_t*>(pattern), shape) {
 
+      computeSize();
+    }
+
+    explicit PatternTensorView(real_t* values, std::initializer_list<uint_t> shape, DenseTensorView<Dim, uint_t, uint_t> pattern)
+      : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(std::move(pattern)) {
+
+      computeSize();
+    }
+
+    explicit PatternTensorView(real_t* values, uint_t const shape[], DenseTensorView<Dim, uint_t, uint_t> pattern)
+      : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(std::move(pattern)) {
+
+      computeSize();
+    }
+
+    void computeSize() {
       m_pattern.forall([&](const auto& index, const auto& idxval) {
         if (idxval > 0) {
           ++m_size;
@@ -581,7 +593,7 @@ namespace yateto {
     auto subtensor(Entry... entry) const {
       static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension.");
       const auto patternSubtensor = m_pattern.subtensor(entry...);
-      return PatternMatrixView<count_slices<uint_t, Entry...>::value, real_t, uint_t>(m_values, this->m_shape, patternSubtensor);
+      return PatternTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t>(m_values, this->m_shape, patternSubtensor);
     }
 
   protected:
diff --git a/tests/interface/TensorView.t.h b/tests/interface/TensorView.t.h
index 6487c93..c5f1159 100644
--- a/tests/interface/TensorView.t.h
+++ b/tests/interface/TensorView.t.h
@@ -1,24 +1,26 @@
 #include <cxxtest/TestSuite.h>
 #include <yateto/TensorView.h>
 
+#include <cstdint>
+
 using namespace yateto;
 
-class TensorViewTestSuite : public CxxTest::TestSuite
+class DenseTensorViewTestSuite : public CxxTest::TestSuite
 {
 private:
-  double m_data[12];
+  double data_[12];
 
 public:
   void setUp()
   {
     for (int i = 0; i < 12; ++i) {
-      m_data[i] = static_cast<double>(i+1);
+      data_[i] = static_cast<double>(i+1);
     }
   }
 
   void testAccess()
   {
-    DenseTensorView<3, double> tensor(m_data, {3,2,2});
+    DenseTensorView<3, double> tensor(data_, {3,2,2});
     TS_ASSERT_EQUALS(tensor(0,0,0), 1.0);
     TS_ASSERT_EQUALS(tensor(1,1,0), 5.0);
     TS_ASSERT_EQUALS(tensor(2,1,1), 12.0);
@@ -26,7 +28,7 @@ class TensorViewTestSuite : public CxxTest::TestSuite
 
 	void testSubtensor()
 	{
-    DenseTensorView<3, double> tensor(m_data, {3,2,2});
+    DenseTensorView<3, double> tensor(data_, {3,2,2});
     auto sub = tensor.subtensor(1, slice<>(), slice<>());
     TS_ASSERT_EQUALS(sub(0,0), 2.0);
     TS_ASSERT_EQUALS(sub(1,0), 5.0);
@@ -46,14 +48,75 @@ class TensorViewTestSuite : public CxxTest::TestSuite
 
   void testSetZero()
   {
-    DenseTensorView<3, double> tensor(m_data, {3,2,2});
+    DenseTensorView<3, double> tensor(data_, {3,2,2});
     auto sub = tensor.subtensor(1, slice<>(), slice<>());
     sub.setZero();
     for (int i = 0; i < 12; ++i) {
       if ((i-1) % 3 == 0) {
-        TS_ASSERT_EQUALS(m_data[i], 0.0);
+        TS_ASSERT_EQUALS(data_[i], 0.0);
+      } else {
+        TS_ASSERT_EQUALS(data_[i], static_cast<double>(i+1));
+      }
+    }
+  }
+};
+
+
+class PatternTensorViewTestSuite : public CxxTest::TestSuite
+{
+private:
+  double data_[6];
+  uint32_t pattern_[12];
+
+public:
+  void setUp()
+  {
+    for (int i = 0; i < 6; ++i) {
+      data_[i] = static_cast<double>(2*i+1);
+    }
+    for (int i = 0; i < 12; ++i) {
+      pattern_[i] = (i % 2 == 0) ? (i + 1) : 0;
+    }
+  }
+
+  void testAccess()
+  {
+    PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_);
+    TS_ASSERT_EQUALS(tensor(0,0,0), 0.0);
+    TS_ASSERT_EQUALS(tensor(1,1,0), 0.0);
+    TS_ASSERT_EQUALS(tensor(2,1,1), 12.0);
+  }
+
+	void testSubtensor()
+	{
+    PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_);
+    auto sub = tensor.subtensor(1, slice<>(), slice<>());
+    TS_ASSERT_EQUALS(sub(0,0), 2.0);
+    TS_ASSERT_EQUALS(sub(1,0), 0.0);
+    TS_ASSERT_EQUALS(sub(0,1), 8.0);
+    TS_ASSERT_EQUALS(sub(1,1), 0.0);
+
+    auto sub2 = sub.subtensor(1, slice<>());
+    TS_ASSERT_EQUALS(sub2(0), 0.0);
+    TS_ASSERT_EQUALS(sub2(1), 0.0);
+
+    auto sub3 = tensor.subtensor(slice<>(1,3), slice<>(), slice<>()); 
+    TS_ASSERT_EQUALS(sub3(0,0,0), 2.0);
+    TS_ASSERT_EQUALS(sub3(0,1,0), 0.0);
+    TS_ASSERT_EQUALS(sub3(1,0,1), 0.0);
+    TS_ASSERT_EQUALS(sub3(1,1,1), 12.0);
+	}
+
+  void testSetZero()
+  {
+    PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_);
+    auto sub = tensor.subtensor(1, slice<>(), slice<>());
+    sub.setZero();
+    for (int i = 0; i < 12; ++i) {
+      if ((i-1) % 3 == 0 || i % 2 == 1) {
+        TS_ASSERT_EQUALS(data_[i], 0.0);
       } else {
-        TS_ASSERT_EQUALS(m_data[i], static_cast<double>(i+1));
+        TS_ASSERT_EQUALS(data_[i], static_cast<double>(i+1));
       }
     }
   }
diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py
index ad4c0d0..c64d19d 100644
--- a/yateto/codegen/visitor.py
+++ b/yateto/codegen/visitor.py
@@ -696,7 +696,7 @@ def arrays(self, cpp, memLayout, arch, namespace, index, numberType, declaration
       cpp(self.formatArray(numberType, namespace + self.ROWIND_NAME + index, memLayout.rowIndex(), declarationOnly))
       cpp(self.formatArray(numberType, namespace + self.COLPTR_NAME + index, memLayout.colPointer(), declarationOnly))
 
-  class PatternMatrixView(TensorView):
+  class PatternTensorView(TensorView):
     PATTERN_NAME = 'Pattern'
     
     def typename(self, dim, arch):
@@ -757,7 +757,7 @@ def _tensorViewGenerator(self, memoryLayout):
     memLayoutMap = {
       'DenseMemoryLayout': self.DenseTensorView,
       'CSCMemoryLayout': self.CSCMatrixView,
-      'PatternMemoryLayout': self.PatternMatrixView
+      'PatternMemoryLayout': self.PatternTensorView
     }
     return memLayoutMap[type(memoryLayout).__name__]()
   

From 4a47223ac5e031d66caa46056b0146716499bca9 Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Sat, 10 Jan 2026 23:07:36 +0100
Subject: [PATCH 12/15] Un-const `subtensor` function

---
 include/yateto/TensorView.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h
index f1be097..a07546b 100644
--- a/include/yateto/TensorView.h
+++ b/include/yateto/TensorView.h
@@ -206,14 +206,14 @@ namespace yateto {
     }
 
     template<typename... Entry>
-    real_t operator()(Entry... entry) const {
+    const real_t& operator()(Entry... entry) const {
       static_assert(sizeof...(entry) == Dim,
                         "Number of arguments to operator() const does not match the tensor dimension.");
       assert(isInRange(entry...));
       return m_values[address(entry...)];
     }
 
-    real_t operator[](uint_t const entry[Dim]) const {
+    const real_t& operator[](uint_t const entry[Dim]) const {
       uint_t addr = 0;
       for (uint_t d = 0; d < Dim; ++d) {
         assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]);
@@ -264,7 +264,7 @@ namespace yateto {
     }
 
     template<typename... Entry>
-    auto subtensor(Entry... entry) -> DenseTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t> const {
+    auto subtensor(Entry... entry) -> DenseTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t> {
       static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension.");
       constexpr auto nSlices = count_slices<uint_t, Entry...>::value;
       uint_t begin[Dim]{};
@@ -590,7 +590,7 @@ namespace yateto {
     }
 
     template<typename... Entry>
-    auto subtensor(Entry... entry) const {
+    auto subtensor(Entry... entry) {
       static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension.");
       const auto patternSubtensor = m_pattern.subtensor(entry...);
       return PatternTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t>(m_values, this->m_shape, patternSubtensor);

From 41812b1fe21b2d4e40e4ee1477cd8722e2d0b0bd Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Sat, 10 Jan 2026 23:20:50 +0100
Subject: [PATCH 13/15] Fix interface tests

---
 .github/workflows/yateto-ci.yml |  2 +-
 include/yateto/TensorView.h     |  4 +--
 tests/interface/TensorView.t.h  | 55 ++++++++++++++++++++++-----------
 3 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/yateto-ci.yml b/.github/workflows/yateto-ci.yml
index dfe0bbc..d2b0736 100644
--- a/.github/workflows/yateto-ci.yml
+++ b/.github/workflows/yateto-ci.yml
@@ -33,7 +33,7 @@ jobs:
           mkdir -p ./build-${build_type} && cd ./build-${build_type}
           cmake .. -GNinja -DCMAKE_BUILD_TYPE=${build_type}
           ninja
-          ninja test
+          ctest -j --rerun-failed --output-on-failure
           cd ..
         done
 
diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h
index a07546b..990df6c 100644
--- a/include/yateto/TensorView.h
+++ b/include/yateto/TensorView.h
@@ -529,7 +529,7 @@ namespace yateto {
     }
 
     template<typename ...Args>
-    real_t operator()(Args... index) const {
+    const real_t& operator()(Args... index) const {
       static_assert((std::is_integral_v<Args> && ...));
       const auto idx = m_pattern(index...);
       return m_values[idx - 1];
@@ -554,7 +554,7 @@ namespace yateto {
       return m_values[idx - 1];
     }
 
-    real_t operator[](const uint_t entry[Dim]) const {
+    const real_t& operator[](const uint_t entry[Dim]) const {
       const auto idx = m_pattern[entry];
       return m_values[idx - 1];
     }
diff --git a/tests/interface/TensorView.t.h b/tests/interface/TensorView.t.h
index c5f1159..4eb0778 100644
--- a/tests/interface/TensorView.t.h
+++ b/tests/interface/TensorView.t.h
@@ -75,36 +75,55 @@ class PatternTensorViewTestSuite : public CxxTest::TestSuite
       data_[i] = static_cast<double>(2*i+1);
     }
     for (int i = 0; i < 12; ++i) {
-      pattern_[i] = (i % 2 == 0) ? (i + 1) : 0;
+      pattern_[i] = (i % 2 == 0) ? (i / 2 + 1) : 0;
     }
   }
 
+  void testBasic()
+  {
+    PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_);
+    TS_ASSERT_EQUALS(tensor.size(), 6);
+  }
+
   void testAccess()
   {
     PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_);
-    TS_ASSERT_EQUALS(tensor(0,0,0), 0.0);
-    TS_ASSERT_EQUALS(tensor(1,1,0), 0.0);
-    TS_ASSERT_EQUALS(tensor(2,1,1), 12.0);
+    TS_ASSERT(tensor.isInRange(0,0,0));
+    TS_ASSERT_EQUALS(tensor(0,0,0), 1.0);
+    TS_ASSERT(tensor.isInRange(1,1,0));
+    TS_ASSERT_EQUALS(tensor(1,1,0), 5.0);
+    TS_ASSERT(!tensor.isInRange(2,1,1));
+    // TS_ASSERT_EQUALS(tensor(2,1,1), 0.0);
   }
 
 	void testSubtensor()
 	{
     PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_);
     auto sub = tensor.subtensor(1, slice<>(), slice<>());
-    TS_ASSERT_EQUALS(sub(0,0), 2.0);
-    TS_ASSERT_EQUALS(sub(1,0), 0.0);
-    TS_ASSERT_EQUALS(sub(0,1), 8.0);
-    TS_ASSERT_EQUALS(sub(1,1), 0.0);
+    TS_ASSERT(!sub.isInRange(0,0));
+    // TS_ASSERT_EQUALS(sub(0,0), 0.0);
+    TS_ASSERT(sub.isInRange(1,0));
+    TS_ASSERT_EQUALS(sub(1,0), 5.0);
+    TS_ASSERT(!sub.isInRange(0,1));
+    // TS_ASSERT_EQUALS(sub(0,1), 0.0);
+    TS_ASSERT(sub.isInRange(1,1));
+    TS_ASSERT_EQUALS(sub(1,1), 11.0);
 
     auto sub2 = sub.subtensor(1, slice<>());
-    TS_ASSERT_EQUALS(sub2(0), 0.0);
-    TS_ASSERT_EQUALS(sub2(1), 0.0);
+    TS_ASSERT(sub2.isInRange(0));
+    TS_ASSERT_EQUALS(sub2(0), 5.0);
+    TS_ASSERT(sub2.isInRange(1));
+    TS_ASSERT_EQUALS(sub2(1), 11.0);
 
-    auto sub3 = tensor.subtensor(slice<>(1,3), slice<>(), slice<>()); 
-    TS_ASSERT_EQUALS(sub3(0,0,0), 2.0);
-    TS_ASSERT_EQUALS(sub3(0,1,0), 0.0);
-    TS_ASSERT_EQUALS(sub3(1,0,1), 0.0);
-    TS_ASSERT_EQUALS(sub3(1,1,1), 12.0);
+    auto sub3 = tensor.subtensor(slice<>(1,3), slice<>(), slice<>());
+    TS_ASSERT(!sub3.isInRange(0,0,0));
+    // TS_ASSERT_EQUALS(sub3(0,0,0), 0.0);
+    TS_ASSERT(sub3.isInRange(0,1,0));
+    TS_ASSERT_EQUALS(sub3(0,1,0), 5.0);
+    TS_ASSERT(sub3.isInRange(1,0,1));
+    TS_ASSERT_EQUALS(sub3(1,0,1), 9.0);
+    TS_ASSERT(!sub3.isInRange(1,1,1));
+    // TS_ASSERT_EQUALS(sub3(1,1,1), 0.0);
 	}
 
   void testSetZero()
@@ -112,11 +131,11 @@ class PatternTensorViewTestSuite : public CxxTest::TestSuite
     PatternTensorView<3, double, uint32_t> tensor(data_, {3,2,2}, pattern_);
     auto sub = tensor.subtensor(1, slice<>(), slice<>());
     sub.setZero();
-    for (int i = 0; i < 12; ++i) {
-      if ((i-1) % 3 == 0 || i % 2 == 1) {
+    for (int i = 0; i < 6; ++i) {
+      if ((2*i-1) % 3 == 0) {
         TS_ASSERT_EQUALS(data_[i], 0.0);
       } else {
-        TS_ASSERT_EQUALS(data_[i], static_cast<double>(i+1));
+        TS_ASSERT_EQUALS(data_[i], static_cast<double>(2*i+1));
       }
     }
   }

From 12877e7391dc87af26da700af0040d4bce325df1 Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Sat, 10 Jan 2026 23:48:26 +0100
Subject: [PATCH 14/15] Test and (mostly) fix unrolling

---
 tests/code-gen/sparsity.py     | 15 ++++++++++++++-
 yateto/ast/indices.py          |  3 +++
 yateto/codegen/gemm/gemmgen.py |  2 +-
 yateto/codegen/gemm/generic.py | 14 +++++++-------
 yateto/codegen/log/generic.py  |  6 ------
 yateto/memory.py               | 32 +++++++++++++++++++++-----------
 6 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/tests/code-gen/sparsity.py b/tests/code-gen/sparsity.py
index 8366e14..dec5a68 100644
--- a/tests/code-gen/sparsity.py
+++ b/tests/code-gen/sparsity.py
@@ -15,6 +15,7 @@ def invert(pattern):
 
 def add(g):
   N = 4
+  M = 2
   A = Tensor('A', (N,), spp=np.array([1,0,0,1]), memoryLayoutClass=PatternMemoryLayout)
   B = Tensor('B', (N,), spp=np.array([1,0,1,0]), memoryLayoutClass=PatternMemoryLayout)
   D = Tensor('D', (N,), spp=np.ones((N,)), memoryLayoutClass=PatternMemoryLayout)
@@ -27,7 +28,15 @@ def add(g):
   C0 = Tensor('C0', (N,))
   C = Tensor('C', (N, N))
   C2 = Tensor('C2', (N, N, N))
-  C3 = Tensor('C3', (N, N, N, N, N, N))
+
+  YA = Tensor('YA', (M, M, M), spp=checkerboard((M, M, M)), memoryLayoutClass=PatternMemoryLayout)
+  YB = Tensor('YB', (M, M, M), spp=checkerboard((M, M, M)), memoryLayoutClass=PatternMemoryLayout)
+  YC = Tensor('YC', (M, M, M))
+  C4 = Tensor('C4', (M, M, M, M))
+
+  ZA = Tensor('ZA', (M, M, M, M, M, M), spp=checkerboard((M, M, M, M, M, M)), memoryLayoutClass=PatternMemoryLayout)
+  ZB = Tensor('ZB', (M, M, M, M, M, M), spp=invert(checkerboard((M, M, M, M, M, M))), memoryLayoutClass=PatternMemoryLayout)
+  ZC = Tensor('ZC', (M, M, M, M, M, M))
 
   class Counter:
     def __init__(self):
@@ -49,3 +58,7 @@ def _(kernel):
   _(C['ij'] <= B4['ik'] * B3['kj'] * B4['ij'])
   _(C2['zij'] <= B2['ik'] * Z['kjz'])
   _(C0['k'] <= B4['ik'] * A['i'])
+
+  _(C4['ijXY'] <= YA['Zik'] * YB['kXY'] * YC['Zkj'])
+  _(ZC['abcxyz'] <= ZA['abcijk'] * ZB['ijkxyz'])
+  _(ZC['abcxyz'] <= ZA['aibjck'] * ZB['zkyjxi'])
diff --git a/yateto/ast/indices.py b/yateto/ast/indices.py
index 6f2ab02..80c6e00 100644
--- a/yateto/ast/indices.py
+++ b/yateto/ast/indices.py
@@ -44,6 +44,9 @@ def positions(self, I, sort=True):
       return sorted(pos)
     return pos
   
+  def positionsIncomplete(self, I):
+    return [self.find(i) for i in I if i in self]
+  
   def __eq__(self, other):
     return other != None and self._indices == other._indices and self._size == other._size
     
diff --git a/yateto/codegen/gemm/gemmgen.py b/yateto/codegen/gemm/gemmgen.py
index 35b0b95..3eb5a96 100644
--- a/yateto/codegen/gemm/gemmgen.py
+++ b/yateto/codegen/gemm/gemmgen.py
@@ -109,7 +109,7 @@ def generate(self, cpp, routineCache):
     if d.isBCsc:
       sppB = d.rightTerm.memoryLayout.entries(k, n)
       sppBRows = d.rightTerm.memoryLayout.shape()[0]
-    
+
     if d.isACsc and d.isBCsc:
       # count the flops by splitting into outer products (i.e. partition by k)
       # for each outer product, we need to compute all-by-all nonzero entries for m and n
diff --git a/yateto/codegen/gemm/generic.py b/yateto/codegen/gemm/generic.py
index 7d9d92d..7b6f0f7 100644
--- a/yateto/codegen/gemm/generic.py
+++ b/yateto/codegen/gemm/generic.py
@@ -61,10 +61,10 @@ def _generateSparseSparse(self, cpp):
     Caccess = self._accessFun(d.result, (m.start, n.start), False, False)
 
     rows, cols = (k, m) if d.transA else (m, k)
-    sppA = d.leftTerm.memoryLayout.entries(rows, cols)
+    sppA = d.leftTerm.memoryLayout.entriesRel(rows, cols)
 
     rows, cols = (n, k) if d.transB else (k, n)
-    sppB = d.rightTerm.memoryLayout.entries(rows, cols)
+    sppB = d.rightTerm.memoryLayout.entriesRel(rows, cols)
 
     if d.beta != 1.0:
       with cpp.For(f'int n = 0; n < {n.size()}; ++n'):
@@ -80,9 +80,9 @@ def _generateSparseSparse(self, cpp):
     # note that we explicitly count all nonzero operations
 
     nzcount = 0
-    for idxA, entry in enumerate(sppA):
+    for idxA, entry in sppA:
       eA = entry[::-1] if d.transA else entry
-      for idxB, entry in enumerate(sppB):
+      for idxB, entry in sppB:
         eB = entry[::-1] if d.transB else entry
 
         if eA[1] == eB[0]:
@@ -112,7 +112,7 @@ def _generateSparseDense(self, cpp):
 
     if d.isACsc:
       rows, cols = (k, m) if d.transA else (m, k)
-      spp = d.leftTerm.memoryLayout.entries(rows, cols)
+      spp = d.leftTerm.memoryLayout.entriesRel(rows, cols)
       sparse = Aaccess
       result = lambda e: Caccess(e[0], self.OUTER_INDEX)
       dense = lambda e: Baccess(e[1], self.OUTER_INDEX)
@@ -120,7 +120,7 @@ def _generateSparseDense(self, cpp):
       trans = d.transA
     elif d.isBCsc:
       rows, cols = (n, k) if d.transB else (k, n)
-      spp = d.rightTerm.memoryLayout.entries(rows, cols)
+      spp = d.rightTerm.memoryLayout.entriesRel(rows, cols)
       sparse = Baccess
       result = lambda e: Caccess(self.OUTER_INDEX, e[1])
       dense = lambda e: Aaccess(self.OUTER_INDEX, e[0])
@@ -137,7 +137,7 @@ def _generateSparseDense(self, cpp):
               ' * ' + CAddr if d.beta != 0.0 else ''
             )
           )
-      for idx, entry in enumerate(spp):
+      for idx, entry in spp:
         e = entry[::-1] if trans else entry
         if e[0] < sizes[0] and e[1] < sizes[1]:
           cpp( '{result} += {alpha} * {dense} * {sparse};'.format(
diff --git a/yateto/codegen/log/generic.py b/yateto/codegen/log/generic.py
index 8d0d527..8e81078 100644
--- a/yateto/codegen/log/generic.py
+++ b/yateto/codegen/log/generic.py
@@ -31,12 +31,6 @@ def _memLayout(self, term, I, J, fixed):
     elif len(term.indices) == 2:
       return term.memoryLayout
     return term.memoryLayout.unfold(term.indices, I, J, fixed)
-  
-  def _unroll(self, term, I):
-    if not term.memoryLayout.isSparse():
-      return set()
-
-    return I
 
   def _reduce(self, term, subset, memLayout, fixed):
     return reduceSpp(term.eqspp, term.indices, subset, fixed).reshape(memLayout.shape())
diff --git a/yateto/memory.py b/yateto/memory.py
index ad88972..8d7feda 100644
--- a/yateto/memory.py
+++ b/yateto/memory.py
@@ -394,6 +394,10 @@ def entries(self, rowRange, colRange):
     for col in colRange:
       e.extend([(self._rowIndex[i]-rowRange.start, col-colRange.start) for i in range(self._colPtr[col], self._colPtr[col+1])])
     return e
+  
+  def entriesRel(self, *rng):
+    entries = self.entries(*rng)
+    return list(enumerate(entries))
 
   def alignedStride(self):
     return self.aligned
@@ -479,7 +483,7 @@ def __init__(self, spp, alignStride=False, pattern=None):
     for i, nonzero in enumerate(nonzeros):
       self._pattern[tuple(nonzero)] = i + 1 if pattern is None else pattern[tuple(nonzero)]
 
-    self._nonzeros = nonzeros
+    self._nonzeros = list(nonzeros)
 
   def requiredReals(self):
     return len(self._nonzeros)
@@ -519,6 +523,11 @@ def entries(self, *rng):
     return [tuple(e - r.start for e,r in zip(ex, rng)) for ex in self._nonzeros if
       all(e >= r.start and e < r.stop for e,r in zip(ex, rng))]
 
+  def entriesRel(self, *rng):
+    offset = self.subtensorOffset(tuple(r.start for r in rng))
+    return [(self._pattern[ex] - 1 - offset, tuple(e - r.start for e,r in zip(ex, rng))) for ex in self._nonzeros if
+      all(e >= r.start and e < r.stop for e,r in zip(ex, rng))]
+
   def alignedStride(self):
     return self.aligned
 
@@ -545,7 +554,7 @@ def isCompatible(self, spp):
   
   def vec(self, indices, I, Z):
     positionsI = indices.positions(I)
-    positionsZ = indices.positions(Z)
+    positionsZ = indices.positionsIncomplete(Z)
 
     # I and Z need to partition perfectly
 
@@ -568,7 +577,7 @@ def withDummyDimension(self):
   def unfold(self, indices, I, J, Z):
     positionsI = indices.positions(I)
     positionsJ = indices.positions(J)
-    positionsZ = indices.positions(Z)
+    positionsZ = indices.positionsIncomplete(Z)
 
     if positionsI[0] > positionsJ[0]:
       positionsJ, positionsI = positionsI, positionsJ
@@ -595,7 +604,7 @@ def unfold(self, indices, I, J, Z):
       dimmap[p] = i
       i += 1
 
-    pattern = self._pattern.transpose(dimmap)[tuple(selector)].reshape((sizeI, sizeJ))
+    pattern = self._pattern.transpose(dimmap)[tuple(selector)].reshape((sizeI, sizeJ), order='F')
 
     return PatternMemoryLayout(None, alignStride=self.aligned, pattern=pattern)
   
@@ -618,6 +627,9 @@ def equalStride(self, dim):
     # search for: all zeros
     nzp = (self._pattern != 0)
     return nzp.sum(axis=dim) == nzp.max(axis=dim) * nzp.shape[dim]
+  
+  def alignmentOffset(self, dim):
+    return 0
 
 class AlignedCSCMemoryLayout:
   @classmethod
@@ -755,13 +767,11 @@ def storage(self):
   def permuted(self, permutation):
     return MemoryLayoutView(self.base.permuted(permutation), permutation[self.index], self.start, self.end)
   
-  def entries(self, rowRange, colRange):
-    if self.index == 0:
-      return self.base.entries(Range(rowRange.start + self.start, rowRange.stop + self.start), colRange)
-    elif self.index == 1:
-      return self.base.entries(rowRange, Range(colRange.start + self.start, colRange.stop + self.start))
-    else:
-      raise NotImplementedError()
+  def entries(self, *rng):
+    return self.base.entries([Range(r.start + self.start, r.stop + self.start) if self.index == i else r for i,r in enumerate(rng)])
+  
+  def entriesRel(self, *rng):
+    return self.base.entriesRel([Range(r.start + self.start, r.stop + self.start) if self.index == i else r for i,r in enumerate(rng)])
   
   def mayFuse(self, positions):
     return (self.index not in positions or positions[-1] == self.index) and self.base.mayFuse(positions)

From 90b4ce98d077c078c6a00407eff1421dad6902c5 Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Fri, 13 Feb 2026 19:01:34 +0100
Subject: [PATCH 15/15] Adjust the `PatternTensorView` to support const

---
 include/yateto/TensorView.h | 40 ++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 16 deletions(-)

diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h
index 002a4f1..431b23e 100644
--- a/include/yateto/TensorView.h
+++ b/include/yateto/TensorView.h
@@ -500,37 +500,38 @@ namespace yateto {
     uint_t const* m_colPtr;
   };
 
-  template<unsigned Dim, typename real_t, typename uint_t>
+  template<unsigned Dim, typename real_t, typename uint_t, bool Const = false>
   class PatternTensorView : public TensorView<Dim, real_t, uint_t> {
   public:
-    // TODO: remove const_cast<uint_t*>(pattern)
+    using data_t = std::conditional_t<Const, const real_t*, real_t*>;
+    using dataref_t = std::conditional_t<Const, const real_t&, real_t&>;
 
-    explicit PatternTensorView(real_t* values, std::initializer_list<uint_t> shape, uint_t const* pattern)
-      : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(const_cast<uint_t*>(pattern), shape) {
+    explicit PatternTensorView(data_t values, std::initializer_list<uint_t> shape, uint_t const* pattern)
+      : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(pattern, shape) {
 
       computeSize();
     }
 
-    explicit PatternTensorView(real_t* values, uint_t const shape[], uint_t const* pattern)
-      : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(const_cast<uint_t*>(pattern), shape) {
+    explicit PatternTensorView(data_t values, uint_t const shape[], uint_t const* pattern)
+      : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(pattern, shape) {
 
       computeSize();
     }
 
-    explicit PatternTensorView(real_t* values, std::initializer_list<uint_t> shape, DenseTensorView<Dim, uint_t, uint_t> pattern)
+    explicit PatternTensorView(data_t values, std::initializer_list<uint_t> shape, DenseTensorView<Dim, uint_t, uint_t, true> pattern)
       : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(std::move(pattern)) {
 
       computeSize();
     }
 
-    explicit PatternTensorView(real_t* values, uint_t const shape[], DenseTensorView<Dim, uint_t, uint_t> pattern)
+    explicit PatternTensorView(data_t values, uint_t const shape[], DenseTensorView<Dim, uint_t, uint_t, true> pattern)
       : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_pattern(std::move(pattern)) {
 
       computeSize();
     }
 
     void computeSize() {
-      m_pattern.forall([&](const auto& index, const auto& idxval) {
+      m_pattern.forall([&](const auto& /*index*/, const auto& idxval) {
         if (idxval > 0) {
           ++m_size;
         }
@@ -542,7 +543,7 @@ namespace yateto {
     }
 
     void setZero() {
-      m_pattern.forall([&](const auto& index, const auto& idxval) {
+      m_pattern.forall([&](const auto& /*index*/, const auto& idxval) {
         if (idxval > 0) {
           m_values[idxval - 1] = 0;
         }
@@ -557,7 +558,7 @@ namespace yateto {
     }
 
     template<typename ...Args>
-    real_t& operator()(Args... index) {
+    dataref_t operator()(Args... index) {
       static_assert((std::is_integral_v<Args> && ...));
       const auto idx = m_pattern(index...);
       return m_values[idx - 1];
@@ -570,7 +571,7 @@ namespace yateto {
       return idx > 0;
     }
 
-    real_t& operator[](const uint_t entry[Dim]) {
+    dataref_t operator[](const uint_t entry[Dim]) {
       const auto idx = m_pattern[entry];
       return m_values[idx - 1];
     }
@@ -599,7 +600,7 @@ namespace yateto {
     }
 
     template<class view_t>
-    void copyToView(view_t& other) {
+    void copyToView(view_t& other) const {
       m_pattern.forall([&](const auto& index, const auto& idxval) {
         if (idxval > 0) {
           other[index] = m_values[idxval - 1];
@@ -614,12 +615,19 @@ namespace yateto {
     auto subtensor(Entry... entry) {
       static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension.");
       const auto patternSubtensor = m_pattern.subtensor(entry...);
-      return PatternTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t>(m_values, this->m_shape, patternSubtensor);
+      return PatternTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t, Const>(m_values, this->m_shape, patternSubtensor);
+    }
+
+    template<typename... Entry>
+    auto subtensor(Entry... entry) const {
+      static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension.");
+      const auto patternSubtensor = m_pattern.subtensor(entry...);
+      return PatternTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t, true>(m_values, this->m_shape, patternSubtensor);
     }
 
   protected:
-    real_t* m_values{nullptr};
-    DenseTensorView<Dim, uint_t, uint_t> m_pattern;
+    data_t m_values{nullptr};
+    DenseTensorView<Dim, uint_t, uint_t, true> m_pattern;
     std::size_t m_size{0};
   };
 } // namespace yateto