diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..79aabf1
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,131 @@
+# SPDX-FileCopyrightText: 2021 SeisSol Group
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+Language:        Cpp
+AccessModifierOffset: -4
+AlignAfterOpenBracket: Align
+AlignConsecutiveMacros: false
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlines: Right
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortLambdasOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BraceWrapping:
+  AfterCaseLabel:  false
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     100
+CommentPragmas:  '^ IWYU pragma:'
+CompactNamespaces: false
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IncludeBlocks:   Regroup
+IncludeCategories:
+  # keep the doctest headers in front
+  - Regex:           '^(<|")doctest'
+    Priority:        1
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        3
+  - Regex:           '^(<|"(gtest|gmock|isl|json)/)'
+    Priority:        4
+  - Regex:           '.*'
+    Priority:        2
+IncludeIsMainRegex: '(Test)?$'
+IndentCaseLabels: false
+IndentPPDirectives: None
+IndentWidth:     2
+IndentWrappedFunctionNames: true
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Left
+QualifierAlignment: Left
+ReflowComments:  true
+SortIncludes:    true
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        c++17
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth:        8
+UseTab:          Never
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 0000000..240b03b
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: 2021 SeisSol Group
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+# apply clang-format/basic pre-commit
+511bc1513e3a6d5ceb4b338750f57253ec429e9b
+# update clang-format config
+fb0b99c3454b5dad40ae0cff13099e4e8a5bf894
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 0000000..1fa3922
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,18 @@
+# SPDX-FileCopyrightText: 2025 SeisSol Group
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SPDX-LicenseComments: Full text under /LICENSE and /LICENSES/
+#
+# SPDX-FileContributor: Author lists in /AUTHORS and /CITATION.cff
+
+name: pre-commit
+on:
+  - push
+
+jobs:
+  pre-commit:
+    name: pre-commit
+    runs-on: ubuntu-24.04
+    steps:
+      - uses: actions/checkout@v6
+      - uses: pre-commit/action@v3.0.1
diff --git a/.github/workflows/yateto-ci.yml b/.github/workflows/yateto-cpu.yml
similarity index 92%
rename from .github/workflows/yateto-ci.yml
rename to .github/workflows/yateto-cpu.yml
index b637aba..e9c719a 100644
--- a/.github/workflows/yateto-ci.yml
+++ b/.github/workflows/yateto-cpu.yml
@@ -1,23 +1,23 @@
-name: Yateto CI
+name: yateto-cpu
 
 on: push
 
 jobs:
   general:
-    runs-on: ubuntu-latest
-    container: 
+    runs-on: ubuntu-24.04
+    container:
       image: seissol/gha-cpu:davschneller-gpu-image
     steps:
-    - uses: actions/checkout@v5
+    - uses: actions/checkout@v6
 
     - name: Install Yateto
       run: |
         pip3 install -e . --break-system-packages
-    
+
     - name: Python Tests
       run: |
         python3 -m unittest tests/internals/*.py
-    
+
     - name: install-packages
       run: |
         apt-get update -y
@@ -25,7 +25,7 @@ jobs:
         add-apt-repository ppa:deadsnakes/ppa
         apt-get update -y
         apt-get install -y cxxtest
-        
+
     - name: Interface Tests
       run: |
         cd ./tests/interface
@@ -38,8 +38,8 @@ jobs:
         done
 
   codegen:
-    runs-on: ubuntu-latest
-    container: 
+    runs-on: ubuntu-24.04
+    container:
       image: seissol/gha-cpu:davschneller-gpu-image
     env:
       CTEST_OUTPUT_ON_FAILURE: 1
@@ -47,14 +47,14 @@ jobs:
       fail-fast: false
       matrix:
         generator: [none, Eigen, LIBXSMM, LIBXSMM_JIT, OpenBLAS, PSpaMM]
-        
+
     steps:
-    - uses: actions/checkout@v5
+    - uses: actions/checkout@v6
 
     - name: Install Yateto
       run: |
         pip3 install -e . --break-system-packages
-        
+
     - name: install-packages
       run: |
         apt-get update -y
@@ -62,7 +62,7 @@ jobs:
         add-apt-repository ppa:deadsnakes/ppa
         apt-get update -y
         apt-get install -y cxxtest
-    
+
     - name: install-libxsmm
       if: ${{ matrix.generator == 'LIBXSMM_JIT' }}
       run: |
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..37ca028
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,86 @@
+# SPDX-FileCopyrightText: 2025 SeisSol Group
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+---
+
+repos:
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v6.0.0
+    hooks:
+    -   id: check-merge-conflict
+        name: '[GENERIC] merge conflict check'
+    -   id: check-symlinks
+        name: '[GENERIC] symlink check'
+    -   id: destroyed-symlinks
+        name: '[GENERIC] detect broken symlinks'
+    -   id: detect-private-key
+        name: '[GENERIC] detect private keys uploaded by accident'
+    -   id: check-case-conflict
+        name: '[GENERIC] detect OS file naming case conflicts'
+    -   id: check-executables-have-shebangs
+        name: '[GENERIC] check for shebangs in executable files'
+    -   id: check-illegal-windows-names
+        name: '[GENERIC] detect illegal Windows file names'
+    -   id: check-json
+        name: '[JSON] check'
+    -   id: check-xml
+        name: '[XML] check'
+
+-   repo: https://github.com/DavidAnson/markdownlint-cli2
+    rev: v0.18.1
+    hooks:
+    -   id: markdownlint-cli2
+        name: '[MARKDOWN] lint'
+
+#-   repo: https://github.com/fsfe/reuse-tool
+#    rev: v6.0.0
+#    hooks:
+#    -   id: reuse
+#        name: '[GENERIC] REUSE compatibility'
+
+#-   repo: https://github.com/psf/black-pre-commit-mirror
+#    rev: 25.1.0
+#    hooks:
+#    -   id: black
+#        files: ^(?!preprocessing|postprocessing)
+#        name: '[PYTHON] black'
+#-   repo: https://github.com/pycqa/isort
+#    rev: 6.0.1
+#    hooks:
+#    -   id: isort
+#        files: ^(?!preprocessing|postprocessing)
+#        args: ["--profile", "black"]
+#        name: '[PYTHON] isort'
+-   repo: https://github.com/pycqa/bandit
+    rev: 1.8.6
+    hooks:
+    -   id: bandit
+        args: ["--confidence-level", "high", "--severity-level", "high"]
+        name: '[PYTHON] bandit'
+#-   repo: https://github.com/pycqa/flake8
+#    rev: '7.3.0'
+#    hooks:
+#    -   id: flake8
+#        files: ^(?!preprocessing|postprocessing)
+#        name: '[PYTHON] Flake8'
+
+-   repo: https://github.com/sphinx-contrib/sphinx-lint
+    rev: 'v1.0.0'
+    hooks:
+    -   id: sphinx-lint
+        name: '[SPHINX/RST] sphinx lint'
+
+-   repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: 'v21.1.0'
+    hooks:
+    -   id: clang-format
+        name: '[C++] clang-format'
+
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v6.0.0
+    hooks:
+    -   id: end-of-file-fixer
+        name: '[GENERIC] newline eof'
+    -   id: trailing-whitespace
+        name: '[GENERIC] remove trailing whitespace'
diff --git a/README.md b/README.md
index 3919b6a..90510e2 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,9 @@
 # YATeTo
 
-It is **Y**et **A**nother **Te**nsor **To**olbox for discontinuous Galerkin methods and other 
-applications. You can find much more information about the package 
-[here](https://arxiv.org/abs/1903.11521).
+It is **Y**et **A**nother **Te**nsor **To**olbox for
+discontinuous Galerkin methods and other
+applications. You can find much more information about the package
+[in this paper](https://arxiv.org/abs/1903.11521).
 
 ## Installation
 
@@ -24,7 +25,7 @@ def add(g):
   B = Tensor('B', (N, N, N))
   w = Tensor('w', (N,))
   C = Tensor('C', (N, N))
-  
+
   kernel = C['ij'] <= 2.0 * C['ij'] + A['lj'] * B['ikl'] * w['k']
   g.add(name='kernel', ast=kernel)
 
diff --git a/examples/SConstruct b/examples/SConstruct
index 3b37d18..18a0fb2 100644
--- a/examples/SConstruct
+++ b/examples/SConstruct
@@ -26,7 +26,7 @@ if 'CXX' in env['ENV']:
 
 # generate help text
 Help(vars.GenerateHelpText(env))
-  
+
 # handle unknown, maybe misspelled variables
 unknownVariables = vars.UnknownVariables()
 
diff --git a/examples/common/Stopwatch.h b/examples/common/Stopwatch.h
index 7d7a5f1..064b1d9 100644
--- a/examples/common/Stopwatch.h
+++ b/examples/common/Stopwatch.h
@@ -3,8 +3,10 @@
  * This file is part of SeisSol.
  *
  * @author Alexander Heinecke (Alexander.Heinecke@mytum.de)
- * @author Sebastian Rettenberger (sebastian.rettenberger AT tum.de, http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger)
- * @author Carsten Uphoff (c.uphoff AT tum.de, http://www5.in.tum.de/wiki/index.php/Carsten_Uphoff,_M.Sc.)
+ * @author Sebastian Rettenberger (sebastian.rettenberger AT tum.de,
+ * http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger)
+ * @author Carsten Uphoff (c.uphoff AT tum.de,
+ * http://www5.in.tum.de/wiki/index.php/Carsten_Uphoff,_M.Sc.)
  *
  * @section LICENSE
  * Copyright (c) 2016-2017, SeisSol Group
@@ -48,97 +50,81 @@
 /**
  * Stopwatch
  *
- * Part of SeisSol, so you can easily calculate the needed time of SeisSol computations with a high precision
+ * Part of SeisSol, so you can easily calculate the needed time of SeisSol
+ * computations with a high precision
  */
-class Stopwatch
-{
-private:
-	struct timespec m_start;
-
-	/** Time already spent */
-	long long m_time;
-  
+class Stopwatch {
+  private:
+  struct timespec m_start;
+
+  /** Time already spent */
+  long long m_time;
+
   /** Returns the time difference in nanoseconds. */
-  long long difftime(struct timespec const& end)
-  {
+  long long difftime(struct timespec const& end) {
     return 1000000000L * (end.tv_sec - m_start.tv_sec) + end.tv_nsec - m_start.tv_nsec;
   }
-  
-  double seconds(long long time) 
-  {
-    return 1.0e-9 * time;
-  }
 
-public:
-	/**
-	 * Constructor
-	 *
-	 * resets the Stopwatch
-	 */
-	Stopwatch() : m_time(0)
-  {}
-
-	/**
-	 * Destructor
-	 */
-	~Stopwatch()
-	{}
-
-	/**
-	 * Reset the stopwatch to zero
-	 */
-	void reset()
-	{
-		m_time = 0;
-	}
-
-	/**
-	 * starts the time measuring
-	 */
-	void start()
-	{
-		clock_gettime(CLOCK_MONOTONIC, &m_start);
-	}
-
-	/**
-	 * get time measuring
-	 *
-	 * @return measured time (until now) in seconds
-	 */
-	double split()
-	{
-		struct timespec end;
-		clock_gettime(CLOCK_MONOTONIC, &end);
-    
+  double seconds(long long time) { return 1.0e-9 * time; }
+
+  public:
+  /**
+   * Constructor
+   *
+   * resets the Stopwatch
+   */
+  Stopwatch() : m_time(0) {}
+
+  /**
+   * Destructor
+   */
+  ~Stopwatch() {}
+
+  /**
+   * Reset the stopwatch to zero
+   */
+  void reset() { m_time = 0; }
+
+  /**
+   * starts the time measuring
+   */
+  void start() { clock_gettime(CLOCK_MONOTONIC, &m_start); }
+
+  /**
+   * get time measuring
+   *
+   * @return measured time (until now) in seconds
+   */
+  double split() {
+    struct timespec end;
+    clock_gettime(CLOCK_MONOTONIC, &end);
+
     return seconds(difftime(end));
-	}
-
-	/**
-	 * pauses the measuring
-	 *
-	 * @return measured time (until now) in seconds
-	 */
-	double pause()
-	{
-		struct timespec end;
-		clock_gettime(CLOCK_MONOTONIC, &end);
-
-		m_time += difftime(end);
-		return seconds(m_time);
-	}
-
-	/**
-	 * stops time measuring
-	 *
-	 * @return measured time in seconds
-	 */
-	double stop()
-	{
-		double time = pause();
-		reset();
-		return time;
-	}
+  }
+
+  /**
+   * pauses the measuring
+   *
+   * @return measured time (until now) in seconds
+   */
+  double pause() {
+    struct timespec end;
+    clock_gettime(CLOCK_MONOTONIC, &end);
+
+    m_time += difftime(end);
+    return seconds(m_time);
+  }
+
+  /**
+   * stops time measuring
+   *
+   * @return measured time in seconds
+   */
+  double stop() {
+    double time = pause();
+    reset();
+    return time;
+  }
 };
 
 #endif // STOPWATCH_H
-
diff --git a/examples/common/Util.h b/examples/common/Util.h
index 6a83a92..08242b6 100644
--- a/examples/common/Util.h
+++ b/examples/common/Util.h
@@ -7,12 +7,11 @@ typedef double real;
 #elif REAL_SIZE == 4
 typedef float real;
 #else
-#  error REAL_SIZE not supported.
+#error REAL_SIZE not supported.
 #endif
 
 void fillWithStuff(real* A, unsigned reals) {
   for (unsigned j = 0; j < reals; ++j) {
-      A[j] = drand48();
+    A[j] = drand48();
   }
 }
-
diff --git a/examples/optimal_ind.py b/examples/optimal_ind.py
index be5941f..3a2f8e1 100755
--- a/examples/optimal_ind.py
+++ b/examples/optimal_ind.py
@@ -29,6 +29,3 @@ def add(g):
                         tmp2[i2]  <= tmp1[i1]  * C['dfjk'],
                         S['abij'] <= tmp2[i2]  * A['acik'] ]
             g.add('kernel_{}_{}'.format(i1,i2), kernel)
-
-
-
diff --git a/examples/seissol_eqspp.py b/examples/seissol_eqspp.py
index 7e3e15f..6f439b3 100755
--- a/examples/seissol_eqspp.py
+++ b/examples/seissol_eqspp.py
@@ -10,7 +10,7 @@ def printEqspp():
 
 def add(g):
   db = parseXMLMatrixFile('seissol_matrices.xml')
-  
+
   Q = Tensor('Q', (8, 20, 15))
   I = Tensor('I', (8, 20, 15))
   g.add('seissol_stiffness', Q['skp'] <= db.kXiTDivM['lk'] * I['slq'] * db.star['qp'])
diff --git a/examples/site_scons/arch.py b/examples/site_scons/arch.py
index a5e9d4d..072a648 100644
--- a/examples/site_scons/arch.py
+++ b/examples/site_scons/arch.py
@@ -8,21 +8,21 @@
 # @section LICENSE
 # Copyright (c) 2016, SeisSol Group
 # All rights reserved.
-# 
+#
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
-# 
+#
 # 1. Redistributions of source code must retain the above copyright notice,
 #    this list of conditions and the following disclaimer.
-# 
+#
 # 2. Redistributions in binary form must reproduce the above copyright notice,
 #    this list of conditions and the following disclaimer in the documentation
 #    and/or other materials provided with the distribution.
-# 
+#
 # 3. Neither the name of the copyright holder nor the names of its
 #    contributors may be used to endorse or promote products derived from this
 #    software without specific prior written permission.
-# 
+#
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -57,7 +57,7 @@ def getRealSize(architecture):
 
 def getCpu(architecture):
   return architecture[1:]
-  
+
 def getAlignment(architecture):
   alignments = {
       'noarch': 16,
@@ -69,13 +69,13 @@ def getAlignment(architecture):
       'skx': 64
   }
   return alignments[ getCpu(architecture) ]
-  
+
 def getFlags(architecture, compiler = 'intel'):
   if architecture not in getArchitectures():
     raise ValueError('Unknown architecture.')
-  
+
   cpu = getCpu(architecture)
-  
+
   if cpu == 'wsm':
     flags = ['-msse3']
   elif cpu == 'snb':
@@ -93,13 +93,13 @@ def getFlags(architecture, compiler = 'intel'):
     flags = ['-xCORE-AVX512', '-fma']
   else:
     flags = []
-  
+
   # enable interproc. opts for small cores
   if cpu in ['knc', 'knl', 'skx']:
     flags.append('-ip')
 
   flags.append('-fopenmp')
-              
+
   return flags
 
 def getDefines(architecture):
diff --git a/examples/site_scons/site_tools/cxxtest.py b/examples/site_scons/site_tools/cxxtest.py
index 65f2891..0fe97d0 100644
--- a/examples/site_scons/site_tools/cxxtest.py
+++ b/examples/site_scons/site_tools/cxxtest.py
@@ -15,7 +15,7 @@
 # Maintainer: Gašper Ažman <gasper.azman@gmail.com>
 #
 # This file is maintained as a part of the CxxTest test suite.
-# 
+#
 # == About ==
 #
 # This builder correctly tracks dependencies and supports just about every
@@ -171,7 +171,7 @@ def defaultCxxTestGenLocation(env):
 
 def findCxxTestGen(env):
     """locate the cxxtestgen script by checking environment, path and project"""
- 
+
     # check the SCons environment...
     # Then, check the OS environment...
     cxxtest = envget(env, 'CXXTEST', None)
@@ -201,7 +201,7 @@ def findCxxTestGen(env):
         # make sure it was correct
         if isValidScriptPath(cxxtest):
            return os.path.realpath(cxxtest)
- 
+
     # No valid environment variable found, so...
     # Next, check the path...
     # Next, check the project
@@ -209,7 +209,7 @@ def findCxxTestGen(env):
             envget(env, 'CXXTEST_INSTALL_DIR'),
             envget(env, 'CXXTEST_CXXTESTGEN_DEFAULT_LOCATION'))
 
-    cxxtest = (env.WhereIs(envget(env, 'CXXTEST_CXXTESTGEN_SCRIPT_NAME')) or 
+    cxxtest = (env.WhereIs(envget(env, 'CXXTEST_CXXTESTGEN_SCRIPT_NAME')) or
                env.WhereIs(envget(env, 'CXXTEST_CXXTESTGEN_SCRIPT_NAME'),
                    path=[Dir(check_path).abspath]))
 
@@ -286,7 +286,7 @@ def generate(env, **kwargs):
     #
     # Expected behaviour: keyword arguments override environment variables;
     # environment variables override default settings.
-    # 
+    #
     env.SetDefault( CXXTEST_RUNNER  = 'ErrorPrinter'        )
     env.SetDefault( CXXTEST_OPTS    = ''                    )
     env.SetDefault( CXXTEST_SUFFIX  = '.t.h'                )
@@ -316,7 +316,7 @@ def generate(env, **kwargs):
 
     # find and add the CxxTest headers to the path.
     env.AppendUnique( CXXTEST_CPPPATH = findCxxTestHeaders(env) )
- 
+
     cxxtest = env['CXXTEST']
     if cxxtest:
         #
@@ -397,4 +397,3 @@ def CxxTest(env, target, source = None, **kwargs):
 
 def exists(env):
     return os.path.exists(env['CXXTEST'])
-
diff --git a/examples/springer.py b/examples/springer.py
index 5a152f8..c88597b 100755
--- a/examples/springer.py
+++ b/examples/springer.py
@@ -14,7 +14,7 @@ def cold():
 def add_tensor(name, ind, size):
   shape = tuple(size[k] for k in ind)
   return Tensor(name + str(_bench_no), shape)
-  
+
 def add_bench(g, descr, sizes):
   global _bench_no
 
diff --git a/examples/stock.py b/examples/stock.py
index c50e644..e530d05 100755
--- a/examples/stock.py
+++ b/examples/stock.py
@@ -41,4 +41,3 @@ def add(g):
 
       stock = R['ijk'] <= S['xyz'] * XLTP['lx'] * XRTP['il'] * YL['ym'] * YR['mj'] * ZL['zn'] * ZR['nk']
       g.add('stock{}_trans_pad'.format(pqx), stock)
-
diff --git a/include/yateto.h b/include/yateto.h
index 46c68db..bee5074 100644
--- a/include/yateto.h
+++ b/include/yateto.h
@@ -1,9 +1,9 @@
 #ifndef YATETO_H_
 #define YATETO_H_
 
-#include "yateto/TensorView.h"
 #include "yateto/InitTools.h"
 #include "yateto/LinearAllocator.h"
 #include "yateto/Misc.h"
+#include "yateto/TensorView.h"
 
 #endif
diff --git a/include/yateto/CopyPolicy.h b/include/yateto/CopyPolicy.h
index 1af26db..fae24e6 100644
--- a/include/yateto/CopyPolicy.h
+++ b/include/yateto/CopyPolicy.h
@@ -4,14 +4,14 @@
 #include <algorithm>
 
 namespace yateto {
-  template <typename float_t>
-  class SimpleCopyPolicy {
+template <typename float_t>
+class SimpleCopyPolicy {
   public:
-    float_t* copy(float_t const* first, float_t const* last, float_t*& mem) {
-      mem = std::copy(first, last, mem);
-      return mem;
-    }
-  };
-}
+  float_t* copy(const float_t* first, const float_t* last, float_t*& mem) {
+    mem = std::copy(first, last, mem);
+    return mem;
+  }
+};
+} // namespace yateto
 
-#endif  // YATETO_COPY_POLICY_H_
+#endif // YATETO_COPY_POLICY_H_
diff --git a/include/yateto/InitTools.h b/include/yateto/InitTools.h
index d582b01..966f1bb 100644
--- a/include/yateto/InitTools.h
+++ b/include/yateto/InitTools.h
@@ -2,137 +2,144 @@
 #define YATETO_INITTOOLS_H_
 
 #include "CopyPolicy.h"
+
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
 
 namespace yateto {
-    /** Computes a number of tensors inside of a tensor family.
-     *
-     * @return a number of tensors.
-     * */
-    template<class T>
-    constexpr size_t numFamilyMembers() {
-        return sizeof(T::Size) / sizeof(T::Size[0]);
-    }
-
-
-    /** Computes the next closest aligned memory address for a provided relative address.
-     *
-     * @param size a pointer address as integer.
-     * @param alignment a size of a vector register.
-     * @return the next closest aligned relative address.
-     * */
-    template<typename int_t>
-    constexpr size_t alignedUpper(int_t size, size_t alignment) {
-        return size + (alignment - size % alignment) % alignment;
-    }
+/** Computes the number of tensors in a tensor family T.
+ *
+ * @return the element count of T::Size (sizeof the array over sizeof one entry).
+ * */
+template <class T>
+constexpr size_t numFamilyMembers() {
+  return sizeof(T::Size) / sizeof(T::Size[0]);
+}
 
+/** Rounds a relative address up to the next multiple of the alignment; an
+ * address that is already a multiple is returned unchanged.
+ *
+ * @param size a pointer address (or an element count) as integer.
+ * @param alignment the alignment to round up to; must be non-zero.
+ * @return the smallest multiple of alignment that is >= size.
+ * */
+template <typename int_t>
+constexpr size_t alignedUpper(int_t size, size_t alignment) {
+  return size + (alignment - size % alignment) % alignment;
+}
 
-    /** Computes a number of real number which fits into a vector register.
-     *
-     *  NOTE: a size of real number depends of floating number representation i.e. double or float.
-     *
-     *  @param alignment a size of a vector register in bytes
-     *  @return number of real numbers inside of a vector register
-     * */
-    template<typename float_t>
-    constexpr size_t alignedReals(size_t alignment) {
-        return alignment / sizeof(float_t);
-    }
+/** Computes the number of real numbers that fit into a vector register.
+ *
+ *  NOTE: the size of a real number depends on the floating-point
+ * representation, i.e. double or float (sizeof(float_t)).
+ *
+ *  @param alignment the size of a vector register in bytes
+ *  @return number of real numbers inside of a vector register
+ * */
+template <typename float_t>
+constexpr size_t alignedReals(size_t alignment) {
+  return alignment / sizeof(float_t);
+}
 
+/** Computes the size occupied by a tensor family, in real numbers, with every
+ * tensor size rounded up to a multiple of alignedReals.
+ *
+ * NOTE: recursive function; sums the first n entries of T::Size.
+ *
+ * @param alignedReals number of real numbers inside of a vector register.
+ * @param n number of leading tensors of the family to include (default: all).
+ * @return the padded size of the first n tensors of the family.
+ * */
+template <class T>
+constexpr size_t computeFamilySize(size_t alignedReals = 1, size_t n = numFamilyMembers<T>()) {
+  return n == 0 ? 0
+                : alignedUpper(T::Size[n - 1], alignedReals) +
+                      computeFamilySize<T>(alignedReals, n - 1);
+}
 
-    /** Computes a size occupied by a tensor family including data alignment b/w tensors in terms of real numbers.
-     *
-     * NOTE: recursive function.
-     *
-     * @param alignedReals number of real numbers inside of a vector register.
-     * @param n a tensor index inside of a tensor family.
-     * @return a size of a tensor family
-     * */
-    template<class T>
-    constexpr size_t computeFamilySize(size_t alignedReals = 1, size_t n = numFamilyMembers<T>()) {
-        return n == 0 ? 0 : alignedUpper(T::Size[n-1], alignedReals) + computeFamilySize<T>(alignedReals, n-1);
+template <typename float_t, typename CopyPolicyT>
+class CopyManager {
+  public:
+  /** Copies data from a tensor to a given memory chunk.
+   *
+   *  NOTE: The function shifts and aligns a pointer w.r.t. a given vector
+   * register size.
+   *
+   *  @param mem an address to a chunk of memory.
+   *         NOTE: the address is going to be incremented every time
+   *         when new information is written.
+   *  @param ptr set to the value of mem on entry, i.e. the address at which
+   *         the tensor data is stored.
+   *  @param alignment a size of a vector register (in bytes).
+   * */
+  template <class T>
+  void copyTensorToMemAndSetPtr(float_t*& mem, float_t*& ptr, size_t alignment = 1) {
+    ptr = mem;
+    copyValuesToMem(mem, T::Values, T::Values + T::Size, alignment);
+  }
+
+  /** Copies data from tensors from a tensor family to a given memory chunk.
+   *
+   * NOTE: The function writes the actual address (where the aligned tensor
+   * data is stored) back to the tensor family container.
+   *
+   *  @param container a reference to a container which contains tensor family
+   * data.
+   *  @param mem an address to an allocated chunk of memory.
+   *         NOTE: the address is going to be incremented every time
+   *         when new information is written.
+   *  @param alignment a size of a vector register (in bytes).
+   * */
+  template <class T>
+  void copyFamilyToMemAndSetPtr(float_t*& mem,
+                                typename T::template Container<const float_t*>& container,
+                                size_t alignment = 1) {
+
+    // determine the size of the container, i.e. the number of tensors that it holds
+    size_t n = sizeof(T::Size) / sizeof(T::Size[0]);
+
+    for (size_t i = 0; i < n; ++i) {
+      // init pointer of each tensor to the allocated memory
+      container.data[i] = mem;
+
+      // copy values and shift pointer
+      copyValuesToMem(mem, T::Values[i], T::Values[i] + T::Size[i], alignment);
     }
-
-
-    template<typename float_t, typename CopyPolicyT>
-    class CopyManager {
-    public:
-
-        /** Copies data from a tensor to a given memory chunk.
-         *
-         *  NOTE: The function shifts and aligns a pointer w.r.t. to a given vector register size.
-         *
-         *  @param mem an address to a chunk of memory.
-         *         NOTE: the address is going to be incremented every time
-         *         when new information is written.
-         *  @param alignment a size of a vector register (in bytes).
-         *  @param ptr.
-         *  @param alignment.
-         * */
-        template<class T>
-        void copyTensorToMemAndSetPtr(float_t*& mem, float_t*& ptr, size_t alignment = 1) {
-            ptr = mem;
-            copyValuesToMem(mem, T::Values, T::Values + T::Size, alignment);
-        }
-
-
-        /** Copies data from tensors from a tensor family to a given memory chunk.
-         *
-         * NOTE: The function writes the actual address (where aligned tensor data stored)
-         * back to a tensor family
-         *
-         *  @param container a reference to a container which contains tensor family data.
-         *  @param mem an address to an allocated chunk of memory.
-         *         NOTE: the address is going to be incremented every time
-         *         when new information is written.
-         *  @param alignment a size of a vector register (in bytes).
-         * */
-        template<class T>
-        void copyFamilyToMemAndSetPtr(float_t*& mem,
-                                      typename T::template Container<float_t const*>& container,
-                                      size_t alignment = 1) {
-
-            // determine a size of the container i.e a number of tensor that it holds
-            size_t n = sizeof(T::Size) / sizeof(T::Size[0]);
-
-            for (size_t i = 0; i < n; ++i) {
-                // init pointer of each tensor to the allocated memeory
-                container.data[i] = mem;
-
-                // copy values and shift pointer
-                copyValuesToMem(mem, T::Values[i], T::Values[i] + T::Size[i], alignment);
-            }
-        }
-
-    protected:
-        /** Copies a tensor to a given memory chunk, and shifts a given pointer.
-         *
-         *  NOTE: The function shifts and aligns a pointer w.r.t. to a given vector register size.
-         *
-         *  @param mem an address to a chunk of memory.
-         *         NOTE: the address is going to be incremented every time
-         *         when new information is written.
-         *  @param first a pointer to the beginning of tensor data.
-         *  @param last a pointer to the end of tensor data.
-         *  @param alignment a size of a vector register (in bytes).
-         * */
-        virtual void copyValuesToMem(float_t*& mem, float_t const* first, float_t const* last, size_t alignment) {
-
-            // copy data
-            mem = copier.copy(first, last, mem);
-
-            // shift pointer
-            mem += (alignedUpper(reinterpret_cast<uintptr_t>(mem), alignment) - reinterpret_cast<uintptr_t>(mem)) / sizeof(float_t);
-            assert(reinterpret_cast<uintptr_t>(mem) % alignment == 0);
-        }
-
-    private:
-      CopyPolicyT copier{};
-    };
-
-    template<class float_t> using DefaultCopyManager = CopyManager<float_t, SimpleCopyPolicy<float_t>>;
-}
+  }
+
+  protected:
+  /** Copies a tensor to a given memory chunk, and shifts a given pointer.
+   *
+   *  NOTE: The function shifts and aligns a pointer w.r.t. a given vector
+   * register size.
+   *
+   *  @param mem an address to a chunk of memory.
+   *         NOTE: the address is going to be incremented every time
+   *         when new information is written.
+   *  @param first a pointer to the beginning of tensor data.
+   *  @param last a pointer to the end (one past the last value) of tensor data.
+   *  @param alignment a size of a vector register (in bytes).
+   * */
+  virtual void
+      copyValuesToMem(float_t*& mem, const float_t* first, const float_t* last, size_t alignment) {
+
+    // copy data; mem becomes the position returned by the copy policy
+    mem = copier.copy(first, last, mem);
+
+    // shift pointer forward to the next address that is a multiple of alignment
+    mem += (alignedUpper(reinterpret_cast<uintptr_t>(mem), alignment) -
+            reinterpret_cast<uintptr_t>(mem)) /
+           sizeof(float_t);
+    assert(reinterpret_cast<uintptr_t>(mem) % alignment == 0);
+  }
+
+  private:
+  CopyPolicyT copier{};
+};
+
+template <class float_t>
+using DefaultCopyManager = CopyManager<float_t, SimpleCopyPolicy<float_t>>;
+} // namespace yateto
 
 #endif
diff --git a/include/yateto/LinearAllocator.h b/include/yateto/LinearAllocator.h
index d3621ab..a0c41fd 100644
--- a/include/yateto/LinearAllocator.h
+++ b/include/yateto/LinearAllocator.h
@@ -5,31 +5,31 @@
 #include <cstddef>
 
 namespace yateto {
-template<typename T>
+template <typename T>
 struct LinearAllocatorT {
-public:
+  public:
   void initialize(T* ptr) {
-      isInit = true;
-      userSpaceMem = ptr;
+    isInit = true;
+    userSpaceMem = ptr;
   }
 
   T* allocate(size_t size) {
-      assert(isInit && "YATETO: Temporary-Memory manager hasn't been initialized");
-      int currentByteCount = byteCount;
-      byteCount += size;
-      return &userSpaceMem[currentByteCount];
+    assert(isInit && "YATETO: Temporary-Memory manager hasn't been initialized");
+    const size_t currentByteCount = byteCount; // size_t: an int would truncate large offsets
+    byteCount += size;
+    return &userSpaceMem[currentByteCount];
   }
 
   void free() {
-      isInit = false;
-      byteCount = 0;
-      userSpaceMem = nullptr;
+    isInit = false;
+    byteCount = 0;
+    userSpaceMem = nullptr;
   }
 
-private:
+  private:
   size_t byteCount{0};
   bool isInit{false};
-  T *userSpaceMem{nullptr};
+  T* userSpaceMem{nullptr};
 };
-} // yateto
-#endif // YATETO_LINEAR_ALLOCATED_H_
\ No newline at end of file
+} // namespace yateto
+#endif // YATETO_LINEAR_ALLOCATED_H_
diff --git a/include/yateto/Misc.h b/include/yateto/Misc.h
index 8362b2b..3913c82 100644
--- a/include/yateto/Misc.h
+++ b/include/yateto/Misc.h
@@ -5,14 +5,13 @@
 
 namespace yateto {
 
-template<typename KernelType>
+template <typename KernelType>
 auto getMaxTmpMemRequired(KernelType& krnl) {
   return KernelType::TmpMaxMemRequiredInBytes;
 }
 
-template<typename KernelType, typename... OtherKernelTypes>
-auto getMaxTmpMemRequired(KernelType& krnl,
-                          OtherKernelTypes&... otherKrnls) {
+template <typename KernelType, typename... OtherKernelTypes>
+auto getMaxTmpMemRequired(KernelType& krnl, OtherKernelTypes&... otherKrnls) {
   auto currentTmpMem = KernelType::TmpMaxMemRequiredInBytes;
   auto otherTmpMem = getMaxTmpMemRequired(otherKrnls...);
   return (currentTmpMem > otherTmpMem) ? currentTmpMem : otherTmpMem;
@@ -28,6 +27,6 @@ constexpr size_t leadDim() noexcept {
   return dimSize<Tensor, 0>();
 }
 
-} // yateto
+} // namespace yateto
 
 #endif // YATETO_MISC_H_
diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h
index 73ddc86..573172e 100644
--- a/include/yateto/TensorView.h
+++ b/include/yateto/TensorView.h
@@ -1,432 +1,432 @@
 #ifndef YATETO_MATRIXVIEW_H_
 #define YATETO_MATRIXVIEW_H_
 
+#include <algorithm>
 #include <cassert>
 #include <cstring>
-#include <algorithm>
 #include <iterator>
 #include <limits>
 #include <type_traits>
 
 namespace yateto {
-  template<typename uint_t=unsigned>
-  class slice {
+template <typename uint_t = unsigned>
+class slice {
   public:
-    explicit slice(uint_t start = 0, uint_t stop = std::numeric_limits<uint_t>::max())
-      : start(start), stop(stop)
-        {}
-
-    uint_t start;
-    uint_t stop;
-  };
-
-  template<typename uint_t, typename... Entry>
-  struct count_slices : std::integral_constant<uint_t,0> {};
-  template<typename uint_t, typename Head, typename... Tail>
-  struct count_slices<uint_t, Head, Tail...> : std::integral_constant<uint_t, ((std::is_same_v<Head, slice<uint_t>>) ? 1 : 0) + count_slices<uint_t, Tail...>::value> {};
-
-  template<unsigned Dim, typename real_t, typename uint_t>
-  class TensorView {
+  explicit slice(uint_t start = 0, uint_t stop = std::numeric_limits<uint_t>::max())
+      : start(start), stop(stop) {}
+
+  uint_t start;
+  uint_t stop;
+};
+
+template <typename uint_t, typename... Entry>
+struct count_slices : std::integral_constant<uint_t, 0> {};
+template <typename uint_t, typename Head, typename... Tail>
+struct count_slices<uint_t, Head, Tail...>
+    : std::integral_constant<uint_t,
+                             ((std::is_same_v<Head, slice<uint_t>>) ? 1 : 0) +
+                                 count_slices<uint_t, Tail...>::value> {};
+
+template <unsigned Dim, typename real_t, typename uint_t>
+class TensorView {
   public:
-    explicit TensorView(std::initializer_list<uint_t> shape) {
-      std::copy(shape.begin(), shape.end(), m_shape);
-    }
+  explicit TensorView(std::initializer_list<uint_t> shape) {
+    std::copy(shape.begin(), shape.end(), m_shape);
+  }
 
-    explicit TensorView(uint_t const shape[]) {
-      for (uint_t d = 0; d < Dim; ++d) {
-        m_shape[d] = shape[d];
-      }
-    }
-    
-    static constexpr uint_t dim() {
-      return Dim;
+  explicit TensorView(const uint_t shape[]) {
+    for (uint_t d = 0; d < Dim; ++d) {
+      m_shape[d] = shape[d];
     }
+  }
 
-    uint_t shape(uint_t dim) const {
-      return m_shape[dim];
-    }
+  static constexpr uint_t dim() { return Dim; }
+
+  uint_t shape(uint_t dim) const { return m_shape[dim]; }
 
   protected:
-    uint_t m_shape[Dim];
-  };
+  uint_t m_shape[Dim];
+};
 
-  template<typename real_t, typename uint_t>
-  class TensorView<0, real_t, uint_t> {
+template <typename real_t, typename uint_t>
+class TensorView<0, real_t, uint_t> {
   public:
-    explicit TensorView(std::initializer_list<uint_t> shape) {} 
+  explicit TensorView(std::initializer_list<uint_t> shape) {}
 
-    explicit TensorView(uint_t const shape[]) {}
-    
-    static constexpr uint_t dim() {
-      return 0;
-    }
+  explicit TensorView(const uint_t shape[]) {}
 
-    uint_t shape(uint_t dim) const {
-      return 0;
-    }
-  };
+  static constexpr uint_t dim() { return 0; }
+
+  uint_t shape(uint_t dim) const { return 0; }
+};
 
-  template<unsigned Dim, typename real_t, typename uint_t=unsigned, bool Const = false>
-  class DenseTensorView : public TensorView<Dim, real_t, uint_t> {
+template <unsigned Dim, typename real_t, typename uint_t = unsigned, bool Const = false>
+class DenseTensorView : public TensorView<Dim, real_t, uint_t> {
   public:
-    using data_t = std::conditional_t<Const, const real_t*, real_t*>;
-    using dataref_t = std::conditional_t<Const, const real_t&, real_t&>;
+  using data_t = std::conditional_t<Const, const real_t*, real_t*>;
+  using dataref_t = std::conditional_t<Const, const real_t&, real_t&>;
 
-    explicit DenseTensorView(data_t values, std::initializer_list<uint_t> shape, std::initializer_list<uint_t> start, std::initializer_list<uint_t> stop)
+  explicit DenseTensorView(data_t values,
+                           std::initializer_list<uint_t> shape,
+                           std::initializer_list<uint_t> start,
+                           std::initializer_list<uint_t> stop)
       : TensorView<Dim, real_t, uint_t>(shape), m_values(values) {
-      std::copy(start.begin(), start.end(), m_start);
-      std::copy(stop.begin(), stop.end(), m_stop);
-      computeStride();
-    }
+    std::copy(start.begin(), start.end(), m_start);
+    std::copy(stop.begin(), stop.end(), m_stop);
+    computeStride();
+  }
 
-    explicit DenseTensorView(data_t values, std::initializer_list<uint_t> shape)
+  explicit DenseTensorView(data_t values, std::initializer_list<uint_t> shape)
       : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_start{} {
-      std::copy(shape.begin(), shape.end(), m_stop);
-      computeStride();
-    }
-
-    explicit DenseTensorView(data_t values, uint_t const shape[], uint_t const start[], uint_t const stop[])
+    std::copy(shape.begin(), shape.end(), m_stop);
+    computeStride();
+  }
+
+  explicit DenseTensorView(data_t values,
+                           const uint_t shape[],
+                           const uint_t start[],
+                           const uint_t stop[])
       : TensorView<Dim, real_t, uint_t>(shape), m_values(values) {
-      for (uint_t d = 0; d < Dim; ++d) {
-        m_start[d] = start[d];
-        m_stop[d] = stop[d];
-      }
-      computeStride();
+    for (uint_t d = 0; d < Dim; ++d) {
+      m_start[d] = start[d];
+      m_stop[d] = stop[d];
     }
+    computeStride();
+  }
 
-    explicit DenseTensorView(data_t values, uint_t const shape[])
-      : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_start{} {
-      for (uint_t d = 0; d < Dim; ++d) {
-        m_stop[d] = shape[d];
-      }
-      computeStride();
-    }
- 
-    explicit DenseTensorView(data_t values, uint_t const shape[], uint_t const stride[])
+  explicit DenseTensorView(data_t values, const uint_t shape[])
       : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_start{} {
-      for (uint_t d = 0; d < Dim; ++d) {
-        m_stop[d] = shape[d];
-        m_stride[d] = stride[d];
-      }
-    }
-
-    uint_t size() const {
-      return (m_stop[Dim-1]-m_start[Dim-1]) * m_stride[Dim-1];
-    }
-    
-    void setZero() {
-      uint_t entry[Dim];
-      std::copy(m_start, m_start + Dim, entry);
-      while (entry[Dim-1] != m_stop[Dim-1]) {
-        auto values = &operator[](entry);
-        for (uint_t i = 0.0; i < m_stop[0]-m_start[0]; ++i) {
-          values[i*m_stride[0]] = 0.0;
-        }
-        if (Dim == 1) {
-          break;
-        }
-
-        uint_t d = 0;
-        do {
-          entry[d] = m_start[d];
-          d++;
-          ++entry[d];
-        } while (entry[d] == m_stop[d] && d < Dim-1);
-      }
-    }
-
-    template<typename Head>
-    bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head) const {
-      return static_cast<uint_t>(head) >= start[dim] && static_cast<uint_t>(head) < stop[dim];
-    }
-
-    template<typename Head, typename... Tail>
-    bool isInRange(const uint_t  start[Dim], const uint_t stop[Dim], int dim, Head head, Tail... tail) const {
-      return static_cast<uint_t>(head) >= start[dim]
-             && static_cast<uint_t>(head) < stop[dim]
-             && isInRange(start, stop, dim+1, tail...);
-    }
-
-    template<typename... Entry>
-    bool isInRange(Entry... entry) const {
-      static_assert(sizeof...(entry) == Dim,
-                  "Number of arguments to isInRange(...) does not match the tensor dimension.");
-      return isInRange(m_start, m_stop, 0, entry...);
+    for (uint_t d = 0; d < Dim; ++d) {
+      m_stop[d] = shape[d];
     }
+    computeStride();
+  }
 
-    template<typename... Entry>
-    dataref_t operator()(Entry... entry) {
-      static_assert(sizeof...(entry) == Dim,
-                        "Number of arguments to operator() does not match the tensor dimension.");
-      assert(isInRange(entry...));
-      return m_values[address(entry...)];
+  explicit DenseTensorView(data_t values, const uint_t shape[], const uint_t stride[])
+      : TensorView<Dim, real_t, uint_t>(shape), m_values(values), m_start{} {
+    for (uint_t d = 0; d < Dim; ++d) {
+      m_stop[d] = shape[d];
+      m_stride[d] = stride[d];
     }
+  }
 
-    template<typename... Entry>
-    const real_t& operator()(Entry... entry) const {
-      static_assert(sizeof...(entry) == Dim,
-                        "Number of arguments to operator() const does not match the tensor dimension.");
-      assert(isInRange(entry...));
-      return m_values[address(entry...)];
-    }
+  uint_t size() const { return (m_stop[Dim - 1] - m_start[Dim - 1]) * m_stride[Dim - 1]; }
 
-    const real_t& operator[](uint_t const entry[Dim]) const {
-      uint_t addr = 0;
-      for (uint_t d = 0; d < Dim; ++d) {
-        assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]);
-        addr += (entry[d] - m_start[d]) * m_stride[d];
+  void setZero() {
+    uint_t entry[Dim];
+    std::copy(m_start, m_start + Dim, entry);
+    while (entry[Dim - 1] != m_stop[Dim - 1]) {
+      auto values = &operator[](entry); // entry[0] == m_start[0] here: start of a dim-0 run
+      for (uint_t i = 0; i < m_stop[0] - m_start[0]; ++i) {
+        values[i * m_stride[0]] = 0.0;
       }
-      return m_values[addr];
-    }
-
-    dataref_t operator[](uint_t const entry[Dim]) {
-      uint_t addr = 0;
-      for (uint_t d = 0; d < Dim; ++d) {
-        assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]);
-        addr += (entry[d] - m_start[d]) * m_stride[d];
+      if (Dim == 1) {
+        break;
       }
-      return m_values[addr];
-    }
 
-    template<class view_t>
-    void copyToView(view_t& other) const {
-      assert(Dim == other.dim());
-      
-      uint_t entry[Dim];
-      for (uint_t d = 0; d < Dim; ++d) {
-        assert(this->shape(d) == other.shape(d));
+      uint_t d = 0;
+      do {
         entry[d] = m_start[d];
+        d++;
+        ++entry[d];
+      } while (entry[d] == m_stop[d] && d < Dim - 1);
+    }
+  }
+
+  template <typename Head>
+  bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head) const {
+    return static_cast<uint_t>(head) >= start[dim] && static_cast<uint_t>(head) < stop[dim];
+  }
+
+  template <typename Head, typename... Tail>
+  bool isInRange(
+      const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head, Tail... tail) const {
+    return static_cast<uint_t>(head) >= start[dim] && static_cast<uint_t>(head) < stop[dim] &&
+           isInRange(start, stop, dim + 1, tail...);
+  }
+
+  template <typename... Entry>
+  bool isInRange(Entry... entry) const {
+    static_assert(sizeof...(entry) == Dim,
+                  "Number of arguments to isInRange(...) does not match the "
+                  "tensor dimension.");
+    return isInRange(m_start, m_stop, 0, entry...);
+  }
+
+  template <typename... Entry>
+  dataref_t operator()(Entry... entry) {
+    static_assert(sizeof...(entry) == Dim,
+                  "Number of arguments to operator() does not match the tensor "
+                  "dimension.");
+    assert(isInRange(entry...));
+    return m_values[address(entry...)];
+  }
+
+  template <typename... Entry>
+  const real_t& operator()(Entry... entry) const {
+    static_assert(sizeof...(entry) == Dim,
+                  "Number of arguments to operator() const does not match the "
+                  "tensor dimension.");
+    assert(isInRange(entry...));
+    return m_values[address(entry...)];
+  }
+
+  const real_t& operator[](const uint_t entry[Dim]) const {
+    uint_t addr = 0;
+    for (uint_t d = 0; d < Dim; ++d) {
+      assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]);
+      addr += (entry[d] - m_start[d]) * m_stride[d];
+    }
+    return m_values[addr];
+  }
+
+  dataref_t operator[](const uint_t entry[Dim]) {
+    uint_t addr = 0;
+    for (uint_t d = 0; d < Dim; ++d) {
+      assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]);
+      addr += (entry[d] - m_start[d]) * m_stride[d];
+    }
+    return m_values[addr];
+  }
+
+  template <class view_t>
+  void copyToView(view_t& other) const {
+    assert(Dim == other.dim());
+
+    uint_t entry[Dim];
+    for (uint_t d = 0; d < Dim; ++d) {
+      assert(this->shape(d) == other.shape(d));
+      entry[d] = m_start[d];
+    }
+
+    uint_t stop0 = std::min(m_stop[0], this->shape(0));
+    data_t val = m_values;
+    while (entry[Dim - 1] != m_stop[Dim - 1]) {
+      for (uint_t i = m_start[0]; i < stop0; ++i) {
+        entry[0] = i;
+        other[entry] = *(val++);
       }
-      
-      uint_t stop0 = std::min(m_stop[0], this->shape(0));
-      data_t val = m_values;
-      while (entry[Dim-1] != m_stop[Dim-1]) {
-        for (uint_t i = m_start[0]; i < stop0; ++i) {
-          entry[0] = i;
-          other[entry] = *(val++);
-        }
-        val += (m_stop[0]-stop0);
-
-        if (Dim == 1) {
-          break;
-        }
-
-        uint_t d = 0;
-        do {
-          entry[d] = m_start[d];
-          d++;
-          ++entry[d];
-        } while (entry[d] == m_stop[d] && d < Dim-1);
-      }
-    }
-
-    template<typename... Entry>
-    auto subtensor(Entry... entry) -> DenseTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t, Const> {
-      static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension.");
-      constexpr auto nSlices = count_slices<uint_t, Entry...>::value;
-      uint_t begin[Dim];
-      uint_t size[nSlices];
-      uint_t stride[nSlices];
-      extractSubtensor(begin, size, stride, entry...);
-      DenseTensorView<nSlices, real_t, uint_t, Const> subtensor(&operator[](begin), size, stride);
-      return subtensor;
-    }
+      val += (m_stop[0] - stop0);
 
-    template<typename... Entry>
-    auto subtensor(Entry... entry) const -> DenseTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t, true> {
-      static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension.");
-      constexpr auto nSlices = count_slices<uint_t, Entry...>::value;
-      uint_t begin[Dim];
-      uint_t size[nSlices];
-      uint_t stride[nSlices];
-      extractSubtensor(begin, size, stride, entry...);
-      DenseTensorView<nSlices, real_t, uint_t, true> subtensor(&operator[](begin), size, stride);
-      return subtensor;
-    }
-
-    data_t data() {
-      return m_values;
-    }
-
-    const real_t* data() const {
-      return m_values;
-    }
-
-  protected:
-    void computeStride() {
-      m_stride[0] = 1;
-      for (uint_t d = 0; d < Dim-1; ++d) {
-        m_stride[d+1] = m_stride[d] * (m_stop[d] - m_start[d]);
+      if (Dim == 1) {
+        break;
       }
-    }
-
-    template<typename Head>
-    uint_t address(Head head) const {
-      assert(static_cast<uint_t>(head) >= m_start[Dim-1] && static_cast<uint_t>(head) < m_stop[Dim-1]);
-      return (head - m_start[Dim-1]) * m_stride[Dim-1];
-    }
-
-    template<typename Head, typename... Tail>
-    uint_t address(Head head, Tail... tail) const {
-      uint_t const d = (Dim-1) - sizeof...(tail);
-      assert(static_cast<uint_t>(head) >= m_start[d] && static_cast<uint_t>(head) < m_stop[d]);
-      return (head - m_start[d]) * m_stride[d] + address(tail...);
-    }
-
-    template<typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
-    void extractDim(uint_t*& begin, uint_t*&, uint_t*&, uint_t dimNo, T entry) const {
-      assert(static_cast<uint_t>(entry) >= m_start[dimNo] && static_cast<uint_t>(entry) < m_stop[dimNo]);
-      *begin++ = entry;
-    }
-
-    template<typename T, std::enable_if_t<std::is_same_v<T, slice<uint_t>>, int> = 0>
-    void extractDim(uint_t*& begin, uint_t*& size, uint_t*& stride, uint_t dimNo, T dim) const {
-      *begin = std::max(m_start[dimNo], dim.start);
-      *size++ = std::min(m_stop[dimNo], dim.stop) - *begin;
-      ++begin;
-      *stride++ = m_stride[dimNo];
-    }
-
-    template<typename Head>
-    void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head) const {
-      extractDim<Head>(begin, size, stride, Dim-1, head);
-    }
-
-    template<typename Head, typename... Tail>
-    void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head, Tail... tail) const {
-      uint_t const d = (Dim-1) - sizeof...(tail);
-      extractDim<Head>(begin, size, stride, d, head);
-      extractSubtensor(begin, size, stride, tail...);
-    }
 
-    data_t m_values;
-    uint_t m_start[Dim];
-    uint_t m_stop[Dim];
-    uint_t m_stride[Dim];
-  };
+      uint_t d = 0;
+      do {
+        entry[d] = m_start[d];
+        d++;
+        ++entry[d];
+      } while (entry[d] == m_stop[d] && d < Dim - 1);
+    }
+  }
+
+  template <typename... Entry>
+  auto subtensor(Entry... entry)
+      -> DenseTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t, Const> {
+    static_assert(sizeof...(entry) == Dim,
+                  "Number of arguments to subtensor() does not match tensor dimension.");
+    constexpr auto nSlices = count_slices<uint_t, Entry...>::value;
+    uint_t begin[Dim];
+    uint_t size[nSlices];
+    uint_t stride[nSlices];
+    extractSubtensor(begin, size, stride, entry...);
+    DenseTensorView<nSlices, real_t, uint_t, Const> subtensor(&operator[](begin), size, stride);
+    return subtensor;
+  }
+
+  template <typename... Entry>
+  auto subtensor(Entry... entry) const
+      -> DenseTensorView<count_slices<uint_t, Entry...>::value, real_t, uint_t, true> {
+    static_assert(sizeof...(entry) == Dim,
+                  "Number of arguments to subtensor() does not match tensor dimension.");
+    constexpr auto nSlices = count_slices<uint_t, Entry...>::value;
+    uint_t begin[Dim];
+    uint_t size[nSlices];
+    uint_t stride[nSlices];
+    extractSubtensor(begin, size, stride, entry...);
+    DenseTensorView<nSlices, real_t, uint_t, true> subtensor(&operator[](begin), size, stride);
+    return subtensor;
+  }
+
+  data_t data() { return m_values; }
+
+  const real_t* data() const { return m_values; }
 
-  template<typename real_t, typename uint_t, bool Const>
-  class DenseTensorView<0,real_t,uint_t,Const> : public TensorView<0, real_t, uint_t> {
+  protected:
+  void computeStride() {
+    m_stride[0] = 1;
+    for (uint_t d = 0; d < Dim - 1; ++d) {
+      m_stride[d + 1] = m_stride[d] * (m_stop[d] - m_start[d]);
+    }
+  }
+
+  template <typename Head>
+  uint_t address(Head head) const {
+    assert(static_cast<uint_t>(head) >= m_start[Dim - 1] &&
+           static_cast<uint_t>(head) < m_stop[Dim - 1]);
+    return (head - m_start[Dim - 1]) * m_stride[Dim - 1];
+  }
+
+  template <typename Head, typename... Tail>
+  uint_t address(Head head, Tail... tail) const {
+    const uint_t d = (Dim - 1) - sizeof...(tail);
+    assert(static_cast<uint_t>(head) >= m_start[d] && static_cast<uint_t>(head) < m_stop[d]);
+    return (head - m_start[d]) * m_stride[d] + address(tail...);
+  }
+
+  template <typename T, std::enable_if_t<std::is_integral_v<T>, int> = 0>
+  void extractDim(uint_t*& begin, uint_t*&, uint_t*&, uint_t dimNo, T entry) const {
+    assert(static_cast<uint_t>(entry) >= m_start[dimNo] &&
+           static_cast<uint_t>(entry) < m_stop[dimNo]);
+    *begin++ = entry;
+  }
+
+  template <typename T, std::enable_if_t<std::is_same_v<T, slice<uint_t>>, int> = 0>
+  void extractDim(uint_t*& begin, uint_t*& size, uint_t*& stride, uint_t dimNo, T dim) const {
+    *begin = std::max(m_start[dimNo], dim.start);
+    *size++ = std::min(m_stop[dimNo], dim.stop) - *begin;
+    ++begin;
+    *stride++ = m_stride[dimNo];
+  }
+
+  template <typename Head>
+  void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head) const {
+    extractDim<Head>(begin, size, stride, Dim - 1, head);
+  }
+
+  template <typename Head, typename... Tail>
+  void
+      extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head, Tail... tail) const {
+    const uint_t d = (Dim - 1) - sizeof...(tail);
+    extractDim<Head>(begin, size, stride, d, head);
+    extractSubtensor(begin, size, stride, tail...);
+  }
+
+  data_t m_values;
+  uint_t m_start[Dim];
+  uint_t m_stop[Dim];
+  uint_t m_stride[Dim];
+};
+
+template <typename real_t, typename uint_t, bool Const>
+class DenseTensorView<0, real_t, uint_t, Const> : public TensorView<0, real_t, uint_t> {
   public:
-    using data_t = std::conditional_t<Const, const real_t*, real_t*>;
-    using dataref_t = std::conditional_t<Const, const real_t&, real_t&>;
+  using data_t = std::conditional_t<Const, const real_t*, real_t*>;
+  using dataref_t = std::conditional_t<Const, const real_t&, real_t&>;
 
-    explicit DenseTensorView(data_t values, std::initializer_list<uint_t> shape, std::initializer_list<uint_t> start, std::initializer_list<uint_t> stop)
-      : TensorView<0, real_t, uint_t>(shape), m_values(values) {
-    }
+  explicit DenseTensorView(data_t values,
+                           std::initializer_list<uint_t> shape,
+                           std::initializer_list<uint_t> start,
+                           std::initializer_list<uint_t> stop)
+      : TensorView<0, real_t, uint_t>(shape), m_values(values) {}
 
-    uint_t size() const {
-      return 1;
-    }
+  uint_t size() const { return 1; }
 
-    void setZero() {
-      m_values[0] = 0.0;
-    }
+  void setZero() { m_values[0] = 0.0; }
 
-    template<class view_t>
-    void copyToView(view_t& other) {
-      other.m_values[0] = m_values[0];
-    }
+  template <class view_t>
+  void copyToView(view_t& other) {
+    other.m_values[0] = m_values[0];
+  }
 
   protected:
-    data_t m_values;
-  };
+  data_t m_values;
+};
 
-  template<typename real_t, typename uint_t, bool Const = false>
-  class CSCMatrixView : public TensorView<2, real_t, uint_t> {
+template <typename real_t, typename uint_t, bool Const = false>
+class CSCMatrixView : public TensorView<2, real_t, uint_t> {
   public:
-    using data_t = std::conditional_t<Const, const real_t*, real_t*>;
-    using dataref_t = std::conditional_t<Const, const real_t&, real_t&>;
+  using data_t = std::conditional_t<Const, const real_t*, real_t*>;
+  using dataref_t = std::conditional_t<Const, const real_t&, real_t&>;
 
-    explicit CSCMatrixView(data_t values, std::initializer_list<uint_t> shape, uint_t const* rowInd, uint_t const* colPtr)
+  explicit CSCMatrixView(data_t values,
+                         std::initializer_list<uint_t> shape,
+                         const uint_t* rowInd,
+                         const uint_t* colPtr)
       : TensorView<2, real_t, uint_t>(shape), m_values(values), m_rowInd(rowInd), m_colPtr(colPtr) {
-    }
+  }
 
-    explicit CSCMatrixView(data_t values, uint_t const shape[], uint_t const* rowInd, uint_t const* colPtr)
+  explicit CSCMatrixView(data_t values,
+                         const uint_t shape[],
+                         const uint_t* rowInd,
+                         const uint_t* colPtr)
       : TensorView<2, real_t, uint_t>(shape), m_values(values), m_rowInd(rowInd), m_colPtr(colPtr) {
-    }
+  }
 
-    uint_t size() const {
-      return m_colPtr[ this->shape(1) ];
-    }
+  uint_t size() const { return m_colPtr[this->shape(1)]; }
 
-    void setZero() {
-      memset(m_values, 0, size() * sizeof(real_t));
-    }
+  void setZero() { memset(m_values, 0, size() * sizeof(real_t)); }
 
-    const real_t& operator()(uint_t row, uint_t col) const {
-      assert(col >= 0 && col < this->shape(1));
-      uint_t addr = m_colPtr[ col ];
-      uint_t stop = m_colPtr[ col+1 ];
-      while (addr < stop) {
-        if (m_rowInd[addr] == row) {
-          break;
-        }
-        ++addr;
+  const real_t& operator()(uint_t row, uint_t col) const {
+    assert(col >= 0 && col < this->shape(1));
+    uint_t addr = m_colPtr[col];
+    uint_t stop = m_colPtr[col + 1];
+    while (addr < stop) {
+      if (m_rowInd[addr] == row) {
+        break;
       }
-      assert(addr != stop);
-
-      return m_values[addr];
+      ++addr;
     }
+    assert(addr != stop);
 
-    dataref_t operator()(uint_t row, uint_t col) {
-      assert(col >= 0 && col < this->shape(1));
-      uint_t addr = m_colPtr[ col ];
-      uint_t stop = m_colPtr[ col+1 ];
-      while (addr < stop) {
-        if (m_rowInd[addr] == row) {
-          break;
-        }
-        ++addr;
-      }
-      assert(addr != stop);
+    return m_values[addr];
+  }
 
-      return m_values[addr];
+  dataref_t operator()(uint_t row, uint_t col) {
+    assert(col >= 0 && col < this->shape(1));
+    uint_t addr = m_colPtr[col];
+    uint_t stop = m_colPtr[col + 1];
+    while (addr < stop) {
+      if (m_rowInd[addr] == row) {
+        break;
+      }
+      ++addr;
     }
+    assert(addr != stop);
 
-    bool isInRange(uint_t row, uint_t col) const {
-      assert(col >= 0 && col < this->shape(1));
-      uint_t addr = m_colPtr[ col ];
-      uint_t stop = m_colPtr[ col+1 ];
-      while (addr < stop) {
-        if (m_rowInd[addr] == row) {
-          return true;
-        }
-        ++addr;
-      }
+    return m_values[addr];
+  }
 
-      return false;
+  bool isInRange(uint_t row, uint_t col) const {
+    assert(col >= 0 && col < this->shape(1));
+    uint_t addr = m_colPtr[col];
+    uint_t stop = m_colPtr[col + 1];
+    while (addr < stop) {
+      if (m_rowInd[addr] == row) {
+        return true;
+      }
+      ++addr;
     }
 
-    dataref_t operator[](const uint_t entry[2]) {
-      return operator()(entry[0], entry[1]);
-    }
+    return false;
+  }
 
-    const real_t& operator[](const uint_t entry[2]) const {
-      return operator()(entry[0], entry[1]);
-    }
+  dataref_t operator[](const uint_t entry[2]) { return operator()(entry[0], entry[1]); }
+
+  const real_t& operator[](const uint_t entry[2]) const { return operator()(entry[0], entry[1]); }
+
+  template <class view_t>
+  void copyToView(view_t& other) {
+    assert(2 == other.dim());
+    assert(this->shape(0) == other.shape(0) && this->shape(1) == other.shape(1));
 
-    template<class view_t>
-    void copyToView(view_t& other) {
-      assert(2 == other.dim());
-      assert(this->shape(0) == other.shape(0) && this->shape(1) == other.shape(1));
-
-      uint_t entry[2];
-      uint_t ncols = this->shape(1);
-      for (uint_t col = 0; col < ncols; ++col) {
-        entry[1] = col;
-        for (uint_t i = m_colPtr[col]; i < m_colPtr[col+1]; ++i) {
-          entry[0] = m_rowInd[i];
-          other[entry] = m_values[i];
-        }
+    uint_t entry[2];
+    uint_t ncols = this->shape(1);
+    for (uint_t col = 0; col < ncols; ++col) {
+      entry[1] = col;
+      for (uint_t i = m_colPtr[col]; i < m_colPtr[col + 1]; ++i) {
+        entry[0] = m_rowInd[i];
+        other[entry] = m_values[i];
       }
     }
+  }
 
   protected:
-    data_t m_values;
-    uint_t const* m_rowInd;
-    uint_t const* m_colPtr;
-  };
-}
+  data_t m_values;
+  const uint_t* m_rowInd;
+  const uint_t* m_colPtr;
+};
+} // namespace yateto
 
 #endif
diff --git a/tests/Dockerfile-setup b/tests/Dockerfile-setup
index 19d26fb..f5987ad 100644
--- a/tests/Dockerfile-setup
+++ b/tests/Dockerfile-setup
@@ -11,7 +11,7 @@ ARG GID=1000
 
 RUN addgroup --gid $GID tester
 RUN adduser --disabled-password --gecos '' --uid $UID --gid $GID tester
-RUN chown $UID:$GID /local_workspace /cache 
+RUN chown $UID:$GID /local_workspace /cache
 USER tester
 
 # copy repo from the local current directory (fetched with Jenkins) to the workdir of the image
diff --git a/tests/Jenkinsfile b/tests/Jenkinsfile
index 811d349..2a4ca99 100644
--- a/tests/Jenkinsfile
+++ b/tests/Jenkinsfile
@@ -1,27 +1,27 @@
 properties([
     parameters([string(
-                    defaultValue: 'runner', 
-                    description: 'agent name which tells where to run a job', 
+                    defaultValue: 'runner',
+                    description: 'agent name which tells where to run a job',
                     name: 'AGENT',
                     trim: true),
                 string(
-                    defaultValue: '', 
-                    description: 'target architecture (according to yateto format). If not given then taken from Jenkins env-vars', 
-                    name: 'ARCH', 
+                    defaultValue: '',
+                    description: 'target architecture (according to yateto format). If not given then taken from Jenkins env-vars',
+                    name: 'ARCH',
                     trim: true),
                 string(
-                    defaultValue: 'matmul minimal', 
-                    description: 'whitespace separate list of examples', 
-                    name: 'EXAMPLES', 
+                    defaultValue: 'matmul minimal',
+                    description: 'whitespace separate list of examples',
+                    name: 'EXAMPLES',
                     trim: true),
                 string(
-                    defaultValue: 'Eigen LIBXSMM OpenBLAS', 
-                    description: 'whitespace separate list of generators', 
-                    name: 'GENERATORS', 
+                    defaultValue: 'Eigen LIBXSMM OpenBLAS',
+                    description: 'whitespace separate list of generators',
+                    name: 'GENERATORS',
                     trim: true),
                 booleanParam(
-                    defaultValue: false, 
-                    description: 'if true the environment image will be build. Note: it will take a considerable amount of time', 
+                    defaultValue: false,
+                    description: 'if true the environment image will be build. Note: it will take a considerable amount of time',
                     name: 'BUILD_ENV_IMAGE')
     ])
 ])
@@ -52,12 +52,12 @@ pipeline {
                 }
             }
             steps {
-                // Make sure that Jenkins knows the location of Spack. 
+                // Make sure that Jenkins knows the location of Spack.
                 // You will need to add it to the Jenkins settings
                 dir("tests") {
                      script {
-                        withCredentials([usernamePassword(credentialsId: 'docker-hub', 
-                                                      usernameVariable: 'USERNAME', 
+                        withCredentials([usernamePassword(credentialsId: 'docker-hub',
+                                                      usernameVariable: 'USERNAME',
                                                       passwordVariable: 'PASSWORD')]) {
                             sh """
                             docker run --rm -v \$(pwd):/home -w /home ${USERNAME}/spack-ubuntu-1804:latest containerize > ./Dockerfile-env
@@ -79,10 +79,10 @@ pipeline {
             steps {
                 script {
                     def dockerFilePath = "tests/Dockerfile-setup"
-                    def buildParams = String.format("--no-cache --build-arg UID=%s --build-arg GID=%s --file %s .", 
+                    def buildParams = String.format("--no-cache --build-arg UID=%s --build-arg GID=%s --file %s .",
                                                     env.USER_ID, env.GROUP_ID, dockerFilePath)
                     def customImage = docker.build("yateto:latest", buildParams)
-                }                
+                }
             }
         }
         stage('RunTest') {
@@ -105,15 +105,15 @@ for example in ${EXAMPLES}; do
                 echo " Host Arch: ${ARCH}"
                 echo " Example: ${example}"
                 echo "==========================="
-                
+
                 cmake .. -DEXAMPLES=$example -DCMAKE_BUILD_TYPE=$build_type -DPRECISION=$precision -DVARIANT=$backend -DARCH=$ARCH
                 make
-                
+
                 STORAGE=/cache/$example-$backend-$precision-$build_type
                 echo $STORAGE
                 mkdir -p $STORAGE
                 cp -r ./$example/*/* $STORAGE
-                
+
                 make test
                 rm -rf ./*
             done
@@ -129,11 +129,11 @@ cmake .. && make && make test
                     writeFile(file: "run_tests.sh", text: TestScript)
                 }
                 sh "mkdir ./cache"
-                
+
                 script {
                     // define test arch. for testing
                     //  if the user specifies ARCH as parameter it is going to be used for testing
-                    //  if not, we will try to get ARCH from the Jenkins env. variables 
+                    //  if not, we will try to get ARCH from the Jenkins env. variables
                     //  if the user didn't set env.HOST_ARCH in his/her Jenkins settings, then 'noarch' will be used
                     env.TEST_ARCH="noarch"
                     if (!env.ARCH.allWhitespace) {
@@ -147,9 +147,9 @@ cmake .. && make && make test
             }
         }
     }
-    post { 
+    post {
         always {
             sh "docker image rm yateto:latest"
         }
     }
-}
\ No newline at end of file
+}
diff --git a/tests/README.md b/tests/README.md
index 0b4bc93..b949b4a 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -1,45 +1,49 @@
-## Testing
+# Testing
+
 Testing is divided into 3 parts: interface, generic and code-gen.
 
-- *interface* - yateto comes with some helper structures that 
-external projects can use. The structures are defined 
-in the *\<package root\>*/**include** directory and allow a user to copy data from one 
-tensor to another if they have the same dimensionality but 
-different sizes. For example, a target tensor may be padded along the leading dimension 
-to achieve efficient vectorization. This part of the testing is supposed to 
+- *interface* - yateto comes with some helper structures that
+external projects can use. The structures are defined
+in the *\<package root\>*/**include** directory and allow
+a user to copy data from one
+tensor to another if they have the same dimensionality but
+different sizes. For example, a target tensor may be padded along the leading dimension
+to achieve efficient vectorization. This part of the testing is supposed to
 check whether the provided structures operate correctly.
 
-- *generic* - yateto generates an optimized tensor contraction source code based on the
-*Loop over GEMM* algorithm. To achieve better performance, the generated source code 
-contains calls to optimized GEMM libraries and generators. However, yateto can 
-also generate not-optimized source code using simple *for-loops* 
+- *generic* - yateto generates optimized tensor
+contraction source code based on the
+*Loop over GEMM* algorithm. To achieve better performance, the generated source code
+contains calls to optimized GEMM libraries and generators. However, yateto can
+also generate non-optimized source code using simple *for-loops*
 which can be used either for performance comparison or for testing with respect
-to an optimized one. This part of the testing is supposed to check whether the 
+to an optimized one. This part of the testing is supposed to check whether the
 *generic* implementation of tensor contraction is numerically correct.
 
-
 - *code-gen* - checks wither an optimized tensor contraction code produces the same
 numerical results as the *generic* implementation.
 
-The testing is supposed to only be performed with **GNU** tools and, therefore, 
+The testing is supposed to only be performed with **GNU** tools and, therefore,
 no platform specific libraries (*like intel MKL*) are included. Moreover, only the
-following architectures are supported for testing: *sandy bridge, haswell, skylake, 
-ThunderX2*. 
+following architectures are supported for testing: *sandy bridge, haswell, skylake,
+ThunderX2*.
 
 In order to compile tests, make sure that you have **CxxTest** installed and visible
 in you current working environment.
 
 ## Current status
+
 |    Part   |      Status     |
 |:---------:|:---------------:|
-| interface |     1 test     |
+| interface |     1 test      |
 | generic   | not implemented |
 | code-gen  |     2 tests     |
 
-
 ## Running tests manually
+
 ### Interface
-```console
+
+```bash
 cd mkdir interface/build && cd interface/build
 cmake ..
 make
@@ -47,29 +51,39 @@ ctest
 ```
 
 ### Code-gen
-Code-gen allows a user to test yateto with one of the following GEMM libraries/generators: Eigen, OpenBLAS,  LIBXSMM. Make sure that you have them installed on your system and visible in your current working environment.
 
-##### CMake options
+Code-gen allows a user to test yateto with one of the
+following GEMM libraries/generators: Eigen, OpenBLAS,
+LIBXSMM. Make sure that you have them installed on your
+system and visible in your current working environment.
+
+#### CMake options
+
 | CMake Variable Name |  Type  |         Allowed Values         |
 |:-------------------:|:------:|:------------------------------:|
-| ARCH                | string |    snb / hsw / skx / thunderx2t99    |
-| EXAMPLES            |  list  | matmul / minimal / matmult;minimal |
-| PRECISION           | string |          double / single         |
-| VARIANT             | string |     Eigen / OpenBLAS / LIBXSMM     |
+| ARCH                | string |    snb / hsw / skx / thunderx2t99   |
+| EXAMPLES            |  list  | matmul / minimal / matmul;minimal   |
+| PRECISION           | string |          double / single            |
+| VARIANT             | string |     Eigen / OpenBLAS / LIBXSMM / PSpaMM  |
+
+#### Default
 
-##### Default
-Uses: **haswell** architecture, **matmul** and **minimal** as examples, **Eigen** 
+Uses: **haswell** architecture, **matmul** and **minimal** as examples, **Eigen**
 as a GEMM implementation, **double** precision.
-```console
+
+```bash
 cd mkdir code-gen/build && cd code-gen/build
 cmake ..
 make
 ctest
 ```
-##### A Specific Example
-For **haswell** architecture with **single** precision and **libxsmm** 
+
+#### A Specific Example
+
+For **haswell** architecture with **single** precision and **libxsmm**
 as a GEMM generator.
-```console
+
+```bash
 cd mkdir code-gen/build && cd code-gen/build
 cmake .. -DPRECISION=single -DVARIANT=LIBXSMM
 make
@@ -77,12 +91,8 @@ ctest
 ```
 
 ## Running tests automatically
-The following [pipeline](Jenkinsfile) has been implemented to run the aforementioned tests automatically. As a regular user, you can see results of the last few runs of the pipeline [here](http://vmbungartz10.informatik.tu-muenchen.de/seissol/view/Yateto/job/yateto-codegen/). 
-
-You can trigger the pipeline and thus run all tests if you a member of SeisSol in github. To achive this, please, perform the following steps:
 
-- open this [page](http://vmbungartz10.informatik.tu-muenchen.de/seissol/view/Yateto/job/yateto-codegen/)
-- click on `log in` button at the top right corner and follow the authentication procedure
-- click on `Build with Parameters` button. You will be forwarded to the next page where you can adjust parameters. We do not recommend to make any changes in `AGENT` and `BUILD_ENV_IMAGE` fields
-- click on `Build` to trigger the pipeline. 
-- After that, you will see a new flashing entry at the very top of `Build History` field. If you want to see a detail status information about all steps involved in the pipeline then click on a dropdown widget of the flashing entry and select `Console Output`
\ No newline at end of file
+See the `.github/workflows` folder for the
+workflow file that runs the tests automatically.
+The local `Jenkinsfile` exists as well,
+but is probably outdated as of now (early 2026).
diff --git a/tests/code-gen/cmake/FindLIBXSMM.cmake b/tests/code-gen/cmake/FindLIBXSMM.cmake
index ced8a8b..10a0870 100644
--- a/tests/code-gen/cmake/FindLIBXSMM.cmake
+++ b/tests/code-gen/cmake/FindLIBXSMM.cmake
@@ -34,4 +34,4 @@ else()
 endif()
 mark_as_advanced(LIBXSMM_INCLUDE_DIRS LIBXSMM_LIBRARIES)
 
-find_package_handle_standard_args(LIBXSMM REQUIRED_VARS LIBXSMM_INCLUDE_DIRS LIBXSMM_LIBRARIES)
\ No newline at end of file
+find_package_handle_standard_args(LIBXSMM REQUIRED_VARS LIBXSMM_INCLUDE_DIRS LIBXSMM_LIBRARIES)
diff --git a/tests/code-gen/cmake/FindLibxsmm_executable.cmake b/tests/code-gen/cmake/FindLibxsmm_executable.cmake
index a3bb50b..00fdf7c 100644
--- a/tests/code-gen/cmake/FindLibxsmm_executable.cmake
+++ b/tests/code-gen/cmake/FindLibxsmm_executable.cmake
@@ -24,4 +24,4 @@ find_program(Libxsmm_executable_PROGRAM libxsmm_gemm_generator
         )
 
 find_package_handle_standard_args(Libxsmm_executable
-        REQUIRED_VARS Libxsmm_executable_PROGRAM)
\ No newline at end of file
+        REQUIRED_VARS Libxsmm_executable_PROGRAM)
diff --git a/tests/code-gen/generator.py b/tests/code-gen/generator.py
index 4cd21c3..2ec8f0a 100755
--- a/tests/code-gen/generator.py
+++ b/tests/code-gen/generator.py
@@ -62,4 +62,4 @@
   for kernel in g.kernels():
     d = os.path.join(outDir, kernel.name)
     os.makedirs(d, exist_ok=True)
-    PrintEquivalentSparsityPatterns(d).visit(kernel.ast)
\ No newline at end of file
+    PrintEquivalentSparsityPatterns(d).visit(kernel.ast)
diff --git a/tests/code-gen/stock.py b/tests/code-gen/stock.py
index c50e644..e530d05 100755
--- a/tests/code-gen/stock.py
+++ b/tests/code-gen/stock.py
@@ -41,4 +41,3 @@ def add(g):
 
       stock = R['ijk'] <= S['xyz'] * XLTP['lx'] * XRTP['il'] * YL['ym'] * YR['mj'] * ZL['zn'] * ZR['nk']
       g.add('stock{}_trans_pad'.format(pqx), stock)
-
diff --git a/tests/interface/CMakeLists.txt b/tests/interface/CMakeLists.txt
index acadc58..80f00ad 100644
--- a/tests/interface/CMakeLists.txt
+++ b/tests/interface/CMakeLists.txt
@@ -9,7 +9,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
 
 enable_testing()
 # generate and add an interface test
- add_custom_command(COMMAND ${CXXTEST_PYTHON_TESTGEN_EXECUTABLE} 
+ add_custom_command(COMMAND ${CXXTEST_PYTHON_TESTGEN_EXECUTABLE}
                             --error-printer -o TensorView.t.cpp ${CMAKE_SOURCE_DIR}/TensorView.t.h
                     OUTPUT  TensorView.t.cpp
                     COMMENT "Generating a test runner")
diff --git a/tests/interface/TensorView.t.h b/tests/interface/TensorView.t.h
index 6487c93..215bd0e 100644
--- a/tests/interface/TensorView.t.h
+++ b/tests/interface/TensorView.t.h
@@ -3,57 +3,52 @@
 
 using namespace yateto;
 
-class TensorViewTestSuite : public CxxTest::TestSuite
-{
-private:
+class TensorViewTestSuite : public CxxTest::TestSuite {
+  private:
   double m_data[12];
 
-public:
-  void setUp()
-  {
+  public:
+  void setUp() {
     for (int i = 0; i < 12; ++i) {
-      m_data[i] = static_cast<double>(i+1);
+      m_data[i] = static_cast<double>(i + 1);
     }
   }
 
-  void testAccess()
-  {
-    DenseTensorView<3, double> tensor(m_data, {3,2,2});
-    TS_ASSERT_EQUALS(tensor(0,0,0), 1.0);
-    TS_ASSERT_EQUALS(tensor(1,1,0), 5.0);
-    TS_ASSERT_EQUALS(tensor(2,1,1), 12.0);
+  void testAccess() {
+    DenseTensorView<3, double> tensor(m_data, {3, 2, 2});
+    TS_ASSERT_EQUALS(tensor(0, 0, 0), 1.0);
+    TS_ASSERT_EQUALS(tensor(1, 1, 0), 5.0);
+    TS_ASSERT_EQUALS(tensor(2, 1, 1), 12.0);
   }
 
-	void testSubtensor()
-	{
-    DenseTensorView<3, double> tensor(m_data, {3,2,2});
+  void testSubtensor() {
+    DenseTensorView<3, double> tensor(m_data, {3, 2, 2});
     auto sub = tensor.subtensor(1, slice<>(), slice<>());
-    TS_ASSERT_EQUALS(sub(0,0), 2.0);
-    TS_ASSERT_EQUALS(sub(1,0), 5.0);
-    TS_ASSERT_EQUALS(sub(0,1), 8.0);
-    TS_ASSERT_EQUALS(sub(1,1), 11.0);
+    TS_ASSERT_EQUALS(sub(0, 0), 2.0);
+    TS_ASSERT_EQUALS(sub(1, 0), 5.0);
+    TS_ASSERT_EQUALS(sub(0, 1), 8.0);
+    TS_ASSERT_EQUALS(sub(1, 1), 11.0);
 
     auto sub2 = sub.subtensor(1, slice<>());
     TS_ASSERT_EQUALS(sub2(0), 5.0);
     TS_ASSERT_EQUALS(sub2(1), 11.0);
 
-    auto sub3 = tensor.subtensor(slice<>(1,3), slice<>(), slice<>()); 
-    TS_ASSERT_EQUALS(sub3(0,0,0), 2.0);
-    TS_ASSERT_EQUALS(sub3(0,1,0), 5.0);
-    TS_ASSERT_EQUALS(sub3(1,0,1), 9.0);
-    TS_ASSERT_EQUALS(sub3(1,1,1), 12.0);
-	}
+    auto sub3 = tensor.subtensor(slice<>(1, 3), slice<>(), slice<>());
+    TS_ASSERT_EQUALS(sub3(0, 0, 0), 2.0);
+    TS_ASSERT_EQUALS(sub3(0, 1, 0), 5.0);
+    TS_ASSERT_EQUALS(sub3(1, 0, 1), 9.0);
+    TS_ASSERT_EQUALS(sub3(1, 1, 1), 12.0);
+  }
 
-  void testSetZero()
-  {
-    DenseTensorView<3, double> tensor(m_data, {3,2,2});
+  void testSetZero() {
+    DenseTensorView<3, double> tensor(m_data, {3, 2, 2});
     auto sub = tensor.subtensor(1, slice<>(), slice<>());
     sub.setZero();
     for (int i = 0; i < 12; ++i) {
-      if ((i-1) % 3 == 0) {
+      if ((i - 1) % 3 == 0) {
         TS_ASSERT_EQUALS(m_data[i], 0.0);
       } else {
-        TS_ASSERT_EQUALS(m_data[i], static_cast<double>(i+1));
+        TS_ASSERT_EQUALS(m_data[i], static_cast<double>(i + 1));
       }
     }
   }
diff --git a/tests/spack.yaml b/tests/spack.yaml
index 9e8d72a..0b328cc 100644
--- a/tests/spack.yaml
+++ b/tests/spack.yaml
@@ -1,11 +1,11 @@
 spack:
   definitions:
-  - packages: 
+  - packages:
     - eigen@3.3.7
     - openblas@0.3.12
     - libxsmm+generator+shared@master
     - cmake@3.16.0
-  
+
   specs:
   - matrix:
     - [\$packages]
diff --git a/yateto/arch.py b/yateto/arch.py
index 07b1b2b..ff99cbc 100644
--- a/yateto/arch.py
+++ b/yateto/arch.py
@@ -114,7 +114,7 @@ def formatConstant(self, constant):
 
   def onHeap(self, numReals):
     return (numReals * self.bytesPerReal) > self._tmpStackLimit
-  
+
   def __eq__(self, other):
     return self.name == other.name
 
@@ -194,7 +194,7 @@ def getHeterogeneousArchitectureIdentifiedBy(host_arch, device_arch, device_back
 
   if device_arch.startswith('sm_'):
     alignment = 64
-  elif device_arch.startswith('gfx'): 
+  elif device_arch.startswith('gfx'):
     alignment = 128
   elif re.match(r"\d+_\d+_\d+", device_arch):
     alignment = 32
@@ -230,10 +230,10 @@ def deriveArchitecture(host_def: HostArchDefinition, device_def: Union[DeviceArc
     alignment = host_def.alignment
   if host_def.prefetch is not None:
     prefetch = host_def.prefetch
-  
+
   if alignment is None:
     raise NotImplementedError(f'The architecture {host_def.archname} is unknown to Yateto, and no custom alignment was given')
-  
+
   if prefetch is None:
     raise NotImplementedError(f'The architecture {host_def.archname} is unknown to Yateto, and no custom prefetching info was given')
 
diff --git a/yateto/aspp.py b/yateto/aspp.py
index 2b10d09..e2e21b5 100644
--- a/yateto/aspp.py
+++ b/yateto/aspp.py
@@ -41,7 +41,7 @@ def reshape(self, shape):
   @abstractmethod
   def transposed(self, shape):
     pass
-  
+
   @abstractmethod
   def broadcast(self, shape):
     pass
@@ -77,7 +77,7 @@ def reshape(self, shape):
 
   def transposed(self, perm):
     return type(self)(tuple(self.shape[p] for p in perm))
-  
+
   def broadcast(self, bcst):
     return type(self)(tuple(shp * bc for shp, bc in zip(self.shape, bcst)))
 
@@ -99,7 +99,7 @@ def einsum(description, a1, a2):
       sz1 = {i: a1.shape[A.find(i)] for i in A}
       sz2 = {i: a2.shape[B.find(i)] for i in B}
       intersect = filter(lambda x: x in sz1, sz2.keys())
-      assert all([sz1[i] == sz2[i] for i in intersect])    
+      assert all([sz1[i] == sz2[i] for i in intersect])
       sz1.update(sz2)
       return dense(tuple(sz1[i] for i in C))
     else:
@@ -166,7 +166,7 @@ def reshape(self, shape):
 
   def transposed(self, perm):
     return type(self)(self.pattern.transpose(perm).copy(order=self.NUMPY_DEFAULT_ORDER))
-  
+
   def broadcast(self, bcst):
     return type(self)(np.tile(self.pattern, bcst).copy(order=self.NUMPY_DEFAULT_ORDER))
 
diff --git a/yateto/ast/__init__.py b/yateto/ast/__init__.py
index 8b13789..e69de29 100644
--- a/yateto/ast/__init__.py
+++ b/yateto/ast/__init__.py
@@ -1 +0,0 @@
-
diff --git a/yateto/ast/cost.py b/yateto/ast/cost.py
index 7d6d83b..3d5dbfe 100644
--- a/yateto/ast/cost.py
+++ b/yateto/ast/cost.py
@@ -25,7 +25,7 @@ def estimate_Product(self, node):
     for size in node.shape():
       cost *= size
     return cost
-  
+
   def estimate_IndexSum(self, node):
     cost = node.sumIndex().shape()[0] - 1
     for size in node.indices.shape():
@@ -36,7 +36,7 @@ def estimate_IndexSum(self, node):
 class CachedCostEstimator(CostEstimator):
   def __init__(self):
     self._cost = dict()
-  
+
   def estimate(self, node):
     if node in self._cost:
       return self._cost[node]
@@ -95,7 +95,7 @@ def __init__(self):
     super().__init__()
     self._lead_dim = 0
     self._loaded_to_gpu_cache = {}
-  
+
   def generic_estimate(self, node):
     result = super().generic_estimate(node)
     self._loaded_to_gpu_cache[node] = set()
@@ -171,14 +171,14 @@ def __init__(self):
   def generic_estimate(self, node):
     self._cache[node] = node.eqspp()
     return 0
-  
+
   def estimate_Product(self, node):
     spp = node.computeSparsityPattern(self._cache[node.leftTerm()], self._cache[node.rightTerm()])
     self._cache[node] = spp
     return spp.count_nonzero()
-  
+
   def estimate_IndexSum(self, node):
     termSpp = self._cache[node.term()]
     spp = node.computeSparsityPattern(termSpp)
-    self._cache[node] = spp    
+    self._cache[node] = spp
     return termSpp.count_nonzero() - spp.count_nonzero()
diff --git a/yateto/ast/indices.py b/yateto/ast/indices.py
index 6f2ab02..7ac7ed6 100644
--- a/yateto/ast/indices.py
+++ b/yateto/ast/indices.py
@@ -6,72 +6,72 @@ class Indices(object):
   def __init__(self, indexNames = '', shape = ()):
     self._indices = tuple(indexNames)
     self._size = dict()
-    
+
     assert len(self._indices) == len(set(self._indices)), 'Repeated indices are not allowed ({}).'.format(indexNames)
     assert len(self._indices) == len(shape), 'Indices {} do not match tensor shape {}.'.format(str(self), shape)
 
     self._size = {self._indices[i]: size for i, size in enumerate(shape)}
-  
+
   def tostring(self):
     return ''.join(self._indices)
-  
+
   def extract(self, indexNames):
     return Indices(str(indexNames), self.subShape(indexNames))
-  
+
   def firstIndex(self):
     return self.extract(self._indices[0])
 
   def shape(self):
     return self.subShape(self._indices)
-  
+
   def subShape(self, indexNames):
     return tuple(self._size[index] for index in indexNames)
 
   def indexSize(self, index):
     return self._size[index]
-  
+
   def permuted(self, indexNames):
     assert set(indexNames) == set(self)
     return Indices(indexNames, self.subShape(indexNames))
-    
+
   def find(self, index):
     assert len(index) == 1
     return self._indices.index(index)
-  
+
   def positions(self, I, sort=True):
     pos = [self.find(i) for i in I]
     if sort:
       return sorted(pos)
     return pos
-  
+
   def __eq__(self, other):
     return other != None and self._indices == other._indices and self._size == other._size
-    
+
   def __ne__(self, other):
     return other == None or self._indices != other._indices or self._size != other._size
-  
+
   def __hash__(self):
     return hash((self._indices, self.shape()))
-  
+
   def __iter__(self):
     return iter(self._indices)
-  
+
   def __getitem__(self, key):
     return self._indices[key]
-    
+
   def __len__(self):
     return len(self._indices)
-  
+
   def __and__(self, other):
     return set(self) & set(other)
-  
+
   def __rand__(self, other):
     return self & other
-    
+
   def __le__(self, other):
     indexNamesContained = set(self._indices) <= set(other._indices)
     return indexNamesContained and all(self._size[index] == other._size[index] for index in self._indices)
-  
+
   def __sub__(self, other):
     indexNames = [index for index in self._indices if index not in other]
     return Indices(indexNames, self.subShape(indexNames))
@@ -80,7 +80,7 @@ def merged(self, other):
     indexNames = self._indices + other._indices
     shape = self.subShape(self._indices) + other.subShape(other._indices)
     return Indices(indexNames, shape)
-  
+
   def mergeStrict(self, other):
     indexNames = list(self._indices)
     shape = list(self.subShape(self._indices))
@@ -94,17 +94,17 @@ def mergeStrict(self, other):
         myShp = shape[myPos]
         assert myShp == shp, f"Index merge failed. {self} vs. {other} in {idx}: {myShp} vs. {shp}"
     return Indices(tuple(indexNames), tuple(shape))
-    
+
   def sorted(self):
     indexNames = sorted(self._indices)
     return Indices(indexNames, self.subShape(indexNames))
-  
+
   def __str__(self):
     return self.tostring()
-    
+
   def __repr__(self):
     return '({})'.format(','.join('{}={}'.format(index, self._size[index]) for index in self._indices))
-  
+
   def size(self):
     return self._size
 
@@ -112,31 +112,31 @@ class Range(object):
   def __init__(self, start, stop):
     self.start = start
     self.stop = stop
-  
+
   def size(self):
     return self.stop - self.start
-  
+
   def aligned(self, arch):
     return Range(arch.alignedLower(self.start), arch.alignedUpper(self.stop))
-  
+
   def __and__(self, other):
     return Range(max(self.start, other.start), min(self.stop, other.stop))
 
   def __or__(self, other):
     return Range(min(self.start, other.start), max(self.stop, other.stop))
-  
+
   def __contains__(self, other):
     return self.start <= other.start and self.stop >= other.stop
-  
+
   def __eq__(self, other):
     return self.start == other.start and self.stop == other.stop
-  
+
   def __str__(self):
     return 'Range({}, {})'.format(self.start, self.stop)
-  
+
   def __iter__(self):
     return iter(range(self.start, self.stop))
-      
+
 class BoundingBox(object):
   def __init__(self, listOfRanges):
     self._box = listOfRanges
@@ -150,7 +150,7 @@ def size(self):
     for r in self._box:
       s *= r.size()
     return s
-  
+
   def __contains__(self, entry):
     if len(entry) != len(self):
       return False
@@ -159,24 +159,24 @@ def __contains__(self, entry):
     if isinstance(entry[0], Range):
       return all(e in self[i] for i,e in enumerate(entry))
     return all(e >= self[i].start and e <= self[i].stop for i,e in enumerate(entry))
-  
+
   def __getitem__(self, key):
     return self._box[key]
-  
+
   def __len__(self):
     return len(self._box)
-    
+
   def __iter__(self):
     return iter(self._box)
-  
+
   def __eq__(self, other):
     return all(s == o for s,o in zip(self,other))
-  
+
   def __str__(self):
     return '{}({})'.format(type(self).__name__, ', '.join(str(r) for r in self))
 
 @functools.total_ordering
-class LoGCost(object):    
+class LoGCost(object):
   def __init__(self, stride = sys.maxsize, leftTranspose = sys.maxsize, rightTranspose = sys.maxsize, fusedIndices = 0):
     """
     stride (w.r.t. first dimension): 0 = unit stride, 1 non-unit stride (lower is better)
@@ -187,15 +187,15 @@ def __init__(self, stride = sys.maxsize, leftTranspose = sys.maxsize, rightTrans
     self._leftTranspose = leftTranspose
     self._rightTranspose = rightTranspose
     self._fusedIndices = fusedIndices
-  
+
   @staticmethod
   def addIdentity():
     return LoGCost(0, 0, 0, 0)
-    
+
   def _totuple(self):
     # minus sign before _fusedIndices as higher is better
     return (self._stride, self._leftTranspose + self._rightTranspose, -self._fusedIndices)
-  
+
   def __lt__(self, other):
     s = self._totuple()
     o = other._totuple()
@@ -205,9 +205,9 @@ def __lt__(self, other):
 
   def __eq__(self, other):
     return self._totuple() == other._totuple() and self._leftTranspose == other._leftTranspose
-  
+
   def __add__(self, other):
     return LoGCost(self._stride + other._stride, self._leftTranspose + other._leftTranspose, self._rightTranspose + other._rightTranspose, self._fusedIndices + other._fusedIndices)
-  
+
   def __repr__(self):
     return '{{stride: {}, left transpose: {}, right transpose: {}, fused indices: {}}}'.format(self._stride, self._leftTranspose, self._rightTranspose, self._fusedIndices)
diff --git a/yateto/ast/log.py b/yateto/ast/log.py
index c48f201..1f397fd 100644
--- a/yateto/ast/log.py
+++ b/yateto/ast/log.py
@@ -19,14 +19,14 @@ def fusedVariants(memLayout, I, P, M, prune = False):
   D = set(s for g in groupStrings for s in allSubstrings(g))
   if prune:
     D = set(d for d in D if d[0] == M[0])
-  D = set(d for d in D if memLayout.mayFuse(sorted(P[i] for i in d)))  
+  D = set(d for d in D if memLayout.mayFuse(sorted(P[i] for i in d)))
   return D
 
 def LoG(contraction, Aperm = None, Bperm = None, Cperm = None):
   L = contraction.leftTerm()
   R = contraction.rightTerm()
   I = contraction
-  
+
   if Aperm is not None:
     L = copy.copy(L)
     L.setIndexPermutation(Aperm, permuteEqspp=False)
@@ -53,7 +53,7 @@ def LoG(contraction, Aperm = None, Bperm = None, Cperm = None):
   Im = (set(A) & set(C)) - Icommon
   In = (set(B) & set(C)) - Icommon
   Ik = (set(A) & set(B)) - Icommon
-  
+
   PA = {idx: pos for pos, idx in enumerate(A)}
   PB = {idx: pos for pos, idx in enumerate(B)}
   PC = {idx: pos for pos, idx in enumerate(C)}
@@ -64,7 +64,7 @@ def LoG(contraction, Aperm = None, Bperm = None, Cperm = None):
   AK = fusedVariants(L.memoryLayout(), Ik, PA, A)
   BK = fusedVariants(R.memoryLayout(), Ik, PB, B)
   BN = fusedVariants(R.memoryLayout(), In, PB, B)
-  
+
   MC = CM & AM
   NC = CN & BN
   KC = AK & BK
diff --git a/yateto/ast/node.py b/yateto/ast/node.py
index 951ccfc..435ca9f 100644
--- a/yateto/ast/node.py
+++ b/yateto/ast/node.py
@@ -11,38 +11,38 @@ def __init__(self):
     self._children = []
     self._eqspp = None
     self.prefetch = None
-  
+
   def size(self):
     return self.indices.size()
-  
+
   def shape(self):
     return self.indices.shape()
-  
+
   @abstractmethod
   def nonZeroFlops(self):
     pass
 
   def __iter__(self):
     return iter(self._children)
-  
+
   def __getitem__(self, key):
     return self._children[key]
-  
+
   def __len__(self):
     return len(self._children)
-  
+
   def setChildren(self, children):
     self._children = children
 
   def eqspp(self):
     return self._eqspp
-  
+
   def setEqspp(self, spp):
     self._eqspp = spp
 
   def boundingBox(self):
     return BoundingBox.fromSpp(self._eqspp)
-  
+
   @abstractmethod
   def memoryLayout(self):
     pass
@@ -63,11 +63,11 @@ def setIndexPermutation(self, indices, permuteEqspp=True):
   def permute(self, indices, spp, strict=True):
     perm = tuple(indices.find(idx) for idx in self.indices if idx in indices or strict)
     return spp.transposed(perm)
-  
+
   def reshape(self, indices, spp):
     rshp = [indices.indexSize(idx) if idx in indices else 1 for idx in self.indices]
     return spp.reshape(rshp)
-  
+
   def broadcast(self, indices, spp):
     reshaped = self.reshape(indices, spp)
     bcst = [1 if idx in indices else self.indices.indexSize(idx) for idx in self.indices]
@@ -102,34 +102,34 @@ def __mul__(self, other):
       other.setTerm(self * other.term())
       return other
     return self._binOp(other, Einsum)
-  
+
   def __rmul__(self, other):
     return self.__mul__(other)
-  
+
   def __add__(self, other):
     if not isinstance(other, Node):
       raise ValueError('Unsupported operation: Cannot add {} to {}.'.format(self, other))
     return self._binOp(other, Add)
-  
+
   def __radd__(self, other):
     return self.__add__(other)
-  
+
   def __neg__(self):
     self._checkMultipleScalarMults()
     return ScalarMultiplication(-1.0, self)
 
   def __sub__(self, other):
     return self._binOp(-other, Add)
-    
+
   def __le__(self, other):
     return Assign(self, other)
-  
+
   def subslice(self, index, start, end):
     return SliceView(self, index, start, end)
-  
+
   def subselect(self, index, position):
     return SliceView(self, index, position, position + 1)
-  
+
   def viewed(self):
     return self
 
@@ -140,35 +140,35 @@ def __init__(self, subnode, index, start, end):
     self.index = index
     self.start = start
     self.end = end
-  
+
   def name(self):
     return self.term().name()
-  
+
   def viewed(self):
     return self.term().viewed()
-  
+
   def term(self):
     return self[0]
-  
+
   def nonZeroFlops(self):
     return 0
-  
+
   def setIndexPermutation(self, indices, permuteEqspp=True):
     assert str(indices) == str(self.indices)
 
   def memoryLayout(self):
     return self._memoryLayout
-  
+
   def getMemoryLayout(self, memoryLayout):
     return memoryLayout.subslice(list(self.indices).index(self.index), self.start, self.end)
 
   def computeMemoryLayout(self):
     self._memoryLayout = self.getMemoryLayout(self.term().memoryLayout())
-  
+
   def computeSparsityPattern(self, *spps):
     assert len(spps) in (0, 1)
     spp = spps[0] if len(spps) == 1 else self.term().eqspp()
-    
+
     if isinstance(spp, aspp.dense):
       nowshape = spp.shape
       subshape = tuple(self.end - self.start if self.indices[i] == self.index else nowshape[i] for i in range(spp.ndim))
@@ -177,7 +177,7 @@ def computeSparsityPattern(self, *spps):
       subslice = tuple(slice(self.start, self.end) if self.indices[i] == self.index else slice(None) for i in range(spp.ndim))
       subarray = spp.as_ndarray()[subslice]
       return aspp.general(subarray)
-  
+
   def __str__(self):
     return f'{type(self).__name__}[{self.index}: {self.start}..{self.end}]'
 
@@ -186,19 +186,19 @@ def __init__(self, tensor, indexNames):
     super().__init__()
     self.tensor = tensor
     self.indices = Indices(indexNames, self.tensor.shape())
-  
+
   def nonZeroFlops(self):
     return 0
-  
+
   def setIndexPermutation(self, indices, permuteEqspp=True):
     assert str(indices) == str(self.indices)
-  
+
   def spp(self, groupSpp=True):
     return self.tensor.spp(groupSpp)
-  
+
   def name(self):
     return self.tensor.name()
-  
+
   def memoryLayout(self):
     return self.tensor.memoryLayout()
 
@@ -216,7 +216,7 @@ def __init__(self, *args):
     super().__init__()
     self._children = list(args)
     self._memoryLayout = None
-  
+
   def memoryLayout(self):
     return self._memoryLayout
 
@@ -255,17 +255,17 @@ def setIndexPermutation(self, indices, permuteEqspp=True):
     if self._memoryLayout is not None:
       self._memoryLayout = self._memoryLayout.permuted(p)
     self.indices = self.indices.permuted(indices)
-  
+
   def __str__(self):
     return '{}[{}]'.format(type(self).__name__, self.indices if self.indices != None else '<not deduced>')
-  
+
   def computeSparsityPattern(self, *spps):
     raise NotImplementedError
 
 class Einsum(Op):
   def nonZeroFlops(self):
     raise NotImplementedError
-    
+
 class Add(Op):
   def computeSparsityPattern(self, *spps):
     if len(spps) == 0:
@@ -276,7 +276,7 @@ def computeSparsityPattern(self, *spps):
       add_spp = permute_summand(i)
       spp = aspp.add(spp, add_spp)
     return spp
-  
+
   def nonZeroFlops(self):
     nzFlops = 0
     for child in self:
@@ -296,7 +296,7 @@ def __init__(self, scalar, term):
 
   def fixedIndexPermutation(self):
     return self.term().fixedIndexPermutation()
-  
+
   def setTerm(self, term):
     self._children[0] = term
     if self.fixedIndexPermutation():
@@ -312,7 +312,7 @@ def is_constant(self):
 
   def scalar(self):
     return self._scalar
-  
+
   def computeSparsityPattern(self, *spps):
     if len(spps) == 0:
       return self.term().eqspp()
@@ -323,20 +323,20 @@ def nonZeroFlops(self):
     if self._isConstant and self._scalar in [-1.0, 1.0]:
       return 0
     return self.eqspp().count_nonzero()
-  
+
   def __str__(self):
     return '{}: {}'.format(super().__str__(), str(self._scalar))
 
 class BinOp(Op):
   def __init__(self, lTerm, rTerm):
     super().__init__(lTerm, rTerm)
-  
+
   def leftTerm(self):
     return self._children[0]
-  
+
   def rightTerm(self):
     return self._children[1]
-  
+
   def setChildren(self, children):
     if len(children) != 2:
       raise ValueError('BinOp node must have exactly 2 children.')
@@ -347,10 +347,10 @@ def setChildren(self, children):
     if not isinstance(children[0].viewed(), IndexedTensor):
       raise ValueError('First child of Assign node must be an IndexedTensor: ' + str(children[0].viewed()))
     super().setChildren(children)
-    
+
   def nonZeroFlops(self):
     return 0
-  
+
   def computeSparsityPattern(self, *spps):
     spp = spps[1] if len(spps) == 2 else self.rightTerm().eqspp()
     return self.broadcast(self.rightTerm().indices, self.permute(self.rightTerm().indices, spp, False))
@@ -370,7 +370,7 @@ def computeSparsityPattern(self, *spps):
     assert len(spps) <= 1
     spp = spps[0] if len(spps) == 1 else self.term().eqspp()
     return self.permute(self.term().indices, spp)
-  
+
   @classmethod
   def subPermute(cls, term, indices):
     subIndexNames = [idx for idx in indices if idx in term.indices]
@@ -408,10 +408,10 @@ def __init__(self, lTerm, rTerm):
     assert lTerm.indices.subShape(K) == rTerm.indices.subShape(K)
 
     self.indices = lTerm.indices.merged(rTerm.indices - K)
-  
+
   def nonZeroFlops(self):
     return self.eqspp().count_nonzero()
-  
+
   def computeSparsityPattern(self, *spps):
     if len(spps) == 0:
       spps = [node.eqspp() for node in self]
@@ -423,13 +423,13 @@ def __init__(self, term, sumIndex):
     super().__init__(term)
     self.indices = term.indices - set([sumIndex])
     self._sumIndex = term.indices.extract(sumIndex)
-  
+
   def nonZeroFlops(self):
     return self.term().eqspp().count_nonzero() - self.eqspp().count_nonzero()
-  
+
   def sumIndex(self):
     return self._sumIndex
-  
+
   def computeSparsityPattern(self, *spps):
     assert len(spps) <= 1
     spp = spps[0] if len(spps) == 1 else self.term().eqspp()
@@ -446,7 +446,7 @@ def __init__(self, indices, lTerm, rTerm, sumIndices):
 
   def nonZeroFlops(self):
     raise NotImplementedError
-  
+
   def computeSparsityPattern(self, *spps):
     if len(spps) == 0:
       spps = [node.eqspp() for node in self]
@@ -486,13 +486,13 @@ def nonZeroFlops(self):
     p = Product(self.leftTerm(), self.rightTerm())
     p.setEqspp( p.computeSparsityPattern() )
     return 2*p.nonZeroFlops() - self.eqspp().count_nonzero()
-  
+
   def computeSparsityPattern(self, *spps):
     if len(spps) == 0:
       spps = [node.eqspp() for node in self]
     assert len(spps) == 2
     return _productContractionLoGSparsityPattern(self, *spps)
-  
+
   def cost(self):
     A = self.leftTerm().indices
     B = self.rightTerm().indices
@@ -500,13 +500,13 @@ def cost(self):
     BstrideOne = (B.find(self._k[0]) == 0) if not self._transB else (B.find(self._n[0]) == 0)
     cost = LoGCost(int(not AstrideOne) + int(not BstrideOne), int(self._transA), int(self._transB), len(self._m) + len(self._n) + len(self._k))
     return cost
-  
+
   def loopIndices(self):
     i1 = self.indices - (self._m + self._n)
     i2 = (self.leftTerm().indices - (self._m + self._k)) - i1
     i3 = ((self.rightTerm().indices - (self._k + self._n)) - i1) - i2
     return i1.merged(i2).merged(i3)
-  
+
   def transA(self):
     return self._transA
 
@@ -538,7 +538,7 @@ def indexString(name, fused, indices, transpose=False):
     if batchedIndices:
       indexStr = re.sub(r'([{}])'.format(''.join(batchedIndices)), r'[\1]', indexStr)
     return '{}{}_{{{}}}'.format(name, '^T' if transpose else '', indexStr)
-  
+
   def __str__(self):
     Astr = self.indexString('A', [self._m, self._k], self.leftTerm().indices, self._transA)
     Bstr = self.indexString('B', [self._k, self._n], self.rightTerm().indices, self._transB)
@@ -577,4 +577,4 @@ def nonZeroFlops(self):
     return nzFlops
 
   def is_empty(self):
-    return len(self._children) == 0
\ No newline at end of file
+    return len(self._children) == 0
diff --git a/yateto/ast/opt.py b/yateto/ast/opt.py
index 54884f8..84034e1 100644
--- a/yateto/ast/opt.py
+++ b/yateto/ast/opt.py
@@ -3,11 +3,11 @@
 
 def strengthReduction(terms, target_indices, cost_estimator, split = 0):
   n = len(terms)
-  
+
   indexList = [index for term in terms for index in term.indices]
   uniqueIndices = set(indexList)
   summationIndices = set(index for index in uniqueIndices if indexList.count(index) == 1) - set(target_indices)
-  
+
   while len(summationIndices) != 0:
     i = split
     while i < n:
diff --git a/yateto/ast/transformer.py b/yateto/ast/transformer.py
index 08c21ef..505e6e4 100644
--- a/yateto/ast/transformer.py
+++ b/yateto/ast/transformer.py
@@ -10,7 +10,7 @@
 from .. import aspp
 
 # Similar as ast.NodeTransformer
-class Transformer(Visitor): 
+class Transformer(Visitor):
   def generic_visit(self, node, **kwargs):
     newChildren = [self.visit(child, **kwargs) for child in node]
     node.setChildren(newChildren)
@@ -20,7 +20,7 @@ class DeduceIndices(Transformer):
   def __init__(self, targetIndices: Union[str, Indices] = None):
     self._targetIndices = targetIndices
     self._indexSetVisitor = ComputeIndexSet()
-  
+
   def visit(self, node, bound=None):
     forceIndices = bound is None and self._targetIndices is not None
     if bound is None:
@@ -71,7 +71,7 @@ def visit_Einsum(self, node, bound):
     deduced = g - contractions
     node.indices = deduced.sorted()
     return node
-  
+
   def visit_Add(self, node, bound):
     for child in node:
       self.visit(child, bound)
@@ -97,7 +97,7 @@ def visit_ScalarMultiplication(self, node, bound):
     self.visit(node.term(), bound)
     node.indices = deepcopy(node.term().indices)
     return node
-  
+
   def visit_SliceView(self, node, bound):
     self.visit(node.term(), bound)
     node.indices = Indices(node.term().indices, [shape if index != node.index else (node.end - node.start) for index, shape in zip(node.term().indices, node.term().shape())])
@@ -106,7 +106,7 @@ def visit_SliceView(self, node, bound):
   def visit_Assign(self, node, bound):
     lhs = node[0]
     rhs = node[1]
-    
+
     lhsTensor = lhs.viewed()
     if not isinstance(lhsTensor, IndexedTensor):
       raise ValueError('Assign: Left-hand side must be of type IndexedTensor')
@@ -213,12 +213,12 @@ def visit_ScalarMultiplication(self, node):
     self.generic_visit(node)
     node.setEqspp(node.term().eqspp())
     return node
-  
+
   def visit_Assign(self, node):
     self.generic_visit(node)
     node.setEqspp( node.computeSparsityPattern() )
     return node
-  
+
   def getEqspp(self, terms, targetIndices):
     # Shortcut if all terms have dense eqspps
     if all(term.eqspp().is_dense() for term in terms):
@@ -230,19 +230,19 @@ def getEqspp(self, terms, targetIndices):
     minTree.setIndexPermutation(targetIndices)
     minTree = FindContractions().visit(minTree)
     return ComputeSparsityPattern(True).visit(minTree)
-  
+
   def visit_Einsum(self, node):
     self.generic_visit(node)
     terms = list(node)
     node.setEqspp( self.getEqspp(terms, node.indices) )
-    
+
     for child in node:
       child.setEqspp( self.getEqspp(terms, child.indices) )
 
     # TODO: Backtracking of equivalent sparsity pattern to children?
 
     return node
-  
+
   def visit_SliceView(self, node):
     self.generic_visit(node)
     node.setEqspp(node.computeSparsityPattern())
@@ -263,6 +263,6 @@ def generic_visit(self, node):
     node.setEqspp( node.computeSparsityPattern() )
     node.computeMemoryLayout()
     return node
-  
+
   def visit_IndexedTensor(self, node):
     return node
diff --git a/yateto/ast/visitor.py b/yateto/ast/visitor.py
index 6b16221..c333d58 100644
--- a/yateto/ast/visitor.py
+++ b/yateto/ast/visitor.py
@@ -15,7 +15,7 @@ def visit(self, node, **kwargs):
     method = 'visit_' + node.__class__.__name__
     visitor = getattr(self, method, self.generic_visit)
     return visitor(node, **kwargs)
-  
+
   def generic_visit(self, node, **kwargs):
     for child in node:
       self.visit(child, **kwargs)
@@ -37,7 +37,7 @@ def addIndent(string, indent):
 class PrettyPrinter(Visitor):
   def __init__(self):
     self._indent = 0
-    
+
   def generic_visit(self, node):
     print('  ' * self._indent + str(node))
     self._indent = self._indent + 1
@@ -54,7 +54,7 @@ def generic_visit(self, node):
     else:
       spps = [self.visit(child) for child in node]
     return node.computeSparsityPattern(*spps)
-  
+
   def visit_IndexedTensor(self, node):
     return node.eqspp()
 
@@ -89,7 +89,7 @@ def findVariants(self, node):
     for child in node:
       permutationVariants.update( self.visit(child) )
     return permutationVariants
-  
+
   def variantsFixedRootPermutation(self, node, fixedPerm, permutationVariants):
     variants = dict()
     minCost = LoGCost.addIdentity()
@@ -150,10 +150,10 @@ def visit_ScalarMultiplication(self, node):
     permutationVariants = self.visit(node.term())
     permutationVariants[node] = {key: self.Variant(variant._cost, [key]) for key,variant in permutationVariants[node.term()].items()}
     return permutationVariants
-  
+
   def visit_Product(self, node):
     return self.allPermutationsNoCostBinaryOp(node)
-    
+
   def visit_IndexSum(self, node):
     permutationVariants = self.findVariants(node)
     tV = permutationVariants[node.term()]
@@ -171,7 +171,7 @@ def visit_IndexSum(self, node):
 
   def visit_Contraction(self, node):
     permutationVariants = self.findVariants(node)
-    
+
     variants = dict()
     iterator = itertools.permutations(node.indices)
     for Cs in iterator:
@@ -228,7 +228,7 @@ def __init__(self, directory):
     self._directory = directory
     self._cmap = self.colors.ListedColormap(['white', 'black'])
     self._norm = self.colors.BoundaryNorm([0.0, 0.5, 1.0], 2, clip=True)
-  
+
   def generic_visit(self, node):
     nameFun = getattr(node, 'name', None)
     name = nameFun() if nameFun else '_result'
diff --git a/yateto/codegen/__init__.py b/yateto/codegen/__init__.py
index 8b13789..e69de29 100644
--- a/yateto/codegen/__init__.py
+++ b/yateto/codegen/__init__.py
@@ -1 +0,0 @@
-
diff --git a/yateto/codegen/cache.py b/yateto/codegen/cache.py
index d2af0e2..8369ebb 100644
--- a/yateto/codegen/cache.py
+++ b/yateto/codegen/cache.py
@@ -3,14 +3,14 @@
 class RoutineGenerator(object):
   def __call__(self, routineName, fileName):
     pass
-  
+
   def target(self):
     return 'cpu'
 
 class GpuRoutineGenerator(object):
   def __call__(self, routineName, fileName):
     pass
-  
+
   def target(self):
     return 'gpu'
 
@@ -18,17 +18,17 @@ class RoutineCache(object):
   def __init__(self):
     self._routines = dict()
     self._generators = dict()
-  
+
   def addRoutine(self, name, generator):
     if name in self._routines and not self._routines[name] == generator:
       raise RuntimeError(f'`{name}` is already in RoutineCache but the generator is not equal. '
                          f'(That is, a name was given twice for different routines.)')
     self._routines[name] = generator
-    
+
     generatorName = type(generator).__name__
     if generatorName not in self._generators:
       self._generators[generatorName] = generator
-  
+
   def generate(self, header, cppFileName, gpuFileName):
     with Cpp(gpuFileName) as gpucpp:
       with Cpp(cppFileName) as cpp:
diff --git a/yateto/codegen/code.py b/yateto/codegen/code.py
index da6f93c..31e7713 100644
--- a/yateto/codegen/code.py
+++ b/yateto/codegen/code.py
@@ -45,13 +45,13 @@ def __enter__(self):
 
   def __exit__(self, type, value, traceback):
     pass
-  
+
 class Block:
   def __init__(self, writer, argument, foot = ''):
     self.writer = writer
     self.argument = argument
     self.foot = foot
-    
+
   def __enter__(self):
     space = ' ' if self.argument else ''
     self.writer(self.argument + space + '{')
@@ -69,36 +69,36 @@ def __init__(self, writer, arguments, foot=None):
       self.foot = [''] * len(self.arguments)
     else:
       self.foot = foot
-  
+
   def __enter__(self):
     for arg in self.arguments:
       self.writer(arg + ' {')
       self.writer.indent += 1
-  
+
   def __exit__(self, type, value, traceback):
     # Blocks are closed in reverse order, thus reverse footer
     for arg, foot in zip(self.arguments, reversed(self.foot)):
       self.writer.indent -= 1
       self.writer('}' + foot)
-    
+
 class HeaderGuard:
   def __init__(self, writer, name):
     self.writer = writer
     self.name = name
-    
+
   def __enter__(self):
     self.writer('#ifndef ' + self.name)
     self.writer('#define ' + self.name)
 
   def __exit__(self, type, value, traceback):
     self.writer('#endif')
-    
+
 class PPIfBlock:
   def __init__(self, writer, name, typ):
     self.writer = writer
     self.name = name
     self.typ = typ
-    
+
   def __enter__(self):
     self.writer('#{} {}'.format(self.typ, self.name))
 
@@ -109,33 +109,33 @@ class Cpp:
   def __init__(self, streamOrFileName = sys.stdout):
     self.fileHandle = streamOrFileName
     self.indent = 0
-    
+
   def __enter__(self):
     self.out = open(self.fileHandle, 'w+') if isinstance(self.fileHandle, str) else self.fileHandle
     return self
-    
+
   def __exit__(self, type, value, traceback):
     if self.out is not sys.stdout:
       self.out.close()
     self.out = None
-    
+
   def __call__(self, code):
     indentSpace = self.indent * '  '
     for line in code.splitlines():
       self.out.write(indentSpace + line + '\n')
-  
+
   def emptyline(self):
     self.out.write('\n')
-      
+
   def If(self, argument):
     return Block(self, 'if ({})'.format(argument))
-      
+
   def For(self, argument):
     return Block(self, 'for ({})'.format(argument))
 
   def ForRange(self, variable, range):
     return self.For(f'int {variable} = {range.start}; {variable} < {range.end}; ++{variable}')
-    
+
   def Namespace(self, name):
     if len(name) == 0:
       return NoScope()
@@ -149,47 +149,47 @@ def Namespace(self, name):
 
   def AnonymousScope(self):
     return Block(self, '')
-    
+
   def Function(self, name, arguments = '', returnType = 'void', const = False):
     if returnType:
       returnType += ' '
     return Block(self, '{}{}({}){}'.format(returnType, name, arguments, ' const' if const else ''))
-    
+
   def functionDeclaration(self, name, arguments = '', returnType = 'void'):
     return self.__call__('{} {}({});'.format(returnType, name, arguments))
 
   def Class(self, name):
     return Block(self, 'class ' + name, foot=';')
-  
+
   def classDeclaration(self, name):
     return self.__call__('class {};'.format(name))
-  
+
   def forwardStruct(self, name):
     self.__call__('struct {};'.format(name))
 
   def Struct(self, name):
     return Block(self, 'struct ' + name, foot=';')
-    
+
   def HeaderGuard(self, name):
     return HeaderGuard(self, name)
-    
+
   def PPIfndef(self, name):
     return PPIfBlock(self, name, 'ifndef')
-    
+
   def PPIf(self, name):
     return PPIfBlock(self, name, 'if')
-    
+
   def label(self, name):
     self.indent -= 1
     self.__call__(name + ':')
     self.indent += 1
-    
+
   def includeSys(self, header):
     self.__call__('#include <{}>'.format(header))
 
   def include(self, header):
     self.__call__('#include "{}"'.format(header))
-    
+
   def includes(self, header_list):
     for header in header_list:
       self.include(header)
diff --git a/yateto/codegen/common.py b/yateto/codegen/common.py
index 8f36725..626bbb8 100644
--- a/yateto/codegen/common.py
+++ b/yateto/codegen/common.py
@@ -27,7 +27,7 @@ def __init__(self, name, memoryLayout, eqspp, is_compute_constant=False, is_temp
     self.values = values
     self.datatype = datatype
     self.addressing = addressing
-  
+
   @classmethod
   def fromNode(cls, name, node):
     return cls(name, node.memoryLayout(), node.eqspp())
@@ -83,7 +83,7 @@ def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', indexNo
       flops = forLoops(cpp, indexNames, ranges, body, pragmaSimd, prefix, indexNo-1)
     flops = flops * rng.size()
   return flops
-  
+
 def loopRanges(term: IndexedTensorDescription, loopIndices):
   overlap = set(loopIndices) & set(term.indices)
   bbox = BoundingBox.fromSpp(term.eqspp)
@@ -92,7 +92,7 @@ def loopRanges(term: IndexedTensorDescription, loopIndices):
 def testLoopRangesEqual(A, B):
   overlap = A.keys() & B.keys()
   return all([A[index] == B[index] for index in overlap])
-  
+
 def testLoopRangesAContainedInB(A, B):
   overlap = A.keys() & B.keys()
   return all([A[index] in B[index] for index in overlap])
@@ -188,7 +188,7 @@ def __init__(self, kernel: Function, arguments: list[TinytcKernelArgument | Tiny
       hasher = hashlib.sha512()
       hasher.update(self.source.encode('utf-8'))
       self.name = f'tinytc_wrapper_{hasher.hexdigest()}'
-    
+
     self.wrapper_args = [f'long {BatchedOperationsAux.NUM_ELEMENTS_NAME}', f'void* {BatchedOperationsAux.STREAM_PTR_NAME}']
     self.wrapper_call_args = []
     self.call_args = []
@@ -207,7 +207,7 @@ def __init__(self, kernel: Function, arguments: list[TinytcKernelArgument | Tiny
           if not arg.constant:
             self.wrapper_call_args.append(BatchedOperationsAux.NUM_ELEMENTS_NAME)
           if not arg.temporary and not arg.constant:
-            offset_name = f'{BatchedOperationsAux.EXTRA_OFFSET_NAME}_{arg.name}' 
+            offset_name = f'{BatchedOperationsAux.EXTRA_OFFSET_NAME}_{arg.name}'
             self.wrapper_args.append(f'long {offset_name}')
             self.wrapper_call_args.append(offset_name)
             self.call_args.append(f'{BatchedOperationsAux.EXTRA_OFFSET_NAME}_{arg.call_expr}')
diff --git a/yateto/codegen/copyscaleadd/factory.py b/yateto/codegen/copyscaleadd/factory.py
index 5e30735..8c936f6 100644
--- a/yateto/codegen/copyscaleadd/factory.py
+++ b/yateto/codegen/copyscaleadd/factory.py
@@ -21,7 +21,7 @@ def __init__(self, alpha, beta, result: IndexedTensorDescription, term: IndexedT
     self.beta = beta
     self.result = result
     self.term = term
-    
+
     assert self.alpha != 0.0, 'copyscaleadd does not support alpha=0.0 at the moment.'
     assert self.beta == 1.0 or self.beta == 0.0, 'copyscaleadd supports only beta=0.0 or beta=1.0 at the moment.'
 
@@ -37,7 +37,7 @@ def __init__(self, alpha, beta, result: IndexedTensorDescription, term: IndexedT
     for idx in rB:
       if idx not in rA:
         rAB[idx] = rB[idx]
-    
+
     self.loopRanges = rAB
 
 
diff --git a/yateto/codegen/copyscaleadd/generic.py b/yateto/codegen/copyscaleadd/generic.py
index 44bf704..078199d 100644
--- a/yateto/codegen/copyscaleadd/generic.py
+++ b/yateto/codegen/copyscaleadd/generic.py
@@ -4,7 +4,7 @@ class Generic(object):
   def __init__(self, arch, descr):
     self._arch = arch
     self._descr = descr
-  
+
   def _formatTerm(self, alpha, term):
     prefix = ''
     if alpha == 0.0:
diff --git a/yateto/codegen/factory.py b/yateto/codegen/factory.py
index 5999542..bff1654 100644
--- a/yateto/codegen/factory.py
+++ b/yateto/codegen/factory.py
@@ -13,12 +13,12 @@ def __init__(self, cpp, arch, target):
     self._arch = arch
     self._freeList = list()
     self._target = target
-    
+
   def create(self, node, *args):
     method = 'create_' + node.__class__.__name__
     factory = getattr(self, method, self.generic_create)
     return factory(node, *args)
-  
+
   def generic_create(self, node, *args):
     raise NotImplementedError
 
@@ -59,7 +59,7 @@ def temporary(self, bufname, size, iniZero=False, memory=list()):
 
   def allocateTemporary(self):
     return True
-  
+
   def post_generate(self, routine_cache):
     pass
 
@@ -118,7 +118,7 @@ def create_FusedGEMMs(self, node, result, arguments, add, scalar, prefetchName,
     description = fused_gemms.Description(node, result, arguments, add, scalar)
     generator = fused_gemms.generator(self._arch, description, gemm_cfg, self._target)
     return generator.generate(self._cpp, routineCache, gemm_cfg)
-  
+
   def create_IndexSum(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg):
     assert len(arguments) == 1
     description = indexsum.Description(
@@ -129,7 +129,7 @@ def create_IndexSum(self, node, result, arguments, add, scalar, prefetchName, ro
     )
     generator = indexsum.generator(self._arch, description, self._target)
     return generator.generate(self._cpp, routineCache)
-  
+
   def create_Product(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg):
     assert len(arguments) == 2
     description = product.Description(
@@ -146,17 +146,17 @@ def create_Permute(self, node, result, arguments, add, scalar, prefetchName, rou
     result = IndexedTensorDescription.fromNode(result, node)
     term = IndexedTensorDescription.fromNode(arguments[0], node.term())
     return self._csa(result, term, add, scalar, routineCache, gemm_cfg)
-  
+
   def create_Broadcast(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg):
     result = IndexedTensorDescription.fromNode(result, node)
     term = IndexedTensorDescription.fromNode(arguments[0], node.term())
     return self._csa(result, term, add, scalar, routineCache, gemm_cfg)
-  
+
   def simple(self, result, term, add, scalar, routineCache, gemm_cfg):
     result = IndexedTensorDescription.fromVar(result, self._indices(result))
     term = IndexedTensorDescription.fromVar(term, self._indices(term))
     return self._csa(result, term, add, scalar, routineCache, gemm_cfg)
-  
+
   def _csa(self, result, term, add, scalar, routineCache, gemm_cfg):
     description = copyscaleadd.Description(
       alpha = scalar,
@@ -177,30 +177,30 @@ def __init__(self, cpp, arch, nameFun, testFramework):
   def _formatTerm(self, var, indices):
     address = var.memoryLayout().addressString(indices)
     return '{}[{}]'.format(self._name(var), address)
-  
+
   def create_Einsum(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg):
     g = node.indices
     for child in node:
       g = g.merged(child.indices - g)
-    
+
     ranges = {idx: Range(0, g.indexSize(idx)) for idx in g}
-    
+
     resultTerm = self._formatTerm(result, node.indices)
     terms = [self._formatTerm(arguments[i], child.indices) for i,child in enumerate(node)]
-    
+
     if scalar and scalar != 1.0:
       terms.insert(0, str(scalar))
-    
+
     if not add:
       self._cpp.memset(self._name(result), result.memoryLayout().requiredReals(), self._arch.typename)
-    
+
     class EinsumBody(object):
       def __call__(s):
         self._cpp( '{} += {};'.format(resultTerm, ' * '.join(terms)) )
         return len(terms)
 
     return forLoops(self._cpp, g, ranges, EinsumBody(), pragmaSimd=False)
-  
+
   def create_ScalarMultiplication(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg):
     return self.simple(result, arguments[0], add, scalar, routineCache)
 
@@ -209,7 +209,7 @@ def create_Permute(self, node, result, arguments, add, scalar, prefetchName, rou
     resultTerm = self._formatTerm(result, node.indices)
     termTerm = self._formatTerm(arguments[0], node.term().indices)
     return self._simpleBody(resultTerm, termTerm, add, scalar, node.indices)
-  
+
   def create_Broadcast(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg):
     assert node.term().indices <= node.indices
     resultTerm = self._formatTerm(result, node.indices)
@@ -282,10 +282,10 @@ class ExportGenerator:
 
   def __init__(self, arch):
     self.arch = arch
-  
+
   def generate(self, cpp, cache):
     pass
-  
+
   def add_linear_operation(self, dest, ops, target, permute, add):
     pass
 
@@ -297,25 +297,25 @@ def makeFactory(cls, generator):
   def __init__(self, generator, cpp, arch, target):
     super().__init__(cpp, arch, target)
     self.generator = generator
-  
+
   def post_generate(self, routine_cache):
     self.generator.generate(self._cpp, routine_cache)
 
   def allocateTemporary(self):
     return False
-  
+
   def create_LoopOverGEMM(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg):
     assert len(arguments) == 2
     makeNode = IndexedTensorDescription.fromNode
     argnodes = [makeNode(arguments[0], node.leftTerm()), makeNode(arguments[1], node.rightTerm())]
     return self.handleLinear(makeNode(result, node), argnodes, add, scalar, node.transA(), node.transB())
-  
+
   def create_IndexSum(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg):
     assert len(arguments) == 1
     makeNode = IndexedTensorDescription.fromNode
     argnodes = [makeNode(arguments[0], node.term())]
     return self.handleLinear(makeNode(result, node), argnodes, add, scalar, False, False)
-  
+
   def create_Product(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg):
     assert len(arguments) == 2
     makeNode = IndexedTensorDescription.fromNode
@@ -329,7 +329,7 @@ def create_Permute(self, node, result, arguments, add, scalar, prefetchName, rou
   def create_Broadcast(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg):
     term = arguments[0]
     return self.handleLinear(IndexedTensorDescription.fromVar(result, node.indices), [IndexedTensorDescription.fromVar(term, node.term().indices)], add, scalar, False, False)
-  
+
   def simple(self, result, term, add, scalar, routineCache, gemm_cfg):
     return self.handleLinear(IndexedTensorDescription.fromVar(result, self._indices(result)), [IndexedTensorDescription.fromVar(term, self._indices(term))], add, scalar, False, False)
 
@@ -357,10 +357,10 @@ def handleLinear(self, dest, ops, add, scalar, transposeA, transposeB):
     # convert indices to loop numbers
 
     target, permute = self.getIndices(dest, ops)
-    
+
     if not (scalar == 1 or scalar == 1.0):
       ops += [scalar]
       target += [[]]
       permute += [[]]
-    
+
     return self.generator.add_linear_operation(dest, ops, target, permute, add)
diff --git a/yateto/codegen/gemm/factory.py b/yateto/codegen/gemm/factory.py
index 98c177c..0f38428 100644
--- a/yateto/codegen/gemm/factory.py
+++ b/yateto/codegen/gemm/factory.py
@@ -26,17 +26,17 @@ def __init__(self,
     self.alpha = alpha
     self.beta = beta
     self.prefetchName = prefetchName
-    
+
     self.isACsc = self.leftTerm.memoryLayout.isCSC()
     self.isBCsc = self.rightTerm.memoryLayout.isCSC()
-    
+
     bbA = BoundingBox.fromSpp(self.leftTerm.eqspp)
     bbB = BoundingBox.fromSpp(self.rightTerm.eqspp)
     bbC = BoundingBox.fromSpp(self.result.eqspp)
-    
+
     kA = 1 if not transA else 0
     kB = 0 if not transB else 1
-    
+
     k = bbA[kA] & bbB[kB]
     m = bbA[1-kA]
     n = bbB[1-kB]
@@ -46,19 +46,19 @@ def __init__(self,
 
     self.alignedA = alignedStartA and not transA and self.leftTerm.memoryLayout.alignedStride()
     self.alignedC = alignedStartC and self.result.memoryLayout.alignedStride()
-    
+
     if self.alignedA and self.alignedC:
       m = m.aligned(arch)
     else:
       mStartAligned = arch.checkAlignment(m.start)
       self.alignedA = self.alignedA & mStartAligned
       self.alignedC = self.alignedC & mStartAligned
-    
+
     self._mnk = (m, n, k)
 
   def mnk(self):
     return self._mnk
-  
+
   def setBeta(self, beta):
     self.beta = beta
 
diff --git a/yateto/codegen/gemm/gemmgen.py b/yateto/codegen/gemm/gemmgen.py
index 35b0b95..5510d18 100644
--- a/yateto/codegen/gemm/gemmgen.py
+++ b/yateto/codegen/gemm/gemmgen.py
@@ -86,14 +86,14 @@ def _pointer(self, term, offset2, transpose):
     if o > 0:
       return '{} + {}'.format(term.name, o)
     return term.name
-    
+
   def generate(self, cpp, routineCache):
     d = self._descr
     m, n, k = d.mnk()
     ldA = 0 if d.isACsc else d.leftTerm.memoryLayout.stridei(1)
     ldB = 0 if d.isBCsc else d.rightTerm.memoryLayout.stridei(1)
     ldC = d.result.memoryLayout.stridei(1)
-    
+
     assert (d.transA and (k,m) in d.leftTerm.memoryLayout) or (not d.transA and (m,k) in d.leftTerm.memoryLayout)
     assert (d.transB and (n,k) in d.rightTerm.memoryLayout) or (not d.transB and (k,n) in d.rightTerm.memoryLayout)
     assert (m,n) in d.result.memoryLayout
@@ -109,7 +109,7 @@ def generate(self, cpp, routineCache):
     if d.isBCsc:
       sppB = d.rightTerm.memoryLayout.entries(k, n)
       sppBRows = d.rightTerm.memoryLayout.shape()[0]
-    
+
     if d.isACsc and d.isBCsc:
       # count the flops by splitting into outer products (i.e. partition by k)
       # for each outer product, we need to compute all-by-all nonzero entries for m and n
@@ -124,7 +124,7 @@ def generate(self, cpp, routineCache):
       flops = 2 * m.size() * len(sppB)
     else:
       flops = 2 * m.size() * n.size() * k.size()
-    
+
     if isinstance(self._gemm_cfg, BLASlike):
       ptr_a = self._pointer(term=d.leftTerm, offset2=(m.start, k.start), transpose=d.transA)
       ptr_b = self._pointer(term=d.rightTerm, offset2=(k.start, n.start), transpose=d.transB)
@@ -293,13 +293,13 @@ def __init__(self, arch, gemmDescr, sppA, sppARows, sppB, sppBRows, gemm_cfg):
     self._mode = gemm_cfg.operation_name
     self._cmd = gemm_cfg.cmd
     self._blockSize = gemm_cfg.blockSize(gemmDescr['M'], gemmDescr['N'], gemmDescr['K']) if hasattr(gemm_cfg, 'blockSize') else dict()
-  
+
   def __eq__(self, other):
     return self._arch == other._arch and \
            self._gemmDescr == other._gemmDescr and \
            self._sppA == other._sppA and \
            self._sppB == other._sppB
-  
+
   def header(self, cpp):
     with cpp.PPIfndef('NDEBUG'):
       cpp('extern long long libxsmm_num_total_flops;')
@@ -319,7 +319,7 @@ def _callGenerator(self, argList):
 Given command: {' '.join(strcmd)}
 Stdout: {result.stdout}
 Stderr: {result.stderr}""")
-  
+
   def __call__(self, routineName, fileName):
     cpu_arch = self._arch.host_name
 
@@ -400,7 +400,7 @@ def __init__(self, shape, spp):
         self._shape = shape
         self._spp = spp
         self._temp = None
-      
+
       def __enter__(self):
         if self._spp is not None:
           self._temp = tempfile.NamedTemporaryFile()
@@ -413,11 +413,11 @@ def __enter__(self):
           self._temp.flush()
           return self._temp.name
         return None
-      
+
       def __exit__(self, exc_type, exc_val, exc_tb):
         if self._spp is not None:
           self._temp.__exit__(exc_type, exc_val, exc_tb)
-    
+
     with SparsityWrapper((self._gemmDescr['M'], self._gemmDescr['K']), self._sppA) as afile:
       with SparsityWrapper((self._sppBRows if self._mode=='pspamm' else self._gemmDescr['K'], self._gemmDescr['N']), self._sppB) as bfile:
         if self._mode == 'libxsmm':
@@ -533,7 +533,7 @@ def _kernel(self, routine_name):
   {alpha}, // alpha
   {beta}, // beta
   {prefetch_flag} // prefetch
-); 
+);
 """.format(kernel_var_name=kernel_var_name,
            prec=self._arch.typename, M=M, N=N, K=K,
            ldA=ldA, ldB=ldB, ldC=ldC,
diff --git a/yateto/codegen/gemm/generic.py b/yateto/codegen/gemm/generic.py
index 7d9d92d..c161976 100644
--- a/yateto/codegen/gemm/generic.py
+++ b/yateto/codegen/gemm/generic.py
@@ -103,7 +103,7 @@ def _generateSparseSparse(self, cpp):
   def _generateSparseDense(self, cpp):
     d = self._descr
     m, n, k = d.mnk()
-    
+
     assert d.isACsc != d.isBCsc
 
     Aaccess = self._accessFun(d.leftTerm, (m.start, k.start), d.isACsc, d.transA)
@@ -153,11 +153,11 @@ def _generateSparseDense(self, cpp):
   def _generateDenseDense(self, cpp):
     d = self._descr
     m, n, k = d.mnk()
-    
+
     Aaccess = self._accessFun(d.leftTerm, (m.start, k.start), False, d.transA)
     Baccess = self._accessFun(d.rightTerm, (k.start, n.start), False, d.transB)
     Caccess = self._accessFun(d.result, (m.start, n.start), False, False)
-    
+
     with cpp.For('int n = 0; n < {0}; ++n'.format(n.size())):
       if d.beta != 1.0:
         with cpp.For('int m = 0; m < {0}; ++m'.format(m.size())):
diff --git a/yateto/codegen/indexsum/factory.py b/yateto/codegen/indexsum/factory.py
index 48e3423..a3422a3 100644
--- a/yateto/codegen/indexsum/factory.py
+++ b/yateto/codegen/indexsum/factory.py
@@ -7,21 +7,21 @@ def __init__(self, alpha, add: bool, result: IndexedTensorDescription, term: Ind
     self.add = add
     self.result = result
     self.term = term
-    
+
     rA = loopRanges(self.term, self.result.indices)
     rB = loopRanges(self.result, self.result.indices)
     assert testLoopRangesAContainedInB(rA, rB)
-    
+
     self.loopRanges = rA
-    
+
     self.sumIndex = self.term.indices - self.result.indices
     assert len(self.sumIndex) == 1
 
     self.sumLoopRange = loopRanges(self.term, self.sumIndex)[str(self.sumIndex)]
-    
+
 
 def generator(arch, descr, target):
   if target == 'cpu':
     return Generic(arch, descr)
   elif target == 'gpu':
-    raise RuntimeError("IndexSum operation has not been implemented for GPU-like architectures")
\ No newline at end of file
+    raise RuntimeError("IndexSum operation has not been implemented for GPU-like architectures")
diff --git a/yateto/codegen/indexsum/generic.py b/yateto/codegen/indexsum/generic.py
index 36df9f5..6f072ab 100644
--- a/yateto/codegen/indexsum/generic.py
+++ b/yateto/codegen/indexsum/generic.py
@@ -7,11 +7,11 @@ def __init__(self, arch, descr):
 
   def generate(self, cpp, routineCache):
     d = self._descr
-        
+
     if not d.add:
       writeBB = boundingBoxFromLoopRanges(d.result.indices, d.loopRanges)
       initializeWithZero(cpp, self._arch, d.result, writeBB)
-    
+
     sumIndex = d.term.indices - d.result.indices
     assert len(sumIndex) == 1
     class IndexSumBody(object):
@@ -23,7 +23,7 @@ def __call__(s):
           cpp( 'sum += {}[{}];'.format(d.term.name, d.term.memoryLayout.addressString(d.term.indices)) )
         mult = '{} * '.format(d.alpha) if d.alpha != 1.0 else ''
         cpp( '{} = {}sum;'.format(target, mult) )
-        
+
         flop = 1 if d.alpha != 1.0 else 0
         return d.sumLoopRange.size() + flop
 
diff --git a/yateto/codegen/log/factory.py b/yateto/codegen/log/factory.py
index 0233228..3571114 100644
--- a/yateto/codegen/log/factory.py
+++ b/yateto/codegen/log/factory.py
@@ -22,22 +22,22 @@ def __init__(self,
     self.transA = transA
     self.transB = transB
     self.prefetchName = prefetchName
-    
+
     rA = loopRanges(self.leftTerm, self.loopIndices)
     rB = loopRanges(self.rightTerm, self.loopIndices)
     rC = loopRanges(self.result, self.loopIndices)
     assert testLoopRangesEqual(rA, rB)
     assert testLoopRangesAContainedInB(rA, rC)
     assert testLoopRangesAContainedInB(rB, rC)
-    
+
     rC.update(rA)
     rC.update(rB)
 
     self.loopRanges = rC
-    
+
     self.innerLoopIndices = self.loopIndices - self.result.indices
     self.outerLoopIndices = self.loopIndices - self.innerLoopIndices
-    
+
     self.assignLoopRanges = copy.deepcopy(self.loopRanges)
     self.addLoopRanges = copy.deepcopy(self.loopRanges)
 
@@ -52,8 +52,7 @@ def __init__(self,
       self.addLoopRanges[peelOffIndex].start   = self.loopRanges[peelOffIndex].start+1
     else:
       self.assignLoopRanges = None
-      
+
 
 def generator(arch, descr, target):
   return Generic(arch, descr, target)
-
diff --git a/yateto/codegen/log/generic.py b/yateto/codegen/log/generic.py
index 357f363..a239e8c 100644
--- a/yateto/codegen/log/generic.py
+++ b/yateto/codegen/log/generic.py
@@ -8,7 +8,7 @@ def __init__(self, arch, descr, target):
     self._arch = arch
     self._descr = descr
     self._target = target
-  
+
   def _pointer(self, cpp, targetName, baseName, term, loopIndices, const=True):
     indices = term.indices & loopIndices
     addressStr = term.memoryLayout.addressString(term.indices, indices) if len(indices) > 0 else ''
@@ -18,7 +18,7 @@ def _pointer(self, cpp, targetName, baseName, term, loopIndices, const=True):
 
   def _alignedStart(self, term, loopIndices):
     return term.memoryLayout.isAlignedAddressString(term.indices, term.indices & loopIndices)
-    
+
   def _memLayout(self, term, I, J):
     if len(I) == 0 and len(J) == 0:
       return DenseMemoryLayout((1,1))
@@ -34,7 +34,7 @@ def _memLayout(self, term, I, J):
 
   def _reduce(self, term, subset, memLayout):
     return reduceSpp(term.eqspp, term.indices, subset).reshape(memLayout.shape())
-  
+
   def _defuse(self, fusedRange, term, I):
     if len(I) == 1:
       return  {next(iter(I)): fusedRange}
@@ -42,14 +42,14 @@ def _defuse(self, fusedRange, term, I):
 
   def generate(self, cpp, routineCache, gemm_cfg):
     d = self._descr
-    
+
     A = d.leftTerm.indices - d.loopIndices
     B = d.rightTerm.indices - d.loopIndices
     C = d.result.indices - d.loopIndices
     Im = set(A) & set(C)
     In = set(B) & set(C)
     Ik = set(A) & set(B)
-    
+
     hasOuterLoops = len(d.outerLoopIndices) > 0
 
     if hasOuterLoops and self._target == 'gpu':
@@ -60,13 +60,13 @@ def generate(self, cpp, routineCache, gemm_cfg):
     outerBname = '_B' if hasOuterLoops else d.rightTerm.name
     outerCname = '_C' if hasOuterLoops else d.result.name
     outerPrefetchName = '_Cprefetch' if hasOuterLoops and d.prefetchName is not None else d.prefetchName
-    
+
     hasInnerLoops = len(d.innerLoopIndices) > 0
     innerAname = '_Ain' if hasInnerLoops else outerAname
     innerBname = '_Bin' if hasInnerLoops else outerBname
     innerCname = '_Cin' if hasInnerLoops else outerCname
     innerPrefetchName = '_Cprefetchin' if hasInnerLoops and outerPrefetchName is not None else outerPrefetchName
-    
+
     AmemLayout = self._memLayout(d.leftTerm, Im, Ik)
     BmemLayout = self._memLayout(d.rightTerm, Ik, In)
     CmemLayout = self._memLayout(d.result, Im, In)
@@ -88,7 +88,7 @@ def generate(self, cpp, routineCache, gemm_cfg):
       alignedStartC = self._alignedStart(d.result, d.outerLoopIndices) and self._alignedStart(d.result, d.innerLoopIndices),
       prefetchName = innerPrefetchName
     )
-    
+
     if not d.add:
       lr = dict()
       m, n, k = gemmDescr.mnk()
@@ -97,7 +97,7 @@ def generate(self, cpp, routineCache, gemm_cfg):
       lr.update( self._defuse(n, d.rightTerm, In) )
       writeBB = boundingBoxFromLoopRanges(d.result.indices, lr)
       initializeWithZero(cpp, self._arch, d.result, writeBB)
-    
+
     class LoGBody(object):
       def __call__(s):
         if hasInnerLoops:
@@ -127,4 +127,3 @@ def __call__(s):
         return flops
 
     return forLoops(cpp, d.outerLoopIndices, d.loopRanges, InnerLoopBody(), pragmaSimd=False)
-
diff --git a/yateto/codegen/product/factory.py b/yateto/codegen/product/factory.py
index 16c027f..a2e3059 100644
--- a/yateto/codegen/product/factory.py
+++ b/yateto/codegen/product/factory.py
@@ -11,21 +11,20 @@ def __init__(self, alpha, add: bool, result: IndexedTensorDescription, leftTerm:
 
     self.isACsc = self.leftTerm.memoryLayout.isCSC()
     self.isBCsc = self.rightTerm.memoryLayout.isCSC()
-    
+
     rA = loopRanges(self.leftTerm, self.result.indices)
     rB = loopRanges(self.rightTerm, self.result.indices)
     rC = loopRanges(self.result, self.result.indices)
     assert testLoopRangesEqual(rA, rB)
     assert testLoopRangesAContainedInB(rA, rC)
     assert testLoopRangesAContainedInB(rB, rC)
-    
+
     rA.update(rB)
 
-    self.loopRanges = rA    
+    self.loopRanges = rA
 
 def generator(arch, descr, target):
   if target == 'cpu':
     return Generic(arch, descr)
   elif target == 'gpu':
     raise RuntimeError("Product operation has not been implemented for GPU-like architectures")
-
diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py
index a7bc4a4..2e1c6d9 100644
--- a/yateto/codegen/visitor.py
+++ b/yateto/codegen/visitor.py
@@ -109,7 +109,7 @@ class OptimizedKernelGenerator(KernelGenerator):
   TEMP_MEM_REQUIRED_NAME = 'TmpMemRequiredInBytes'
   TEMP_MAX_MEM_REQUIRED_NAME = 'TmpMaxMemRequiredInBytes'
 
-  
+
   def __init__(self, arch, routineCache, routine_exporters):
     super().__init__(arch)
     self._routineCache = routineCache
@@ -122,7 +122,7 @@ def __init__(self, arch, routineCache, routine_exporters):
 
     for entry in routine_exporters:
       self._routine_factories[entry] = ExportFactory.makeFactory(routine_exporters[entry])
-  
+
   class KernelOutline(object):
     def __init__(self,
                  nonZeroFlops,
@@ -158,7 +158,7 @@ def _addTensor(cls, tensor, tensors):
         tensors[base_name] = tensors[base_name] | {group}
       else:
         tensors[base_name] = {group}
-  
+
   def generateKernelOutline(self, nonZeroFlops, cfg, gemm_cfg, target):
     scalarsP = ScalarsSet().visit(cfg)
     variables = SortedGlobalsList().visit(cfg)
@@ -194,7 +194,7 @@ def generateKernelOutline(self, nonZeroFlops, cfg, gemm_cfg, target):
       factory.freeTmp()
       factory.reset_stream()
       factory.reset_flags()
-      function = functionIO.getvalue()    
+      function = functionIO.getvalue()
     return self.KernelOutline(nonZeroFlops,
                               hwFlops,
                               tensors,
@@ -213,7 +213,7 @@ def _addFromKO(cls, koEntries, entries):
         entries[key] = value
       else:
         entries[key] = entries[key] | value
-    
+
 
   def generate(self, cpp, header, name, kernelOutlines, familyStride=None):
     tensors = collections.OrderedDict()
@@ -295,7 +295,7 @@ def kernelArgs(base_name_with_namespace, groups, writable, is_constant, target):
             header(f'{class_name}::{container_type} {base_name};')
           else:
             header(f'{typ}{ptr_type} {base_name}{{}};')
-        
+
         def scalarArgs(base_name_with_namespace, groups):
           prefix, base_name = Tensor.splitBasename(base_name_with_namespace)
           typ = self._arch.typename
@@ -419,7 +419,7 @@ class UnitTestGenerator(KernelGenerator):
   STREAM = '_stream'
   TMP_MEM = '_tmpMem'
   TMP_SIZE = 128 * 8
-  
+
   def __init__(self, arch):
     super().__init__(arch)
 
@@ -475,7 +475,7 @@ def _name(cls, var):
 
   def _viewName(self, var):
     return '_view_' + self._name(var)
-  
+
   def _groupStr(self, var):
     group = var.group()
     return ','.join([str(g) for g in group])
@@ -487,7 +487,7 @@ def _groupTemplate(self, var):
   def _groupIndex(self, var):
     gstr = self._groupStr(var)
     return '({})'.format(gstr) if gstr else ''
-  
+
   def generate(self, cpp, namespace, testName, kernelClass, cfg, target, gemm_cfg, testFramework, index=None):
     if target == 'gpu':
       if self._arch.backend in ['oneapi', 'acpp', 'hipsycl']:
@@ -534,7 +534,7 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, target, gemm_cfg,
       for var in variables:
         factory.tensor(var.tensor, self._tensorName(var))
         factory.temporary(self._name(var), var.memoryLayout().requiredReals(), iniZero=True)
-        
+
         shape = var.memoryLayout().shape()
         cpp('{supportNS}::DenseTensorView<{dim},{arch.typename},{arch.uintTypename}> {viewName}({utName}, {{{shape}}}, {{{start}}}, {{{stop}}});'.format(
             supportNS = SUPPORT_LIBRARY_NAMESPACE,
@@ -632,33 +632,33 @@ class InitializerGenerator(object):
   VIEW_FUN_NAME = 'create'
   VIEW_TYPE_NAME = 'type'
   VIEW_TYPE_NAME_CONST = 'type_const'
-  
+
   class TensorView(object):
     ARGUMENT_NAME = 'values'
 
     def typename(self, dim, arch, const):
       constStr = 'true' if const else 'false'
       return '::{}::{}<{},{},{},{}>'.format(SUPPORT_LIBRARY_NAMESPACE, type(self).__name__, dim, arch.typename, arch.uintTypename, constStr)
-    
+
     @classmethod
     def arguments(cls, arch, const):
       if const:
         return '{} const* {}'.format(arch.typename, cls.ARGUMENT_NAME)
       else:
         return '{} * {}'.format(arch.typename, cls.ARGUMENT_NAME)
-    
+
     def generate(cpp, group, memLayout, arch, index, const):
       raise NotImplementedError
-    
+
     def listToInitializerList(self, lst):
       return '{{{}}}'.format(', '.join([str(l) for l in lst]))
-    
+
     def formatArray(self, numberType, name, values, declarationOnly):
       lhs = '{} {}[]'.format(numberType, name)
       if declarationOnly:
         return ''
       return '{} {} = {};'.format(MODIFIERS, lhs, self.listToInitializerList(values))
-  
+
   class DenseTensorView(TensorView):
     START_NAME = 'Start'
     STOP_NAME = 'Stop'
@@ -680,7 +680,7 @@ def arrays(self, cpp, memLayout, arch, namespace, index, numberType, declaration
   class CSCMatrixView(TensorView):
     ROWIND_NAME = 'RowInd'
     COLPTR_NAME = 'ColPtr'
-    
+
     def typename(self, dim, arch, const):
       constStr = 'true' if const else 'false'
       return '::{}::{}<{},{},{}>'.format(SUPPORT_LIBRARY_NAMESPACE, type(self).__name__, arch.typename, arch.uintTypename, constStr)
@@ -736,14 +736,14 @@ def __init__(self, arch, tensors, scalars):
     self._groupSize = {baseName: tuple(map(lambda x: x+1, mi)) for baseName, mi in maxIndex.items()}
     maxIndexScalar = {baseName: tuple(map(max, *groups.keys())) if len(groups) > 1 else next(iter(groups.keys())) for baseName, groups in self._scalarCollect.items()}
     self._groupSizeScalar = {baseName: tuple(map(lambda x: x+1, mi)) for baseName, mi in maxIndexScalar.items()}
-  
+
   def _tensorViewGenerator(self, memoryLayout):
     memLayoutMap = {
       'DenseMemoryLayout': self.DenseTensorView,
       'CSCMemoryLayout': self.CSCMatrixView
     }
     return memLayoutMap[type(memoryLayout).__name__]()
-  
+
   def iterate_collect(self):
     cur_namespace = ''
     cur_dict = collections.OrderedDict()
@@ -783,7 +783,7 @@ def iterate_collect_scalar(self):
   def generateTensorsH(self, header):
     for namespace, tensor_dict in self.iterate_collect():
       with header.Namespace(namespace), header.Namespace(self.TENSOR_NAMESPACE):
-        for (baseName, baseNameWithoutNamespace), tensors in tensor_dict.items():        
+        for (baseName, baseNameWithoutNamespace), tensors in tensor_dict.items():
           with header.Struct(baseNameWithoutNamespace):
             groupSize = self._groupSize[baseName]
             self._tensor(header, '', tensors, groupSize, False)
@@ -809,7 +809,7 @@ def generateTensorsH(self, header):
                   header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args)))
     for namespace, scalar_dict in self.iterate_collect_scalar():
       with header.Namespace(namespace), header.Namespace(self.TENSOR_NAMESPACE):
-        for (baseName, baseNameWithoutNamespace), scalars in scalar_dict.items():        
+        for (baseName, baseNameWithoutNamespace), scalars in scalar_dict.items():
           with header.Struct(baseNameWithoutNamespace):
             groupSize = self._groupSizeScalar[baseName]
             args = ndargs(len(groupSize))
@@ -827,13 +827,13 @@ def generateTensorsH(self, header):
                   header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args)))
                 with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True):
                   header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args)))
-  
+
   def generateTensorsCpp(self, cpp):
     for namespace, tensor_dict in self.iterate_collect():
       with cpp.Namespace(namespace):
         for (base_name, base_name_without_namespace), tensors in tensor_dict.items():
           self._tensor(cpp, '::'.join([self.TENSOR_NAMESPACE, base_name_without_namespace, '']), tensors, self._groupSize[base_name], True)
-  
+
   def generateInitH(self, header):
     for namespace, tensor_dict in self.iterate_collect():
       with header.Namespace(namespace), header.Namespace(self.INIT_NAMESPACE):
@@ -931,7 +931,7 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat
               tv.generate(cpp, ml, self._arch, index(group), False)
             with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgsConst, returnType='{} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME_CONST)):
               tv.generate(cpp, ml, self._arch, index(group), True)
-  
+
   def _array(self, cpp, typ, name, content, groupSize, declarationOnly=False, alwaysArray=True, constexpr=True, static=True):
     cexpr = CONSTEXPR + ' ' if constexpr else ''
     stat = STATIC + ' ' if static else ''
@@ -964,7 +964,5 @@ def _array(self, cpp, typ, name, content, groupSize, declarationOnly=False, alwa
       initStr = ', '.join(init)
       if isGroup:
         initStr = '{{{}}}'.format(initStr)
-      
-      cpp('{}{}{} {}{}{} = {};'.format(cexpr, stat, typ, name, groupIndices, arrayIndices, initStr))
-
 
+      cpp('{}{}{} {}{}{} = {};'.format(cexpr, stat, typ, name, groupIndices, arrayIndices, initStr))
diff --git a/yateto/controlflow/__init__.py b/yateto/controlflow/__init__.py
index 8b13789..e69de29 100644
--- a/yateto/controlflow/__init__.py
+++ b/yateto/controlflow/__init__.py
@@ -1 +0,0 @@
-
diff --git a/yateto/controlflow/fused_gemm_automata/__init__.py b/yateto/controlflow/fused_gemm_automata/__init__.py
index 3939969..6df79ed 100644
--- a/yateto/controlflow/fused_gemm_automata/__init__.py
+++ b/yateto/controlflow/fused_gemm_automata/__init__.py
@@ -1 +1 @@
-from .automata import Context
\ No newline at end of file
+from .automata import Context
diff --git a/yateto/controlflow/graph.py b/yateto/controlflow/graph.py
index e1b1420..a5af954 100644
--- a/yateto/controlflow/graph.py
+++ b/yateto/controlflow/graph.py
@@ -17,7 +17,7 @@ def variables(self):
 
   def maySubstitute(self, when, by):
     return self.substituted(when, by).memoryLayout().isCompatible(self.eqspp())
-  
+
   def substituted(self, when, by, memoryLayout=None):
     return by if self == when else self
 
@@ -38,13 +38,13 @@ def eqspp(self):
 
   def __hash__(self):
     return hash(self.name)
-  
+
   def __str__(self):
     return self.name
-  
+
   def __repr__(self):
     return str(self)
-  
+
   def __eq__(self, other):
     isEq = self.name == other.viewed().name # and self._memoryLayout == other._memoryLayout
     assert not isEq or (self.writable == other.viewed().writable and self._memoryLayout == other.viewed()._memoryLayout)
@@ -53,7 +53,7 @@ def __eq__(self, other):
   def setWritable(self, name):
     if self.name == name:
       self.writable = True
-  
+
   def viewed(self):
     return self
 
@@ -66,18 +66,18 @@ def __init__(self, variable, memoryLayout, eqspp):
   @property
   def tensor(self):
     return self.variable.tensor
-  
+
   @property
   def writable(self):
     return self.variable.writable
-  
+
   @property
   def is_temporary(self):
     return self.variable.is_temporary
-  
+
   def maySubstitute(self, when, by):
     return self.substituted(when, by).memoryLayout().isCompatible(self.eqspp())
-  
+
   def substituted(self, when, by, memoryLayout=None):
     return by if self == when else self
 
@@ -104,13 +104,13 @@ def eqspp(self):
 
   def __hash__(self):
     return hash(self.variable.name)
-  
+
   def __str__(self):
     return f'{self.variable.name}'
-  
+
   def __repr__(self):
     return str(self)
-  
+
   def __eq__(self, other):
     isEq = self.variable == other.viewed() and self._memoryLayout == other._memoryLayout
     return isEq
@@ -170,10 +170,10 @@ def isRHSExpression(self):
 
   def isRHSVariable(self):
     return not self.isRHSExpression()
-  
+
   def isCompound(self):
     return self.add
-  
+
   def hasTrivialScalar(self):
     return self.scalar is None or self.scalar == 1.0
 
diff --git a/yateto/controlflow/transformer.py b/yateto/controlflow/transformer.py
index 3500a7d..5d28db8 100644
--- a/yateto/controlflow/transformer.py
+++ b/yateto/controlflow/transformer.py
@@ -4,7 +4,7 @@
 from .fused_gemm_automata import Context as FusedGemmsContext
 
 
-class MergeScalarMultiplications(object):   
+class MergeScalarMultiplications(object):
   def visit(self, cfg):
     n = len(cfg)-1
     i = 1
@@ -34,7 +34,7 @@ def visit(self, cfg):
     for i in range(n):
       ua = cfg[i].action
       v = cfg[i+1]
-      
+
       if not ua.isCompound() \
           and ua.isRHSVariable() \
           and ua.term.writable \
@@ -49,7 +49,7 @@ def visit(self, cfg):
           for j in range(i, n):
             cfg[j].action = cfg[j].action.substituted(when, by)
           cfg = LivenessAnalysis().visit(cfg)
-  
+
     return cfg
 
 class SubstituteBackward(object):
diff --git a/yateto/controlflow/visitor.py b/yateto/controlflow/visitor.py
index 8841e06..1c111a2 100644
--- a/yateto/controlflow/visitor.py
+++ b/yateto/controlflow/visitor.py
@@ -6,19 +6,19 @@
 
 class AST2ControlFlow(Visitor):
   TEMPORARY_RESULT = '_tmp'
-  
+
   def __init__(self, simpleMemoryLayout=False):
     self._tmp = 0
     self._cfg = []
     self._writable = set()
     self._simpleMemoryLayout = simpleMemoryLayout
-  
+
   def cfg(self):
     return self._cfg + [ProgramPoint(None)]
 
   def _ml(self, node):
     return DenseMemoryLayout(node.shape()) if self._simpleMemoryLayout else node.memoryLayout()
-  
+
   def _addTransformOp(self, permute, variable):
     if not self._simpleMemoryLayout:
       permute.setEqspp( permute.computeSparsityPattern() )
@@ -44,7 +44,7 @@ def _addPermuteIfRequired(self, indices, term, variable):
         # permute needed, run before broadcast
         inode = Permute.subPermute(term, indices)
         intermediate = self._addTransformOp(inode, variable)
-      
+
       result = intermediate
       if len(term.indices) != len(indices):
         # broadcast needed, more output than input indices
@@ -54,13 +54,13 @@ def _addPermuteIfRequired(self, indices, term, variable):
 
   def generic_visit(self, node):
     variables = [self.visit(child) for child in node]
-    
+
     result = self._nextTemporary(node)
     action = ProgramAction(result, Expression(node, self._ml(node), variables), False)
     self._addAction(action)
-    
+
     return result
-  
+
   def visit_SliceView(self, node):
     var = self.visit(node.term())
     ml = node.getMemoryLayout(var.memoryLayout())
@@ -79,18 +79,18 @@ def visit_Add(self, node):
       action = ProgramAction(tmp, rhs, add)
       self._addAction(action)
       add = True
-    
+
     return tmp
-  
+
   def visit_ScalarMultiplication(self, node):
     variable = self.visit(node.term())
 
     result = self._nextTemporary(node)
     action = ProgramAction(result, variable, False, node.scalar())
     self._addAction(action)
-    
+
     return result
-  
+
   def visit_Assign(self, node):
     self.updateWritable(node[0].name())
     variables = [self.visit(child) for child in node]
@@ -98,9 +98,9 @@ def visit_Assign(self, node):
     rhs = self._addPermuteIfRequired(node.indices, node.rightTerm(), variables[1])
     action = ProgramAction(variables[0], rhs, False)
     self._addAction(action)
-    
+
     return variables[0]
-  
+
   def visit_IndexedTensor(self, node):
     return Variable(node.name(), node.name() in self._writable, self._ml(node), node.eqspp(), node.tensor, is_temporary=node.tensor.temporary)
 
diff --git a/yateto/gemm_configuration.py b/yateto/gemm_configuration.py
index 366a6c8..6eed1ca 100644
--- a/yateto/gemm_configuration.py
+++ b/yateto/gemm_configuration.py
@@ -13,7 +13,7 @@ class GemmTool(ABC):
   def __init__(self, operation_name: str, includes: List[str] = []):
     self.operation_name = operation_name
     self.includes = includes
-  
+
   def archSupported(self):
     return True
 
@@ -57,7 +57,7 @@ class MKL(BLASlike):
   def __init__(self, arch):
     self._arch = arch
     super().__init__('cblas_{}gemm'.format(arch.precision.lower()), ['mkl_cblas.h'])
-  
+
   def archSupported(self):
     return self._arch.host_name.lower() in {'snb', 'hsw', 'skx', 'knl'} or self._arch.host_name.lower().startswith('avx')
 
diff --git a/yateto/generator.py b/yateto/generator.py
index ce65cd9..4087395 100644
--- a/yateto/generator.py
+++ b/yateto/generator.py
@@ -63,7 +63,7 @@ def prepareUntilUnitTest(self):
       ast2cf.visit(ast)
     self.cfg = ast2cf.cfg()
     self.cfg = LivenessAnalysis().visit(self.cfg)
-  
+
   def prepareUntilCodeGen(self, cost_estimator, enableFusedGemm: bool):
     self.nonZeroFlops = 0
     for a in self.ast:
@@ -117,34 +117,34 @@ def __init__(self, namespace=None):
       self.namespace = namespace
     else:
       self.namespace = ''
-  
+
   def items(self):
     return self._kernels.items()
-  
+
   def __len__(self):
     return max(self._kernels.keys()) + 1
-  
-  @classmethod  
+
+  @classmethod
   def baseName(self, name):
     return re.match(Kernel.BASE_NAME, name).group(0)
-  
+
   @classmethod
   def isValidName(cls, name):
     return re.match(cls.VALID_NAME, name) is not None
-  
+
   @classmethod
   def group(cls, name):
     m = re.search(cls.GROUP_INDEX, name)
     return int(m.group(1))
-  
+
   def setStride(self, stride):
     self._stride = stride
-  
+
   def stride(self):
     if self._stride is not None:
       return self._stride
     return (1,)
-    
+
   @classmethod
   def linear(cls, stride, group):
     assert len(stride) == len(group)
@@ -158,7 +158,7 @@ def add(self, name, ast, prefetch=None, namespace=None, target='cpu'):
     if not self.name:
       self.name = baseName
     assert baseName == self.name
-    
+
     group = self.group(name)
     internalName = '_{}_{}'.format(baseName, group)
     self._kernels[group] = Kernel(internalName, ast, prefetch, namespace, target)
@@ -174,7 +174,7 @@ def kernels(self):
   def prepareUntilUnitTest(self):
     for kernel in self._kernels.values():
       kernel.prepareUntilUnitTest()
-  
+
   def prepareUntilCodeGen(self, costEstimator, enableFusedGemm: bool):
     for kernel in self._kernels.values():
       kernel.prepareUntilCodeGen(costEstimator, enableFusedGemm)
@@ -189,10 +189,10 @@ class GlobalRoutineCache:
   def __init__(self):
     self.cache = RoutineCache()
     self.dirs = []
-  
+
   def register(self, outputDir):
     self.dirs += [outputDir]
-  
+
   def generate(self, outputDir, namespace='yateto'):
     print('Calling external code generators...')
     fRoutines = Generator.FileNames(outputDir, Generator.ROUTINES_FILE_NAME)
@@ -200,7 +200,7 @@ def generate(self, outputDir, namespace='yateto'):
     with Cpp(fRoutines.h) as header:
       with header.HeaderGuard(Generator._headerGuardName(namespace, Generator.ROUTINES_FILE_NAME)):
         self.cache.generate(header, fRoutines.cpp, fGpulikeRoutines.cpp)
-    
+
     for subdir in self.dirs:
       relpath = os.path.relpath(outputDir, subdir)
       rfRoutines = Generator.FileNames(subdir, Generator.ROUTINES_FILE_NAME)
@@ -217,7 +217,7 @@ class Generator(object):
   DOCTEST_FILE_NAME = 'test-kernel'
   HEADER_GUARD_SUFFIX = 'H_'
   SUPPORT_LIBRARY_HEADER = 'yateto.h'
-  
+
   class FileNames(object):
     HEADER = 'h'
     CPP = 'cpp'
@@ -227,7 +227,7 @@ def __init__(self, outputDir, name):
       self.cppName = '{}.{}'.format(name, self.CPP)
       self.h = os.path.join(outputDir, self.hName)
       self.cpp = os.path.join(outputDir, self.cppName)
-  
+
   def __init__(self, arch):
     self._kernels = list()
     self._kernelFamilies = dict()
@@ -242,7 +242,7 @@ def add(self, name: str, ast: Node, prefetch=None, namespace=None, target='cpu')
       if baseName not in self._kernelFamilies:
         self._kernelFamilies[baseName] = KernelFamily()
       self._kernelFamilies[baseName].add(name, ast, prefetch, namespace, target)
-    else:      
+    else:
       if not Kernel.isValidName(name):
         raise ValueError(f'Kernel name invalid (must match regexp {Kernel.VALID_NAME}): {name}')
       kernel = Kernel(name, ast, prefetch, namespace=namespace, target=target)
@@ -273,7 +273,7 @@ def addFamily(self,
       ast = astGenerator(*p)
       prefetch = prefetchGenerator(*p) if prefetchGenerator is not None else None
       family.add(indexedName, ast, prefetch, namespace, target=target)
-  
+
   @classmethod
   def _headerGuardName(self, namespace, fileBaseName):
     partlist = namespace.upper().split('::') + [fileBaseName.upper(), self.HEADER_GUARD_SUFFIX]
@@ -310,7 +310,7 @@ def generate(self,
     fTensors = self.FileNames(outputDir, self.TENSORS_FILE_NAME)
     fInit = self.FileNames(outputDir, self.INIT_FILE_NAME)
     fRoutines = self.FileNames(outputDir, self.ROUTINES_FILE_NAME)
-    fGpulikeRoutines = self.FileNames(outputDir, self.GPULIKE_ROUTINES_FILE_NAME)  
+    fGpulikeRoutines = self.FileNames(outputDir, self.GPULIKE_ROUTINES_FILE_NAME)
 
     print('Generating unit tests...')
     def unit_test_body(cpp, testFramework):
diff --git a/yateto/input.py b/yateto/input.py
index 3200215..2867c20 100644
--- a/yateto/input.py
+++ b/yateto/input.py
@@ -83,9 +83,9 @@ def openMaybeCompressed(basefilename):
 def parseXMLMatrixFile(xmlFile, clones=dict(), transpose=lambda name: False, alignStride=lambda name: False, namespace=None):
   with openMaybeCompressed(xmlFile) as file:
     root = etree.fromstring(file.read())
-  
+
   matrices = dict()
-  
+
   for node in root:
     if node.tag == 'matrix':
       name = node.get('name')
@@ -131,7 +131,7 @@ def parseJSONMatrixFile(jsonFile, clones=dict(), transpose=lambda name: False, a
 def memoryLayoutFromFile(xmlFile, db, clones, strict=False):
   with openMaybeCompressed(xmlFile) as file:
     root = etree.fromstring(file.read())
-  
+
   strtobool = ['yes', 'true', '1']
   groups = dict()
 
diff --git a/yateto/memory.py b/yateto/memory.py
index fa3ea9a..11a4497 100644
--- a/yateto/memory.py
+++ b/yateto/memory.py
@@ -13,11 +13,11 @@ def __init__(self, shape):
 
   def shape(self):
     return self._shape
-  
+
   @abstractmethod
   def address(self, entry):
     pass
-  
+
   @abstractmethod
   def subtensorOffset(self, topLeftEntry):
     pass
@@ -32,7 +32,7 @@ def mayVectorizeDim(self, dim):
 
   def mayFuse(self, positions):
     return len(positions) == 1
-  
+
   @classmethod
   @abstractmethod
   def fromSpp(cls, spp, **kwargs):
@@ -61,7 +61,7 @@ class DenseMemoryLayout(MemoryLayout):
   @classmethod
   def setAlignmentArch(cls, arch):
     cls.ALIGNMENT_ARCH = arch
-  
+
   def __init__(self, shape, boundingBox=None, stride=None, alignStride=False):
     super().__init__(shape)
 
@@ -78,13 +78,13 @@ def __init__(self, shape, boundingBox=None, stride=None, alignStride=False):
       self._stride = stride
     else:
       self._computeStride()
-  
+
   def _computeStride(self):
     stride = [1]
     for i in range(len(self._bbox)-1):
       stride.append(stride[i] * self._bbox[i].size())
     self._stride = tuple(stride)
-  
+
   def _alignBB(self):
     if self.ALIGNMENT_ARCH is not None:
       self._range0 = self._bbox[0]
@@ -92,7 +92,7 @@ def _alignBB(self):
       self._bbox = BoundingBox([rnew] + self._bbox[1:])
     else:
       warnings.warn('Set architecture with DenseMemoryLayout.setAlignmentArch(arch) if you want to use the align stride feature.', UserWarning)
-  
+
   def alignedStride(self):
     if self.ALIGNMENT_ARCH is None:
       return False
@@ -120,7 +120,7 @@ def __contains__(self, entry):
 
   def permuted(self, permutation):
     newShape = tuple([self._shape[p] for p in permutation])
-    
+
     originalBB = BoundingBox([self._range0] + self._bbox[1:]) if self._range0 else self._bbox
     newBB = BoundingBox([copy.copy(originalBB[p]) for p in permutation])
     return DenseMemoryLayout(newShape, newBB, alignStride=self._range0 is not None)
@@ -131,7 +131,7 @@ def address(self, entry):
 
   def subtensorOffset(self, topLeftEntry):
     return self.address(topLeftEntry)
-  
+
   def notWrittenAddresses(self, writeBB):
     if writeBB == self._bbox:
       return []
@@ -143,10 +143,10 @@ def notWrittenAddresses(self, writeBB):
 
   def stride(self):
     return self._stride
-  
+
   def stridei(self, dim):
     return self._stride[dim]
-  
+
   def bbox(self):
     return self._bbox
 
@@ -158,7 +158,7 @@ def requiredReals(self):
       return 1
     size = self._bbox[-1].size() * self._stride[-1]
     return size
-  
+
   def addressString(self, indices, I = None, prefix='_', offsets=()):
     if len(self._bbox) == 0:
       return '0'
@@ -189,13 +189,13 @@ def isAlignedAddressString(self, indices, I = None):
 
   def mayFuse(self, positions):
     return all( [self._stride[j] == self._shape[i]*self._stride[i] for i,j in zip(positions[:-1], positions[1:])] )
-  
+
   def _subShape(self, positions):
     sub = 1
     for p in positions:
       sub *= self._shape[p]
     return sub
-  
+
   def _subRange(self, positions):
     start = 0
     stop = 0
@@ -205,7 +205,7 @@ def _subRange(self, positions):
       stop += s * (self._bbox[p].stop-1)
       s *= self._shape[p]
     return Range(start, stop+1)
-    
+
   def _firstStride(self, positions):
     return self._stride[ positions[0] ]
 
@@ -238,7 +238,7 @@ def unfold(self, indices, I, J):
     stride = (self._firstStride(positionsI), self._firstStride(positionsJ))
 
     return DenseMemoryLayout(shape, bbox, stride)
-  
+
   def defuse(self, fusedRange, indices, I):
     positions = indices.positions(I)
     s = self._subShape(positions)
@@ -256,7 +256,7 @@ def defuse(self, fusedRange, indices, I):
 
   def isCompatible(self, spp):
     return BoundingBox.fromSpp(spp) in self.bbox()
-  
+
   def subslice(self, index, start, end):
     return MemoryLayoutView(self, index, start, end)
 
@@ -265,16 +265,16 @@ def __eq__(self, other):
 
   def __str__(self):
     return '{}(shape: {}, bounding box: {}, stride: {})'.format(type(self).__name__, self._shape, self._bbox, self._stride)
-  
+
   def isCSC(self):
     return False
-  
+
   def spp(self):
     raise NotImplementedError()
-  
+
   def storage(self):
     return self
-  
+
   def alignmentOffset(self, dim):
     return 0
 
@@ -287,7 +287,7 @@ def __init__(self, spp, alignStride=False):
 
     self.aligned = alignStride
     self._spp = spp
-    
+
     if len(self._shape) != 2:
       raise ValueError('CSCMemoryLayout may only be used for matrices.')
 
@@ -296,7 +296,7 @@ def __init__(self, spp, alignStride=False):
       range0 = self._bbox[0]
       rnew = Range( DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(range0.start), DenseMemoryLayout.ALIGNMENT_ARCH.alignedUpper(range0.stop) )
       self._bbox = BoundingBox([rnew] + self._bbox[1:])
-    
+
     nonzeros = spp.nonzero()
     nonzeros = sorted(zip(nonzeros[0], nonzeros[1]), key=lambda x: (x[1], x[0]))
 
@@ -309,13 +309,13 @@ def __init__(self, spp, alignStride=False):
 
         for i in range(lower, upper):
           nonzeros_pre.add((np.int64(i), nonzero[1]))
-      
+
       nonzeros = list(nonzeros_pre)
       nonzeros = sorted(zip([nonzero[0] for nonzero in nonzeros], [nonzero[1] for nonzero in nonzeros]), key=lambda x: (x[1], x[0]))
-    
+
     self._rowIndex = np.ndarray(shape=(len(nonzeros),), dtype=int)
     self._colPtr = np.ndarray(shape=(self._shape[1]+1,), dtype=int)
-    
+
     lastCol = 0
     self._colPtr[0] = 0
     for i,entry in enumerate(nonzeros):
@@ -335,31 +335,31 @@ def bbox(self):
 
   def bboxi(self, dim):
     return self._bbox[dim]
-  
+
   def rowIndex(self):
     return self._rowIndex
-  
+
   def colPointer(self):
     return self._colPtr
-  
+
   def isAlignedAddressString(self, indices, I = None):
     if I is None:
       I = set(indices)
     positions = indices.positions(I)
     return len(positions) == 0 or (positions[0] == 0 and all(p != 0 for p in positions[1:]))
-  
+
   def address(self, entry):
     assert entry in self._bbox
 
     start = self._colPtr[ entry[1] ]
     stop = self._colPtr[ entry[1]+1 ]
     subRowInd = self._rowIndex[start:stop]
- 
+
     find = np.where(subRowInd == entry[0])[0]
     assert len(find) == 1
 
     return start + find[0]
-  
+
   def subtensorOffset(self, topLeftEntry):
     assert topLeftEntry in self._bbox
     assert topLeftEntry[0] <= self._bbox[0].start
@@ -399,16 +399,16 @@ def isCompatible(self, spp):
 
   def __eq__(self, other):
     return self._bbox == other._bbox and np.array_equal(self._rowIndex, other._rowIndex) and np.array_equal(self._colPtr, other._colPtr)
-  
+
   def subslice(self, index, start, end):
     return MemoryLayoutView(self, index, start, end)
-  
+
   def spp(self):
     return self._spp
-  
+
   def storage(self):
     return self
-  
+
   def alignmentOffset(self, dim):
     return 0
 
@@ -427,19 +427,19 @@ def __init__(self, base, index, start, end):
     self.index = index
     self.start = start
     self.end = end
-  
+
   def relidx(self, index):
     return tuple(index[i] if i != self.index else index[i] + self.start for i in range(len(self._shape)))
-  
+
   def relbox(self, bbox):
     return BoundingBox([Range(max(bbox[i].start + self.start, self.start), min(bbox[i].stop + self.start, self.end)) if i == self.index else bbox[i] for i in range(len(self._shape))])
-  
+
   def relspp(self, spp):
     subslice = tuple(slice(self.start, self.end) if i == self.index else slice(None) for i in range(spp.ndim))
     superarray = np.zeros(tuple(self.base.shape()), dtype=bool)
     superarray[subslice] = spp.as_ndarray()
     return aspp.general(superarray)
-  
+
   def relranges(self):
     starts, ends = self.base.relranges()
     starts[self.index] = max(starts[self.index], self.start)
@@ -448,23 +448,23 @@ def relranges(self):
 
   def __contains__(self, bbox):
     return self.base.__contains__(self.relbox(bbox))
-  
+
   def __eq__(self, other):
     return self.storage() == other.storage() and self.relranges() == other.relranges()
-  
+
   def address(self, entry):
     return self.base.address(self.relidx(entry))
-  
+
   def subtensorOffset(self, topLeftEntry):
     return self.base.subtensorOffset(self.relidx(topLeftEntry))
-  
+
   def alignedStride(self):
     return self.base.alignedStride() and (self.index != 0 or DenseMemoryLayout.ALIGNMENT_ARCH.checkAlignment(self.end - self.start))
-  
+
   def fromSpp(self):
     # cannot be implemented. Call should result in error.
     raise NotImplementedError()
-  
+
   def isCompatible(self, spp):
     # only a rough criterion. Can possibly be refined.
     if spp.as_ndarray().shape != tuple(self.shape()):
@@ -474,19 +474,19 @@ def isCompatible(self, spp):
 
   def mayVectorizeDim(self, dim):
     return self.base.mayVectorizeDim(dim)
-  
+
   def isAlignedAddressString(self, indices, I = None):
     return self.base.isAlignedAddressString(indices, I)
-  
+
   def addressString(self, indices, I = None, prefix='_', offsets=()):
     if len(offsets) == 0:
       offsets = [0] * len(self._shape)
     newOffsets = tuple(offsets[i] if self.index != i else offsets[i] + self.start for i in range(len(self._shape)))
     return self.base.addressString(indices, I, prefix, newOffsets)
-  
+
   def subslice(self, index, start, end):
     return MemoryLayoutView(self, index, start, end)
-  
+
   def unfold(self, indices, I, J):
     positionsI = indices.positions(I)
     positionsJ = indices.positions(J)
@@ -504,7 +504,7 @@ def unfold(self, indices, I, J):
       scale *= shape[p]
 
     return MemoryLayoutView(self.base.unfold(indices, I, J), newIndex, self.start * scale, self.end * scale)
-  
+
   def withDummyDimension(self):
     return MemoryLayoutView(self.base.withDummyDimension(), self.index, self.start, self.end)
 
@@ -520,11 +520,11 @@ def defuse(self, fusedRange, indices, I):
       return self.base.defuse(newFusedRange, indices, I)
     else:
       return self.base.defuse(fusedRange, indices, I)
-  
+
   def stride(self):
     # pass through
     return self.base.stride()
-  
+
   def stridei(self, dim):
     # pass through
     return self.base.stridei(dim)
@@ -533,16 +533,16 @@ def notWrittenAddresses(self, writeBB):
     # focus only on the subview
     outside = set(self.base.notWrittenAddresses(self.bbox()))
     return list(set(self.base.notWrittenAddresses(self.relbox(writeBB))) - outside)
-  
+
   def bbox(self):
     return self.relbox(self.base.bbox())
-  
+
   def storage(self):
     return self.base.storage()
-  
+
   def permuted(self, permutation):
     return MemoryLayoutView(self.base.permuted(permutation), permutation[self.index], self.start, self.end)
-  
+
   def entries(self, rowRange, colRange):
     if self.index == 0:
       return self.base.entries(Range(rowRange.start + self.start, rowRange.stop + self.start), colRange)
@@ -550,13 +550,13 @@ def entries(self, rowRange, colRange):
       return self.base.entries(rowRange, Range(colRange.start + self.start, colRange.stop + self.start))
     else:
       raise NotImplementedError()
-  
+
   def mayFuse(self, positions):
     return (self.index not in positions or positions[-1] == self.index) and self.base.mayFuse(positions)
 
   def __repr__(self):
     return f'MemoryLayoutView(index: {self.index}; range: [{self.start},{self.end}); base: {self.base})'
-  
+
   def alignmentOffset(self, dim):
     val = self.base.alignmentOffset(dim)
     if self.index == dim:
diff --git a/yateto/type.py b/yateto/type.py
index e186a44..874645e 100644
--- a/yateto/type.py
+++ b/yateto/type.py
@@ -8,7 +8,7 @@ class AbstractType(object):
   @classmethod
   def isValidName(cls, name):
     return re.match(cls.VALID_NAME, name) is not None
-  
+
   def name(self):
     return self._name
 
@@ -21,12 +21,12 @@ class IdentifiedType(AbstractType):
   def __init__(self, name, namespace=None):
     if not self.isValidName(name):
       raise ValueError('Invalid name (must match regexp {}): {}'.format(self.VALID_NAME, name))
-    
+
     self._name = name
     self.namespace = namespace
 
     self.datatype = None # TODO
-  
+
   def __str__(self):
     return self._name
 
@@ -39,14 +39,14 @@ def getGroup(cls, name):
 
   def group(self):
     return self.getGroup(self._name)
-  
+
   @classmethod
   def getBaseName(cls, name):
     return re.match(cls.BASE_NAME, name).group(0)
-  
+
   def baseName(self):
     return self.getBaseName(self._name)
-  
+
   @classmethod
   def splitBasename(cls, base_name_with_namespace):
     name_parts = base_name_with_namespace.rsplit('::', 1)
@@ -56,23 +56,23 @@ def splitBasename(cls, base_name_with_namespace):
       prefix = ''
     base_name = name_parts[-1]
     return prefix, base_name
-  
+
   def prefix(self):
     return '{}::'.format(self.namespace) if self.namespace else ''
-  
+
   def baseNameWithNamespace(self):
     return '{}{}'.format(self.prefix(), self.baseName())
 
   def nameWithNamespace(self):
     return '{}{}'.format(self.prefix(), self.name())
-  
+
   def __hash__(self):
     return hash(self._name)
 
-class Scalar(IdentifiedType):  
+class Scalar(IdentifiedType):
   def __init__(self, name, namespace=None):
     super().__init__(name, namespace=namespace)
-  
+
   def __hash__(self):
     return hash(self._name)
 
@@ -88,10 +88,10 @@ def __init__(self,
     super().__init__(name, namespace=namespace)
     if not isinstance(shape, tuple):
       raise ValueError('shape must be a tuple')
-    
+
     if any(x < 1 for x in shape):
       raise ValueError('shape must not contain entries smaller than 1')
-    
+
     if not self.isValidName(name):
       raise ValueError('Tensor name invalid (must match regexp {}): {}'.format(self.VALID_NAME, name))
 
@@ -125,7 +125,7 @@ def __init__(self,
     else:
       self._spp = aspp.dense(shape)
     self._groupSpp = self._spp
-    
+
     self.setMemoryLayout(memoryLayoutClass, alignStride)
 
   def __hash__(self):
@@ -148,16 +148,16 @@ def setGroupSpp(self, spp):
 
   def __getitem__(self, indexNames):
     return IndexedTensor(self, indexNames)
-  
+
   def shape(self):
     return self._shape
-  
+
   def memoryLayout(self):
     return self._memoryLayout
-  
+
   def spp(self, groupSpp=True):
     return self._groupSpp if groupSpp else self._spp
-  
+
   def values(self):
     return self._values
 
@@ -186,7 +186,7 @@ def __eq__(self, other):
     if equal:
       assert self._shape == other._shape and aspp.array_equal(self._spp, other._spp) and self._memoryLayout == other._memoryLayout
     return equal
-  
+
   def __str__(self):
     return '{}: {}'.format(self._name, self._shape)
 
@@ -196,13 +196,13 @@ def update(self, collection):
 
   def __getitem__(self, key):
     return self.__dict__[key]
-  
+
   def __setitem__(self, key, value):
     self.__dict__[key] = value
 
   def __contains__(self, key):
     return key in self.__dict__
-  
+
   @classmethod
   def group(cls, name):
     group = Tensor.getGroup(name)