diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..79aabf1 --- /dev/null +++ b/.clang-format @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: 2021 SeisSol Group +# +# SPDX-License-Identifier: BSD-3-Clause + +Language: Cpp +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: false +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Right +AlignOperands: true +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllConstructorInitializersOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 
+Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Regroup +IncludeCategories: + # keep the doctest headers in front + - Regex: '^(<|")doctest' + Priority: 1 + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 3 + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 4 + - Regex: '.*' + Priority: 2 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: true +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PointerAlignment: Left +QualifierAlignment: Left +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: c++17 +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new 
file mode 100644 index 0000000..240b03b --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: 2021 SeisSol Group +# +# SPDX-License-Identifier: BSD-3-Clause + +# apply clang-format/basic pre-commit +511bc1513e3a6d5ceb4b338750f57253ec429e9b +# update clang-format config +fb0b99c3454b5dad40ae0cff13099e4e8a5bf894 diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..1fa3922 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: 2025 SeisSol Group +# +# SPDX-License-Identifier: BSD-3-Clause +# SPDX-LicenseComments: Full text under /LICENSE and /LICENSES/ +# +# SPDX-FileContributor: Author lists in /AUTHORS and /CITATION.cff + +name: pre-commit +on: + - push + +jobs: + pre-commit: + name: pre-commit + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v6 + - uses: pre-commit/action@v3.0.1 diff --git a/.github/workflows/yateto-ci.yml b/.github/workflows/yateto-cpu.yml similarity index 92% rename from .github/workflows/yateto-ci.yml rename to .github/workflows/yateto-cpu.yml index b637aba..e9c719a 100644 --- a/.github/workflows/yateto-ci.yml +++ b/.github/workflows/yateto-cpu.yml @@ -1,23 +1,23 @@ -name: Yateto CI +name: yateto-cpu on: push jobs: general: - runs-on: ubuntu-latest - container: + runs-on: ubuntu-24.04 + container: image: seissol/gha-cpu:davschneller-gpu-image steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - name: Install Yateto run: | pip3 install -e . 
--break-system-packages - + - name: Python Tests run: | python3 -m unittest tests/internals/*.py - + - name: install-packages run: | apt-get update -y @@ -25,7 +25,7 @@ jobs: add-apt-repository ppa:deadsnakes/ppa apt-get update -y apt-get install -y cxxtest - + - name: Interface Tests run: | cd ./tests/interface @@ -38,8 +38,8 @@ jobs: done codegen: - runs-on: ubuntu-latest - container: + runs-on: ubuntu-24.04 + container: image: seissol/gha-cpu:davschneller-gpu-image env: CTEST_OUTPUT_ON_FAILURE: 1 @@ -47,14 +47,14 @@ jobs: fail-fast: false matrix: generator: [none, Eigen, LIBXSMM, LIBXSMM_JIT, OpenBLAS, PSpaMM] - + steps: - - uses: actions/checkout@v5 + - uses: actions/checkout@v6 - name: Install Yateto run: | pip3 install -e . --break-system-packages - + - name: install-packages run: | apt-get update -y @@ -62,7 +62,7 @@ jobs: add-apt-repository ppa:deadsnakes/ppa apt-get update -y apt-get install -y cxxtest - + - name: install-libxsmm if: ${{ matrix.generator == 'LIBXSMM_JIT' }} run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..37ca028 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: 2025 SeisSol Group +# +# SPDX-License-Identifier: BSD-3-Clause + +--- + +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-merge-conflict + name: '[GENERIC] merge conflict check' + - id: check-symlinks + name: '[GENERIC] symlink check' + - id: destroyed-symlinks + name: '[GENERIC] detect broken symlinks' + - id: detect-private-key + name: '[GENERIC] detect private keys uploaded by accident' + - id: check-case-conflict + name: '[GENERIC] detect OS file naming case conflicts' + - id: check-executables-have-shebangs + name: '[GENERIC] check for shebangs in executable files' + - id: check-illegal-windows-names + name: '[GENERIC] detect illegal Windows file names' + - id: check-json + name: '[JSON] check' + - id: check-xml + name: '[XML] check' 
+ +- repo: https://github.com/DavidAnson/markdownlint-cli2 + rev: v0.18.1 + hooks: + - id: markdownlint-cli2 + name: '[MARKDOWN] lint' + +#- repo: https://github.com/fsfe/reuse-tool +# rev: v6.0.0 +# hooks: +# - id: reuse +# name: '[GENERIC] REUSE compatibility' + +#- repo: https://github.com/psf/black-pre-commit-mirror +# rev: 25.1.0 +# hooks: +# - id: black +# files: ^(?!preprocessing|postprocessing) +# name: '[PYTHON] black' +#- repo: https://github.com/pycqa/isort +# rev: 6.0.1 +# hooks: +# - id: isort +# files: ^(?!preprocessing|postprocessing) +# args: ["--profile", "black"] +# name: '[PYTHON] isort' +- repo: https://github.com/pycqa/bandit + rev: 1.8.6 + hooks: + - id: bandit + args: ["--confidence-level", "high", "--severity-level", "high"] + name: '[PYTHON] bandit' +#- repo: https://github.com/pycqa/flake8 +# rev: '7.3.0' +# hooks: +# - id: flake8 +# files: ^(?!preprocessing|postprocessing) +# name: '[PYTHON] Flake8' + +- repo: https://github.com/sphinx-contrib/sphinx-lint + rev: 'v1.0.0' + hooks: + - id: sphinx-lint + name: '[SPHINX/RST] sphinx lint' + +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: 'v21.1.0' + hooks: + - id: clang-format + name: '[C++] clang-format' + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: end-of-file-fixer + name: '[GENERIC] newline eof' + - id: trailing-whitespace + name: '[GENERIC] remove trailing whitespace' diff --git a/README.md b/README.md index 3919b6a..90510e2 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ # YATeTo -It is **Y**et **A**nother **Te**nsor **To**olbox for discontinuous Galerkin methods and other -applications. You can find much more information about the package -[here](https://arxiv.org/abs/1903.11521). +It is **Y**et **A**nother **Te**nsor **To**olbox for +discontinuous Galerkin methods and other +applications. You can find much more information about the package +[in this paper](https://arxiv.org/abs/1903.11521). 
## Installation @@ -24,7 +25,7 @@ def add(g): B = Tensor('B', (N, N, N)) w = Tensor('w', (N,)) C = Tensor('C', (N, N)) - + kernel = C['ij'] <= 2.0 * C['ij'] + A['lj'] * B['ikl'] * w['k'] g.add(name='kernel', ast=kernel) diff --git a/examples/SConstruct b/examples/SConstruct index 3b37d18..18a0fb2 100644 --- a/examples/SConstruct +++ b/examples/SConstruct @@ -26,7 +26,7 @@ if 'CXX' in env['ENV']: # generate help text Help(vars.GenerateHelpText(env)) - + # handle unknown, maybe misspelled variables unknownVariables = vars.UnknownVariables() diff --git a/examples/common/Stopwatch.h b/examples/common/Stopwatch.h index 7d7a5f1..064b1d9 100644 --- a/examples/common/Stopwatch.h +++ b/examples/common/Stopwatch.h @@ -3,8 +3,10 @@ * This file is part of SeisSol. * * @author Alexander Heinecke (Alexander.Heinecke@mytum.de) - * @author Sebastian Rettenberger (sebastian.rettenberger AT tum.de, http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger) - * @author Carsten Uphoff (c.uphoff AT tum.de, http://www5.in.tum.de/wiki/index.php/Carsten_Uphoff,_M.Sc.) + * @author Sebastian Rettenberger (sebastian.rettenberger AT tum.de, + * http://www5.in.tum.de/wiki/index.php/Sebastian_Rettenberger) + * @author Carsten Uphoff (c.uphoff AT tum.de, + * http://www5.in.tum.de/wiki/index.php/Carsten_Uphoff,_M.Sc.) * * @section LICENSE * Copyright (c) 2016-2017, SeisSol Group @@ -48,97 +50,81 @@ /** * Stopwatch * - * Part of SeisSol, so you can easily calculate the needed time of SeisSol computations with a high precision + * Part of SeisSol, so you can easily calculate the needed time of SeisSol + * computations with a high precision */ -class Stopwatch -{ -private: - struct timespec m_start; - - /** Time already spent */ - long long m_time; - +class Stopwatch { + private: + struct timespec m_start; + + /** Time already spent */ + long long m_time; + /** Returns the time difference in nanoseconds. 
*/ - long long difftime(struct timespec const& end) - { + long long difftime(struct timespec const& end) { return 1000000000L * (end.tv_sec - m_start.tv_sec) + end.tv_nsec - m_start.tv_nsec; } - - double seconds(long long time) - { - return 1.0e-9 * time; - } -public: - /** - * Constructor - * - * resets the Stopwatch - */ - Stopwatch() : m_time(0) - {} - - /** - * Destructor - */ - ~Stopwatch() - {} - - /** - * Reset the stopwatch to zero - */ - void reset() - { - m_time = 0; - } - - /** - * starts the time measuring - */ - void start() - { - clock_gettime(CLOCK_MONOTONIC, &m_start); - } - - /** - * get time measuring - * - * @return measured time (until now) in seconds - */ - double split() - { - struct timespec end; - clock_gettime(CLOCK_MONOTONIC, &end); - + double seconds(long long time) { return 1.0e-9 * time; } + + public: + /** + * Constructor + * + * resets the Stopwatch + */ + Stopwatch() : m_time(0) {} + + /** + * Destructor + */ + ~Stopwatch() {} + + /** + * Reset the stopwatch to zero + */ + void reset() { m_time = 0; } + + /** + * starts the time measuring + */ + void start() { clock_gettime(CLOCK_MONOTONIC, &m_start); } + + /** + * get time measuring + * + * @return measured time (until now) in seconds + */ + double split() { + struct timespec end; + clock_gettime(CLOCK_MONOTONIC, &end); + return seconds(difftime(end)); - } - - /** - * pauses the measuring - * - * @return measured time (until now) in seconds - */ - double pause() - { - struct timespec end; - clock_gettime(CLOCK_MONOTONIC, &end); - - m_time += difftime(end); - return seconds(m_time); - } - - /** - * stops time measuring - * - * @return measured time in seconds - */ - double stop() - { - double time = pause(); - reset(); - return time; - } + } + + /** + * pauses the measuring + * + * @return measured time (until now) in seconds + */ + double pause() { + struct timespec end; + clock_gettime(CLOCK_MONOTONIC, &end); + + m_time += difftime(end); + return seconds(m_time); + } + + /** + * 
stops time measuring + * + * @return measured time in seconds + */ + double stop() { + double time = pause(); + reset(); + return time; + } }; #endif // STOPWATCH_H - diff --git a/examples/common/Util.h b/examples/common/Util.h index 6a83a92..08242b6 100644 --- a/examples/common/Util.h +++ b/examples/common/Util.h @@ -7,12 +7,11 @@ typedef double real; #elif REAL_SIZE == 4 typedef float real; #else -# error REAL_SIZE not supported. +#error REAL_SIZE not supported. #endif void fillWithStuff(real* A, unsigned reals) { for (unsigned j = 0; j < reals; ++j) { - A[j] = drand48(); + A[j] = drand48(); } } - diff --git a/examples/optimal_ind.py b/examples/optimal_ind.py index be5941f..3a2f8e1 100755 --- a/examples/optimal_ind.py +++ b/examples/optimal_ind.py @@ -29,6 +29,3 @@ def add(g): tmp2[i2] <= tmp1[i1] * C['dfjk'], S['abij'] <= tmp2[i2] * A['acik'] ] g.add('kernel_{}_{}'.format(i1,i2), kernel) - - - diff --git a/examples/seissol_eqspp.py b/examples/seissol_eqspp.py index 7e3e15f..6f439b3 100755 --- a/examples/seissol_eqspp.py +++ b/examples/seissol_eqspp.py @@ -10,7 +10,7 @@ def printEqspp(): def add(g): db = parseXMLMatrixFile('seissol_matrices.xml') - + Q = Tensor('Q', (8, 20, 15)) I = Tensor('I', (8, 20, 15)) g.add('seissol_stiffness', Q['skp'] <= db.kXiTDivM['lk'] * I['slq'] * db.star['qp']) diff --git a/examples/site_scons/arch.py b/examples/site_scons/arch.py index a5e9d4d..072a648 100644 --- a/examples/site_scons/arch.py +++ b/examples/site_scons/arch.py @@ -8,21 +8,21 @@ # @section LICENSE # Copyright (c) 2016, SeisSol Group # All rights reserved. -# +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: -# +# # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. -# +# # 2. 
Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. -# +# # 3. Neither the name of the copyright holder nor the names of its # contributors may be used to endorse or promote products derived from this # software without specific prior written permission. -# +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE @@ -57,7 +57,7 @@ def getRealSize(architecture): def getCpu(architecture): return architecture[1:] - + def getAlignment(architecture): alignments = { 'noarch': 16, @@ -69,13 +69,13 @@ def getAlignment(architecture): 'skx': 64 } return alignments[ getCpu(architecture) ] - + def getFlags(architecture, compiler = 'intel'): if architecture not in getArchitectures(): raise ValueError('Unknown architecture.') - + cpu = getCpu(architecture) - + if cpu == 'wsm': flags = ['-msse3'] elif cpu == 'snb': @@ -93,13 +93,13 @@ def getFlags(architecture, compiler = 'intel'): flags = ['-xCORE-AVX512', '-fma'] else: flags = [] - + # enable interproc. opts for small cores if cpu in ['knc', 'knl', 'skx']: flags.append('-ip') flags.append('-fopenmp') - + return flags def getDefines(architecture): diff --git a/examples/site_scons/site_tools/cxxtest.py b/examples/site_scons/site_tools/cxxtest.py index 65f2891..0fe97d0 100644 --- a/examples/site_scons/site_tools/cxxtest.py +++ b/examples/site_scons/site_tools/cxxtest.py @@ -15,7 +15,7 @@ # Maintainer: Gašper Ažman # # This file is maintained as a part of the CxxTest test suite. 
-# +# # == About == # # This builder correctly tracks dependencies and supports just about every @@ -171,7 +171,7 @@ def defaultCxxTestGenLocation(env): def findCxxTestGen(env): """locate the cxxtestgen script by checking environment, path and project""" - + # check the SCons environment... # Then, check the OS environment... cxxtest = envget(env, 'CXXTEST', None) @@ -201,7 +201,7 @@ def findCxxTestGen(env): # make sure it was correct if isValidScriptPath(cxxtest): return os.path.realpath(cxxtest) - + # No valid environment variable found, so... # Next, check the path... # Next, check the project @@ -209,7 +209,7 @@ def findCxxTestGen(env): envget(env, 'CXXTEST_INSTALL_DIR'), envget(env, 'CXXTEST_CXXTESTGEN_DEFAULT_LOCATION')) - cxxtest = (env.WhereIs(envget(env, 'CXXTEST_CXXTESTGEN_SCRIPT_NAME')) or + cxxtest = (env.WhereIs(envget(env, 'CXXTEST_CXXTESTGEN_SCRIPT_NAME')) or env.WhereIs(envget(env, 'CXXTEST_CXXTESTGEN_SCRIPT_NAME'), path=[Dir(check_path).abspath])) @@ -286,7 +286,7 @@ def generate(env, **kwargs): # # Expected behaviour: keyword arguments override environment variables; # environment variables override default settings. - # + # env.SetDefault( CXXTEST_RUNNER = 'ErrorPrinter' ) env.SetDefault( CXXTEST_OPTS = '' ) env.SetDefault( CXXTEST_SUFFIX = '.t.h' ) @@ -316,7 +316,7 @@ def generate(env, **kwargs): # find and add the CxxTest headers to the path. 
env.AppendUnique( CXXTEST_CPPPATH = findCxxTestHeaders(env) ) - + cxxtest = env['CXXTEST'] if cxxtest: # @@ -397,4 +397,3 @@ def CxxTest(env, target, source = None, **kwargs): def exists(env): return os.path.exists(env['CXXTEST']) - diff --git a/examples/springer.py b/examples/springer.py index 5a152f8..c88597b 100755 --- a/examples/springer.py +++ b/examples/springer.py @@ -14,7 +14,7 @@ def cold(): def add_tensor(name, ind, size): shape = tuple(size[k] for k in ind) return Tensor(name + str(_bench_no), shape) - + def add_bench(g, descr, sizes): global _bench_no diff --git a/examples/stock.py b/examples/stock.py index c50e644..e530d05 100755 --- a/examples/stock.py +++ b/examples/stock.py @@ -41,4 +41,3 @@ def add(g): stock = R['ijk'] <= S['xyz'] * XLTP['lx'] * XRTP['il'] * YL['ym'] * YR['mj'] * ZL['zn'] * ZR['nk'] g.add('stock{}_trans_pad'.format(pqx), stock) - diff --git a/include/yateto.h b/include/yateto.h index 46c68db..bee5074 100644 --- a/include/yateto.h +++ b/include/yateto.h @@ -1,9 +1,9 @@ #ifndef YATETO_H_ #define YATETO_H_ -#include "yateto/TensorView.h" #include "yateto/InitTools.h" #include "yateto/LinearAllocator.h" #include "yateto/Misc.h" +#include "yateto/TensorView.h" #endif diff --git a/include/yateto/CopyPolicy.h b/include/yateto/CopyPolicy.h index 1af26db..fae24e6 100644 --- a/include/yateto/CopyPolicy.h +++ b/include/yateto/CopyPolicy.h @@ -4,14 +4,14 @@ #include namespace yateto { - template - class SimpleCopyPolicy { +template +class SimpleCopyPolicy { public: - float_t* copy(float_t const* first, float_t const* last, float_t*& mem) { - mem = std::copy(first, last, mem); - return mem; - } - }; -} + float_t* copy(const float_t* first, const float_t* last, float_t*& mem) { + mem = std::copy(first, last, mem); + return mem; + } +}; +} // namespace yateto -#endif // YATETO_COPY_POLICY_H_ +#endif // YATETO_COPY_POLICY_H_ diff --git a/include/yateto/InitTools.h b/include/yateto/InitTools.h index d582b01..966f1bb 100644 --- 
a/include/yateto/InitTools.h +++ b/include/yateto/InitTools.h @@ -2,137 +2,144 @@ #define YATETO_INITTOOLS_H_ #include "CopyPolicy.h" + #include #include #include namespace yateto { - /** Computes a number of tensors inside of a tensor family. - * - * @return a number of tensors. - * */ - template - constexpr size_t numFamilyMembers() { - return sizeof(T::Size) / sizeof(T::Size[0]); - } - - - /** Computes the next closest aligned memory address for a provided relative address. - * - * @param size a pointer address as integer. - * @param alignment a size of a vector register. - * @return the next closest aligned relative address. - * */ - template - constexpr size_t alignedUpper(int_t size, size_t alignment) { - return size + (alignment - size % alignment) % alignment; - } +/** Computes a number of tensors inside of a tensor family. + * + * @return a number of tensors. + * */ +template +constexpr size_t numFamilyMembers() { + return sizeof(T::Size) / sizeof(T::Size[0]); +} +/** Computes the next closest aligned memory address for a provided relative + * address. + * + * @param size a pointer address as integer. + * @param alignment a size of a vector register. + * @return the next closest aligned relative address. + * */ +template +constexpr size_t alignedUpper(int_t size, size_t alignment) { + return size + (alignment - size % alignment) % alignment; +} - /** Computes a number of real number which fits into a vector register. - * - * NOTE: a size of real number depends of floating number representation i.e. double or float. - * - * @param alignment a size of a vector register in bytes - * @return number of real numbers inside of a vector register - * */ - template - constexpr size_t alignedReals(size_t alignment) { - return alignment / sizeof(float_t); - } +/** Computes a number of real number which fits into a vector register. + * + * NOTE: a size of real number depends of floating number representation i.e. + * double or float. 
+ * + * @param alignment a size of a vector register in bytes + * @return number of real numbers inside of a vector register + * */ +template +constexpr size_t alignedReals(size_t alignment) { + return alignment / sizeof(float_t); +} +/** Computes a size occupied by a tensor family including data alignment b/w + * tensors in terms of real numbers. + * + * NOTE: recursive function. + * + * @param alignedReals number of real numbers inside of a vector register. + * @param n a tensor index inside of a tensor family. + * @return a size of a tensor family + * */ +template +constexpr size_t computeFamilySize(size_t alignedReals = 1, size_t n = numFamilyMembers()) { + return n == 0 ? 0 + : alignedUpper(T::Size[n - 1], alignedReals) + + computeFamilySize(alignedReals, n - 1); +} - /** Computes a size occupied by a tensor family including data alignment b/w tensors in terms of real numbers. - * - * NOTE: recursive function. - * - * @param alignedReals number of real numbers inside of a vector register. - * @param n a tensor index inside of a tensor family. - * @return a size of a tensor family - * */ - template - constexpr size_t computeFamilySize(size_t alignedReals = 1, size_t n = numFamilyMembers()) { - return n == 0 ? 0 : alignedUpper(T::Size[n-1], alignedReals) + computeFamilySize(alignedReals, n-1); +template +class CopyManager { + public: + /** Copies data from a tensor to a given memory chunk. + * + * NOTE: The function shifts and aligns a pointer w.r.t. to a given vector + * register size. + * + * @param mem an address to a chunk of memory. + * NOTE: the address is going to be incremented every time + * when new information is written. + * @param alignment a size of a vector register (in bytes). + * @param ptr. + * @param alignment. 
+ * */ + template + void copyTensorToMemAndSetPtr(float_t*& mem, float_t*& ptr, size_t alignment = 1) { + ptr = mem; + copyValuesToMem(mem, T::Values, T::Values + T::Size, alignment); + } + + /** Copies data from tensors from a tensor family to a given memory chunk. + * + * NOTE: The function writes the actual address (where aligned tensor data + * stored) back to a tensor family + * + * @param container a reference to a container which contains tensor family + * data. + * @param mem an address to an allocated chunk of memory. + * NOTE: the address is going to be incremented every time + * when new information is written. + * @param alignment a size of a vector register (in bytes). + * */ + template + void copyFamilyToMemAndSetPtr(float_t*& mem, + typename T::template Container& container, + size_t alignment = 1) { + + // determine the size of the container, i.e. the number of tensors that it holds + size_t n = sizeof(T::Size) / sizeof(T::Size[0]); + + for (size_t i = 0; i < n; ++i) { + // init pointer of each tensor to the allocated memory + container.data[i] = mem; + + // copy values and shift pointer + copyValuesToMem(mem, T::Values[i], T::Values[i] + T::Size[i], alignment); } - - - template - class CopyManager { - public: - - /** Copies data from a tensor to a given memory chunk. - * - * NOTE: The function shifts and aligns a pointer w.r.t. to a given vector register size. - * - * @param mem an address to a chunk of memory. - * NOTE: the address is going to be incremented every time - * when new information is written. - * @param alignment a size of a vector register (in bytes). - * @param ptr. - * @param alignment. - * */ - template - void copyTensorToMemAndSetPtr(float_t*& mem, float_t*& ptr, size_t alignment = 1) { - ptr = mem; - copyValuesToMem(mem, T::Values, T::Values + T::Size, alignment); - } - - - /** Copies data from tensors from a tensor family to a given memory chunk. 
- * - * NOTE: The function writes the actual address (where aligned tensor data stored) - * back to a tensor family - * - * @param container a reference to a container which contains tensor family data. - * @param mem an address to an allocated chunk of memory. - * NOTE: the address is going to be incremented every time - * when new information is written. - * @param alignment a size of a vector register (in bytes). - * */ - template - void copyFamilyToMemAndSetPtr(float_t*& mem, - typename T::template Container& container, - size_t alignment = 1) { - - // determine a size of the container i.e a number of tensor that it holds - size_t n = sizeof(T::Size) / sizeof(T::Size[0]); - - for (size_t i = 0; i < n; ++i) { - // init pointer of each tensor to the allocated memeory - container.data[i] = mem; - - // copy values and shift pointer - copyValuesToMem(mem, T::Values[i], T::Values[i] + T::Size[i], alignment); - } - } - - protected: - /** Copies a tensor to a given memory chunk, and shifts a given pointer. - * - * NOTE: The function shifts and aligns a pointer w.r.t. to a given vector register size. - * - * @param mem an address to a chunk of memory. - * NOTE: the address is going to be incremented every time - * when new information is written. - * @param first a pointer to the beginning of tensor data. - * @param last a pointer to the end of tensor data. - * @param alignment a size of a vector register (in bytes). - * */ - virtual void copyValuesToMem(float_t*& mem, float_t const* first, float_t const* last, size_t alignment) { - - // copy data - mem = copier.copy(first, last, mem); - - // shift pointer - mem += (alignedUpper(reinterpret_cast(mem), alignment) - reinterpret_cast(mem)) / sizeof(float_t); - assert(reinterpret_cast(mem) % alignment == 0); - } - - private: - CopyPolicyT copier{}; - }; - - template using DefaultCopyManager = CopyManager>; -} + } + + protected: + /** Copies a tensor to a given memory chunk, and shifts a given pointer. 
+ * + * NOTE: The function shifts and aligns a pointer w.r.t. to a given vector + * register size. + * + * @param mem an address to a chunk of memory. + * NOTE: the address is going to be incremented every time + * when new information is written. + * @param first a pointer to the beginning of tensor data. + * @param last a pointer to the end of tensor data. + * @param alignment a size of a vector register (in bytes). + * */ + virtual void + copyValuesToMem(float_t*& mem, const float_t* first, const float_t* last, size_t alignment) { + + // copy data + mem = copier.copy(first, last, mem); + + // shift pointer + mem += (alignedUpper(reinterpret_cast(mem), alignment) - + reinterpret_cast(mem)) / + sizeof(float_t); + assert(reinterpret_cast(mem) % alignment == 0); + } + + private: + CopyPolicyT copier{}; +}; + +template +using DefaultCopyManager = CopyManager>; +} // namespace yateto #endif diff --git a/include/yateto/LinearAllocator.h b/include/yateto/LinearAllocator.h index d3621ab..a0c41fd 100644 --- a/include/yateto/LinearAllocator.h +++ b/include/yateto/LinearAllocator.h @@ -5,31 +5,31 @@ #include namespace yateto { -template +template struct LinearAllocatorT { -public: + public: void initialize(T* ptr) { - isInit = true; - userSpaceMem = ptr; + isInit = true; + userSpaceMem = ptr; } T* allocate(size_t size) { - assert(isInit && "YATETO: Temporary-Memory manager hasn't been initialized"); - int currentByteCount = byteCount; - byteCount += size; - return &userSpaceMem[currentByteCount]; + assert(isInit && "YATETO: Temporary-Memory manager hasn't been initialized"); + int currentByteCount = byteCount; + byteCount += size; + return &userSpaceMem[currentByteCount]; } void free() { - isInit = false; - byteCount = 0; - userSpaceMem = nullptr; + isInit = false; + byteCount = 0; + userSpaceMem = nullptr; } -private: + private: size_t byteCount{0}; bool isInit{false}; - T *userSpaceMem{nullptr}; + T* userSpaceMem{nullptr}; }; -} // yateto -#endif // 
YATETO_LINEAR_ALLOCATED_H_ \ No newline at end of file +} // namespace yateto +#endif // YATETO_LINEAR_ALLOCATED_H_ diff --git a/include/yateto/Misc.h b/include/yateto/Misc.h index 8362b2b..3913c82 100644 --- a/include/yateto/Misc.h +++ b/include/yateto/Misc.h @@ -5,14 +5,13 @@ namespace yateto { -template +template auto getMaxTmpMemRequired(KernelType& krnl) { return KernelType::TmpMaxMemRequiredInBytes; } -template -auto getMaxTmpMemRequired(KernelType& krnl, - OtherKernelTypes&... otherKrnls) { +template +auto getMaxTmpMemRequired(KernelType& krnl, OtherKernelTypes&... otherKrnls) { auto currentTmpMem = KernelType::TmpMaxMemRequiredInBytes; auto otherTmpMem = getMaxTmpMemRequired(otherKrnls...); return (currentTmpMem > otherTmpMem) ? currentTmpMem : otherTmpMem; @@ -28,6 +27,6 @@ constexpr size_t leadDim() noexcept { return dimSize(); } -} // yateto +} // namespace yateto #endif // YATETO_MISC_H_ diff --git a/include/yateto/TensorView.h b/include/yateto/TensorView.h index 73ddc86..573172e 100644 --- a/include/yateto/TensorView.h +++ b/include/yateto/TensorView.h @@ -1,432 +1,432 @@ #ifndef YATETO_MATRIXVIEW_H_ #define YATETO_MATRIXVIEW_H_ +#include #include #include -#include #include #include #include namespace yateto { - template - class slice { +template +class slice { public: - explicit slice(uint_t start = 0, uint_t stop = std::numeric_limits::max()) - : start(start), stop(stop) - {} - - uint_t start; - uint_t stop; - }; - - template - struct count_slices : std::integral_constant {}; - template - struct count_slices : std::integral_constant>) ? 1 : 0) + count_slices::value> {}; - - template - class TensorView { + explicit slice(uint_t start = 0, uint_t stop = std::numeric_limits::max()) + : start(start), stop(stop) {} + + uint_t start; + uint_t stop; +}; + +template +struct count_slices : std::integral_constant {}; +template +struct count_slices + : std::integral_constant>) ? 
1 : 0) + + count_slices::value> {}; + +template +class TensorView { public: - explicit TensorView(std::initializer_list shape) { - std::copy(shape.begin(), shape.end(), m_shape); - } + explicit TensorView(std::initializer_list shape) { + std::copy(shape.begin(), shape.end(), m_shape); + } - explicit TensorView(uint_t const shape[]) { - for (uint_t d = 0; d < Dim; ++d) { - m_shape[d] = shape[d]; - } - } - - static constexpr uint_t dim() { - return Dim; + explicit TensorView(const uint_t shape[]) { + for (uint_t d = 0; d < Dim; ++d) { + m_shape[d] = shape[d]; } + } - uint_t shape(uint_t dim) const { - return m_shape[dim]; - } + static constexpr uint_t dim() { return Dim; } + + uint_t shape(uint_t dim) const { return m_shape[dim]; } protected: - uint_t m_shape[Dim]; - }; + uint_t m_shape[Dim]; +}; - template - class TensorView<0, real_t, uint_t> { +template +class TensorView<0, real_t, uint_t> { public: - explicit TensorView(std::initializer_list shape) {} + explicit TensorView(std::initializer_list shape) {} - explicit TensorView(uint_t const shape[]) {} - - static constexpr uint_t dim() { - return 0; - } + explicit TensorView(const uint_t shape[]) {} - uint_t shape(uint_t dim) const { - return 0; - } - }; + static constexpr uint_t dim() { return 0; } + + uint_t shape(uint_t dim) const { return 0; } +}; - template - class DenseTensorView : public TensorView { +template +class DenseTensorView : public TensorView { public: - using data_t = std::conditional_t; - using dataref_t = std::conditional_t; + using data_t = std::conditional_t; + using dataref_t = std::conditional_t; - explicit DenseTensorView(data_t values, std::initializer_list shape, std::initializer_list start, std::initializer_list stop) + explicit DenseTensorView(data_t values, + std::initializer_list shape, + std::initializer_list start, + std::initializer_list stop) : TensorView(shape), m_values(values) { - std::copy(start.begin(), start.end(), m_start); - std::copy(stop.begin(), stop.end(), m_stop); - 
computeStride(); - } + std::copy(start.begin(), start.end(), m_start); + std::copy(stop.begin(), stop.end(), m_stop); + computeStride(); + } - explicit DenseTensorView(data_t values, std::initializer_list shape) + explicit DenseTensorView(data_t values, std::initializer_list shape) : TensorView(shape), m_values(values), m_start{} { - std::copy(shape.begin(), shape.end(), m_stop); - computeStride(); - } - - explicit DenseTensorView(data_t values, uint_t const shape[], uint_t const start[], uint_t const stop[]) + std::copy(shape.begin(), shape.end(), m_stop); + computeStride(); + } + + explicit DenseTensorView(data_t values, + const uint_t shape[], + const uint_t start[], + const uint_t stop[]) : TensorView(shape), m_values(values) { - for (uint_t d = 0; d < Dim; ++d) { - m_start[d] = start[d]; - m_stop[d] = stop[d]; - } - computeStride(); + for (uint_t d = 0; d < Dim; ++d) { + m_start[d] = start[d]; + m_stop[d] = stop[d]; } + computeStride(); + } - explicit DenseTensorView(data_t values, uint_t const shape[]) - : TensorView(shape), m_values(values), m_start{} { - for (uint_t d = 0; d < Dim; ++d) { - m_stop[d] = shape[d]; - } - computeStride(); - } - - explicit DenseTensorView(data_t values, uint_t const shape[], uint_t const stride[]) + explicit DenseTensorView(data_t values, const uint_t shape[]) : TensorView(shape), m_values(values), m_start{} { - for (uint_t d = 0; d < Dim; ++d) { - m_stop[d] = shape[d]; - m_stride[d] = stride[d]; - } - } - - uint_t size() const { - return (m_stop[Dim-1]-m_start[Dim-1]) * m_stride[Dim-1]; - } - - void setZero() { - uint_t entry[Dim]; - std::copy(m_start, m_start + Dim, entry); - while (entry[Dim-1] != m_stop[Dim-1]) { - auto values = &operator[](entry); - for (uint_t i = 0.0; i < m_stop[0]-m_start[0]; ++i) { - values[i*m_stride[0]] = 0.0; - } - if (Dim == 1) { - break; - } - - uint_t d = 0; - do { - entry[d] = m_start[d]; - d++; - ++entry[d]; - } while (entry[d] == m_stop[d] && d < Dim-1); - } - } - - template - bool 
isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head) const { - return static_cast(head) >= start[dim] && static_cast(head) < stop[dim]; - } - - template - bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head, Tail... tail) const { - return static_cast(head) >= start[dim] - && static_cast(head) < stop[dim] - && isInRange(start, stop, dim+1, tail...); - } - - template - bool isInRange(Entry... entry) const { - static_assert(sizeof...(entry) == Dim, - "Number of arguments to isInRange(...) does not match the tensor dimension."); - return isInRange(m_start, m_stop, 0, entry...); + for (uint_t d = 0; d < Dim; ++d) { + m_stop[d] = shape[d]; } + computeStride(); + } - template - dataref_t operator()(Entry... entry) { - static_assert(sizeof...(entry) == Dim, - "Number of arguments to operator() does not match the tensor dimension."); - assert(isInRange(entry...)); - return m_values[address(entry...)]; + explicit DenseTensorView(data_t values, const uint_t shape[], const uint_t stride[]) + : TensorView(shape), m_values(values), m_start{} { + for (uint_t d = 0; d < Dim; ++d) { + m_stop[d] = shape[d]; + m_stride[d] = stride[d]; } + } - template - const real_t& operator()(Entry... 
entry) const { - static_assert(sizeof...(entry) == Dim, - "Number of arguments to operator() const does not match the tensor dimension."); - assert(isInRange(entry...)); - return m_values[address(entry...)]; - } + uint_t size() const { return (m_stop[Dim - 1] - m_start[Dim - 1]) * m_stride[Dim - 1]; } - const real_t& operator[](uint_t const entry[Dim]) const { - uint_t addr = 0; - for (uint_t d = 0; d < Dim; ++d) { - assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]); - addr += (entry[d] - m_start[d]) * m_stride[d]; + void setZero() { + uint_t entry[Dim]; + std::copy(m_start, m_start + Dim, entry); + while (entry[Dim - 1] != m_stop[Dim - 1]) { + auto values = &operator[](entry); + for (uint_t i = 0.0; i < m_stop[0] - m_start[0]; ++i) { + values[i * m_stride[0]] = 0.0; } - return m_values[addr]; - } - - dataref_t operator[](uint_t const entry[Dim]) { - uint_t addr = 0; - for (uint_t d = 0; d < Dim; ++d) { - assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]); - addr += (entry[d] - m_start[d]) * m_stride[d]; + if (Dim == 1) { + break; } - return m_values[addr]; - } - template - void copyToView(view_t& other) const { - assert(Dim == other.dim()); - - uint_t entry[Dim]; - for (uint_t d = 0; d < Dim; ++d) { - assert(this->shape(d) == other.shape(d)); + uint_t d = 0; + do { entry[d] = m_start[d]; + d++; + ++entry[d]; + } while (entry[d] == m_stop[d] && d < Dim - 1); + } + } + + template + bool isInRange(const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head) const { + return static_cast(head) >= start[dim] && static_cast(head) < stop[dim]; + } + + template + bool isInRange( + const uint_t start[Dim], const uint_t stop[Dim], int dim, Head head, Tail... tail) const { + return static_cast(head) >= start[dim] && static_cast(head) < stop[dim] && + isInRange(start, stop, dim + 1, tail...); + } + + template + bool isInRange(Entry... entry) const { + static_assert(sizeof...(entry) == Dim, + "Number of arguments to isInRange(...) 
does not match the " + "tensor dimension."); + return isInRange(m_start, m_stop, 0, entry...); + } + + template + dataref_t operator()(Entry... entry) { + static_assert(sizeof...(entry) == Dim, + "Number of arguments to operator() does not match the tensor " + "dimension."); + assert(isInRange(entry...)); + return m_values[address(entry...)]; + } + + template + const real_t& operator()(Entry... entry) const { + static_assert(sizeof...(entry) == Dim, + "Number of arguments to operator() const does not match the " + "tensor dimension."); + assert(isInRange(entry...)); + return m_values[address(entry...)]; + } + + const real_t& operator[](const uint_t entry[Dim]) const { + uint_t addr = 0; + for (uint_t d = 0; d < Dim; ++d) { + assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]); + addr += (entry[d] - m_start[d]) * m_stride[d]; + } + return m_values[addr]; + } + + dataref_t operator[](const uint_t entry[Dim]) { + uint_t addr = 0; + for (uint_t d = 0; d < Dim; ++d) { + assert(entry[d] >= m_start[d] && entry[d] < m_stop[d]); + addr += (entry[d] - m_start[d]) * m_stride[d]; + } + return m_values[addr]; + } + + template + void copyToView(view_t& other) const { + assert(Dim == other.dim()); + + uint_t entry[Dim]; + for (uint_t d = 0; d < Dim; ++d) { + assert(this->shape(d) == other.shape(d)); + entry[d] = m_start[d]; + } + + uint_t stop0 = std::min(m_stop[0], this->shape(0)); + data_t val = m_values; + while (entry[Dim - 1] != m_stop[Dim - 1]) { + for (uint_t i = m_start[0]; i < stop0; ++i) { + entry[0] = i; + other[entry] = *(val++); } - - uint_t stop0 = std::min(m_stop[0], this->shape(0)); - data_t val = m_values; - while (entry[Dim-1] != m_stop[Dim-1]) { - for (uint_t i = m_start[0]; i < stop0; ++i) { - entry[0] = i; - other[entry] = *(val++); - } - val += (m_stop[0]-stop0); - - if (Dim == 1) { - break; - } - - uint_t d = 0; - do { - entry[d] = m_start[d]; - d++; - ++entry[d]; - } while (entry[d] == m_stop[d] && d < Dim-1); - } - } - - template - auto 
subtensor(Entry... entry) -> DenseTensorView::value, real_t, uint_t, Const> { - static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); - constexpr auto nSlices = count_slices::value; - uint_t begin[Dim]; - uint_t size[nSlices]; - uint_t stride[nSlices]; - extractSubtensor(begin, size, stride, entry...); - DenseTensorView subtensor(&operator[](begin), size, stride); - return subtensor; - } + val += (m_stop[0] - stop0); - template - auto subtensor(Entry... entry) const -> DenseTensorView::value, real_t, uint_t, true> { - static_assert(sizeof...(entry) == Dim, "Number of arguments to subtensor() does not match tensor dimension."); - constexpr auto nSlices = count_slices::value; - uint_t begin[Dim]; - uint_t size[nSlices]; - uint_t stride[nSlices]; - extractSubtensor(begin, size, stride, entry...); - DenseTensorView subtensor(&operator[](begin), size, stride); - return subtensor; - } - - data_t data() { - return m_values; - } - - const real_t* data() const { - return m_values; - } - - protected: - void computeStride() { - m_stride[0] = 1; - for (uint_t d = 0; d < Dim-1; ++d) { - m_stride[d+1] = m_stride[d] * (m_stop[d] - m_start[d]); + if (Dim == 1) { + break; } - } - - template - uint_t address(Head head) const { - assert(static_cast(head) >= m_start[Dim-1] && static_cast(head) < m_stop[Dim-1]); - return (head - m_start[Dim-1]) * m_stride[Dim-1]; - } - - template - uint_t address(Head head, Tail... 
tail) const { - uint_t const d = (Dim-1) - sizeof...(tail); - assert(static_cast(head) >= m_start[d] && static_cast(head) < m_stop[d]); - return (head - m_start[d]) * m_stride[d] + address(tail...); - } - - template, int> = 0> - void extractDim(uint_t*& begin, uint_t*&, uint_t*&, uint_t dimNo, T entry) const { - assert(static_cast(entry) >= m_start[dimNo] && static_cast(entry) < m_stop[dimNo]); - *begin++ = entry; - } - - template>, int> = 0> - void extractDim(uint_t*& begin, uint_t*& size, uint_t*& stride, uint_t dimNo, T dim) const { - *begin = std::max(m_start[dimNo], dim.start); - *size++ = std::min(m_stop[dimNo], dim.stop) - *begin; - ++begin; - *stride++ = m_stride[dimNo]; - } - - template - void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head) const { - extractDim(begin, size, stride, Dim-1, head); - } - - template - void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head, Tail... tail) const { - uint_t const d = (Dim-1) - sizeof...(tail); - extractDim(begin, size, stride, d, head); - extractSubtensor(begin, size, stride, tail...); - } - data_t m_values; - uint_t m_start[Dim]; - uint_t m_stop[Dim]; - uint_t m_stride[Dim]; - }; + uint_t d = 0; + do { + entry[d] = m_start[d]; + d++; + ++entry[d]; + } while (entry[d] == m_stop[d] && d < Dim - 1); + } + } + + template + auto subtensor(Entry... entry) + -> DenseTensorView::value, real_t, uint_t, Const> { + static_assert(sizeof...(entry) == Dim, + "Number of arguments to subtensor() does not match tensor dimension."); + constexpr auto nSlices = count_slices::value; + uint_t begin[Dim]; + uint_t size[nSlices]; + uint_t stride[nSlices]; + extractSubtensor(begin, size, stride, entry...); + DenseTensorView subtensor(&operator[](begin), size, stride); + return subtensor; + } + + template + auto subtensor(Entry... 
entry) const + -> DenseTensorView::value, real_t, uint_t, true> { + static_assert(sizeof...(entry) == Dim, + "Number of arguments to subtensor() does not match tensor dimension."); + constexpr auto nSlices = count_slices::value; + uint_t begin[Dim]; + uint_t size[nSlices]; + uint_t stride[nSlices]; + extractSubtensor(begin, size, stride, entry...); + DenseTensorView subtensor(&operator[](begin), size, stride); + return subtensor; + } + + data_t data() { return m_values; } + + const real_t* data() const { return m_values; } - template - class DenseTensorView<0,real_t,uint_t,Const> : public TensorView<0, real_t, uint_t> { + protected: + void computeStride() { + m_stride[0] = 1; + for (uint_t d = 0; d < Dim - 1; ++d) { + m_stride[d + 1] = m_stride[d] * (m_stop[d] - m_start[d]); + } + } + + template + uint_t address(Head head) const { + assert(static_cast(head) >= m_start[Dim - 1] && + static_cast(head) < m_stop[Dim - 1]); + return (head - m_start[Dim - 1]) * m_stride[Dim - 1]; + } + + template + uint_t address(Head head, Tail... tail) const { + const uint_t d = (Dim - 1) - sizeof...(tail); + assert(static_cast(head) >= m_start[d] && static_cast(head) < m_stop[d]); + return (head - m_start[d]) * m_stride[d] + address(tail...); + } + + template , int> = 0> + void extractDim(uint_t*& begin, uint_t*&, uint_t*&, uint_t dimNo, T entry) const { + assert(static_cast(entry) >= m_start[dimNo] && + static_cast(entry) < m_stop[dimNo]); + *begin++ = entry; + } + + template >, int> = 0> + void extractDim(uint_t*& begin, uint_t*& size, uint_t*& stride, uint_t dimNo, T dim) const { + *begin = std::max(m_start[dimNo], dim.start); + *size++ = std::min(m_stop[dimNo], dim.stop) - *begin; + ++begin; + *stride++ = m_stride[dimNo]; + } + + template + void extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head) const { + extractDim(begin, size, stride, Dim - 1, head); + } + + template + void + extractSubtensor(uint_t* begin, uint_t* size, uint_t* stride, Head head, Tail... 
tail) const { + const uint_t d = (Dim - 1) - sizeof...(tail); + extractDim(begin, size, stride, d, head); + extractSubtensor(begin, size, stride, tail...); + } + + data_t m_values; + uint_t m_start[Dim]; + uint_t m_stop[Dim]; + uint_t m_stride[Dim]; +}; + +template +class DenseTensorView<0, real_t, uint_t, Const> : public TensorView<0, real_t, uint_t> { public: - using data_t = std::conditional_t; - using dataref_t = std::conditional_t; + using data_t = std::conditional_t; + using dataref_t = std::conditional_t; - explicit DenseTensorView(data_t values, std::initializer_list shape, std::initializer_list start, std::initializer_list stop) - : TensorView<0, real_t, uint_t>(shape), m_values(values) { - } + explicit DenseTensorView(data_t values, + std::initializer_list shape, + std::initializer_list start, + std::initializer_list stop) + : TensorView<0, real_t, uint_t>(shape), m_values(values) {} - uint_t size() const { - return 1; - } + uint_t size() const { return 1; } - void setZero() { - m_values[0] = 0.0; - } + void setZero() { m_values[0] = 0.0; } - template - void copyToView(view_t& other) { - other.m_values[0] = m_values[0]; - } + template + void copyToView(view_t& other) { + other.m_values[0] = m_values[0]; + } protected: - data_t m_values; - }; + data_t m_values; +}; - template - class CSCMatrixView : public TensorView<2, real_t, uint_t> { +template +class CSCMatrixView : public TensorView<2, real_t, uint_t> { public: - using data_t = std::conditional_t; - using dataref_t = std::conditional_t; + using data_t = std::conditional_t; + using dataref_t = std::conditional_t; - explicit CSCMatrixView(data_t values, std::initializer_list shape, uint_t const* rowInd, uint_t const* colPtr) + explicit CSCMatrixView(data_t values, + std::initializer_list shape, + const uint_t* rowInd, + const uint_t* colPtr) : TensorView<2, real_t, uint_t>(shape), m_values(values), m_rowInd(rowInd), m_colPtr(colPtr) { - } + } - explicit CSCMatrixView(data_t values, uint_t const shape[], 
uint_t const* rowInd, uint_t const* colPtr) + explicit CSCMatrixView(data_t values, + const uint_t shape[], + const uint_t* rowInd, + const uint_t* colPtr) : TensorView<2, real_t, uint_t>(shape), m_values(values), m_rowInd(rowInd), m_colPtr(colPtr) { - } + } - uint_t size() const { - return m_colPtr[ this->shape(1) ]; - } + uint_t size() const { return m_colPtr[this->shape(1)]; } - void setZero() { - memset(m_values, 0, size() * sizeof(real_t)); - } + void setZero() { memset(m_values, 0, size() * sizeof(real_t)); } - const real_t& operator()(uint_t row, uint_t col) const { - assert(col >= 0 && col < this->shape(1)); - uint_t addr = m_colPtr[ col ]; - uint_t stop = m_colPtr[ col+1 ]; - while (addr < stop) { - if (m_rowInd[addr] == row) { - break; - } - ++addr; + const real_t& operator()(uint_t row, uint_t col) const { + assert(col >= 0 && col < this->shape(1)); + uint_t addr = m_colPtr[col]; + uint_t stop = m_colPtr[col + 1]; + while (addr < stop) { + if (m_rowInd[addr] == row) { + break; } - assert(addr != stop); - - return m_values[addr]; + ++addr; } + assert(addr != stop); - dataref_t operator()(uint_t row, uint_t col) { - assert(col >= 0 && col < this->shape(1)); - uint_t addr = m_colPtr[ col ]; - uint_t stop = m_colPtr[ col+1 ]; - while (addr < stop) { - if (m_rowInd[addr] == row) { - break; - } - ++addr; - } - assert(addr != stop); + return m_values[addr]; + } - return m_values[addr]; + dataref_t operator()(uint_t row, uint_t col) { + assert(col >= 0 && col < this->shape(1)); + uint_t addr = m_colPtr[col]; + uint_t stop = m_colPtr[col + 1]; + while (addr < stop) { + if (m_rowInd[addr] == row) { + break; + } + ++addr; } + assert(addr != stop); - bool isInRange(uint_t row, uint_t col) const { - assert(col >= 0 && col < this->shape(1)); - uint_t addr = m_colPtr[ col ]; - uint_t stop = m_colPtr[ col+1 ]; - while (addr < stop) { - if (m_rowInd[addr] == row) { - return true; - } - ++addr; - } + return m_values[addr]; + } - return false; + bool isInRange(uint_t row, 
uint_t col) const { + assert(col >= 0 && col < this->shape(1)); + uint_t addr = m_colPtr[col]; + uint_t stop = m_colPtr[col + 1]; + while (addr < stop) { + if (m_rowInd[addr] == row) { + return true; + } + ++addr; } - dataref_t operator[](const uint_t entry[2]) { - return operator()(entry[0], entry[1]); - } + return false; + } - const real_t& operator[](const uint_t entry[2]) const { - return operator()(entry[0], entry[1]); - } + dataref_t operator[](const uint_t entry[2]) { return operator()(entry[0], entry[1]); } + + const real_t& operator[](const uint_t entry[2]) const { return operator()(entry[0], entry[1]); } + + template + void copyToView(view_t& other) { + assert(2 == other.dim()); + assert(this->shape(0) == other.shape(0) && this->shape(1) == other.shape(1)); - template - void copyToView(view_t& other) { - assert(2 == other.dim()); - assert(this->shape(0) == other.shape(0) && this->shape(1) == other.shape(1)); - - uint_t entry[2]; - uint_t ncols = this->shape(1); - for (uint_t col = 0; col < ncols; ++col) { - entry[1] = col; - for (uint_t i = m_colPtr[col]; i < m_colPtr[col+1]; ++i) { - entry[0] = m_rowInd[i]; - other[entry] = m_values[i]; - } + uint_t entry[2]; + uint_t ncols = this->shape(1); + for (uint_t col = 0; col < ncols; ++col) { + entry[1] = col; + for (uint_t i = m_colPtr[col]; i < m_colPtr[col + 1]; ++i) { + entry[0] = m_rowInd[i]; + other[entry] = m_values[i]; } } + } protected: - data_t m_values; - uint_t const* m_rowInd; - uint_t const* m_colPtr; - }; -} + data_t m_values; + const uint_t* m_rowInd; + const uint_t* m_colPtr; +}; +} // namespace yateto #endif diff --git a/tests/Dockerfile-setup b/tests/Dockerfile-setup index 19d26fb..f5987ad 100644 --- a/tests/Dockerfile-setup +++ b/tests/Dockerfile-setup @@ -11,7 +11,7 @@ ARG GID=1000 RUN addgroup --gid $GID tester RUN adduser --disabled-password --gecos '' --uid $UID --gid $GID tester -RUN chown $UID:$GID /local_workspace /cache +RUN chown $UID:$GID /local_workspace /cache USER tester # copy 
repo from the local current directory (fetched with Jenkins) to the workdir of the image diff --git a/tests/Jenkinsfile b/tests/Jenkinsfile index 811d349..2a4ca99 100644 --- a/tests/Jenkinsfile +++ b/tests/Jenkinsfile @@ -1,27 +1,27 @@ properties([ parameters([string( - defaultValue: 'runner', - description: 'agent name which tells where to run a job', + defaultValue: 'runner', + description: 'agent name which tells where to run a job', name: 'AGENT', trim: true), string( - defaultValue: '', - description: 'target architecture (according to yateto format). If not given then taken from Jenkins env-vars', - name: 'ARCH', + defaultValue: '', + description: 'target architecture (according to yateto format). If not given then taken from Jenkins env-vars', + name: 'ARCH', trim: true), string( - defaultValue: 'matmul minimal', - description: 'whitespace separate list of examples', - name: 'EXAMPLES', + defaultValue: 'matmul minimal', + description: 'whitespace separate list of examples', + name: 'EXAMPLES', trim: true), string( - defaultValue: 'Eigen LIBXSMM OpenBLAS', - description: 'whitespace separate list of generators', - name: 'GENERATORS', + defaultValue: 'Eigen LIBXSMM OpenBLAS', + description: 'whitespace separate list of generators', + name: 'GENERATORS', trim: true), booleanParam( - defaultValue: false, - description: 'if true the environment image will be build. Note: it will take a considerable amount of time', + defaultValue: false, + description: 'if true the environment image will be build. Note: it will take a considerable amount of time', name: 'BUILD_ENV_IMAGE') ]) ]) @@ -52,12 +52,12 @@ pipeline { } } steps { - // Make sure that Jenkins knows the location of Spack. + // Make sure that Jenkins knows the location of Spack. 
// You will need to add it to the Jenkins settings dir("tests") { script { - withCredentials([usernamePassword(credentialsId: 'docker-hub', - usernameVariable: 'USERNAME', + withCredentials([usernamePassword(credentialsId: 'docker-hub', + usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { sh """ docker run --rm -v \$(pwd):/home -w /home ${USERNAME}/spack-ubuntu-1804:latest containerize > ./Dockerfile-env @@ -79,10 +79,10 @@ pipeline { steps { script { def dockerFilePath = "tests/Dockerfile-setup" - def buildParams = String.format("--no-cache --build-arg UID=%s --build-arg GID=%s --file %s .", + def buildParams = String.format("--no-cache --build-arg UID=%s --build-arg GID=%s --file %s .", env.USER_ID, env.GROUP_ID, dockerFilePath) def customImage = docker.build("yateto:latest", buildParams) - } + } } } stage('RunTest') { @@ -105,15 +105,15 @@ for example in ${EXAMPLES}; do echo " Host Arch: ${ARCH}" echo " Example: ${example}" echo "===========================" - + cmake .. -DEXAMPLES=$example -DCMAKE_BUILD_TYPE=$build_type -DPRECISION=$precision -DVARIANT=$backend -DARCH=$ARCH make - + STORAGE=/cache/$example-$backend-$precision-$build_type echo $STORAGE mkdir -p $STORAGE cp -r ./$example/*/* $STORAGE - + make test rm -rf ./* done @@ -129,11 +129,11 @@ cmake .. && make && make test writeFile(file: "run_tests.sh", text: TestScript) } sh "mkdir ./cache" - + script { // define test arch. for testing // if the user specifies ARCH as parameter it is going to be used for testing - // if not, we will try to get ARCH from the Jenkins env. variables + // if not, we will try to get ARCH from the Jenkins env. variables // if the user didn't set env.HOST_ARCH in his/her Jenkins settings, then 'noarch' will be used env.TEST_ARCH="noarch" if (!env.ARCH.allWhitespace) { @@ -147,9 +147,9 @@ cmake .. 
&& make && make test } } } - post { + post { always { sh "docker image rm yateto:latest" } } -} \ No newline at end of file +} diff --git a/tests/README.md b/tests/README.md index 0b4bc93..b949b4a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,45 +1,49 @@ -## Testing +# Testing + Testing is divided into 3 parts: interface, generic and code-gen. -- *interface* - yateto comes with some helper structures that -external projects can use. The structures are defined -in the *\*/**include** directory and allow a user to copy data from one -tensor to another if they have the same dimensionality but -different sizes. For example, a target tensor may be padded along the leading dimension -to achieve efficient vectorization. This part of the testing is supposed to +- *interface* - yateto comes with some helper structures that +external projects can use. The structures are defined +in the *\*/**include** directory and allow +a user to copy data from one +tensor to another if they have the same dimensionality but +different sizes. For example, a target tensor may be padded along the leading dimension +to achieve efficient vectorization. This part of the testing is supposed to check whether the provided structures operate correctly. -- *generic* - yateto generates an optimized tensor contraction source code based on the -*Loop over GEMM* algorithm. To achieve better performance, the generated source code -contains calls to optimized GEMM libraries and generators. However, yateto can -also generate not-optimized source code using simple *for-loops* +- *generic* - yateto generates an optimized tensor +contraction source code based on the +*Loop over GEMM* algorithm. To achieve better performance, the generated source code +contains calls to optimized GEMM libraries and generators. However, yateto can +also generate not-optimized source code using simple *for-loops* which can be used either for performance comparison or for testing with respect -to an optimized one. 
This part of the testing is supposed to check whether the +to an optimized one. This part of the testing is supposed to check whether the *generic* implementation of tensor contraction is numerically correct. - - *code-gen* - checks wither an optimized tensor contraction code produces the same numerical results as the *generic* implementation. -The testing is supposed to only be performed with **GNU** tools and, therefore, +The testing is supposed to only be performed with **GNU** tools and, therefore, no platform specific libraries (*like intel MKL*) are included. Moreover, only the -following architectures are supported for testing: *sandy bridge, haswell, skylake, -ThunderX2*. +following architectures are supported for testing: *sandy bridge, haswell, skylake, +ThunderX2*. In order to compile tests, make sure that you have **CxxTest** installed and visible in you current working environment. ## Current status + | Part | Status | |:---------:|:---------------:| -| interface | 1 test | +| interface | 1 test | | generic | not implemented | | code-gen | 2 tests | - ## Running tests manually + ### Interface -```console + +```bash cd mkdir interface/build && cd interface/build cmake .. make @@ -47,29 +51,39 @@ ctest ``` ### Code-gen -Code-gen allows a user to test yateto with one of the following GEMM libraries/generators: Eigen, OpenBLAS, LIBXSMM. Make sure that you have them installed on your system and visible in your current working environment. -##### CMake options +Code-gen allows a user to test yateto with one of the +following GEMM libraries/generators: Eigen, OpenBLAS, +LIBXSMM. Make sure that you have them installed on your +system and visible in your current working environment. 
+ +#### CMake options + | CMake Variable Name | Type | Allowed Values | |:-------------------:|:------:|:------------------------------:| -| ARCH | string | snb / hsw / skx / thunderx2t99 | -| EXAMPLES | list | matmul / minimal / matmult;minimal | -| PRECISION | string | double / single | -| VARIANT | string | Eigen / OpenBLAS / LIBXSMM | +| ARCH | string | snb / hsw / skx / thunderx2t99 | +| EXAMPLES | list | matmul / minimal / matmult;minimal | +| PRECISION | string | double / single | +| VARIANT | string | Eigen / OpenBLAS / LIBXSMM / PSpaMM | + +#### Default -##### Default -Uses: **haswell** architecture, **matmul** and **minimal** as examples, **Eigen** +Uses: **haswell** architecture, **matmul** and **minimal** as examples, **Eigen** as a GEMM implementation, **double** precision. -```console + +```bash cd mkdir code-gen/build && cd code-gen/build cmake .. make ctest ``` -##### A Specific Example -For **haswell** architecture with **single** precision and **libxsmm** + +#### A Specific Example + +For **haswell** architecture with **single** precision and **libxsmm** as a GEMM generator. -```console + +```bash cd mkdir code-gen/build && cd code-gen/build cmake .. -DPRECISION=single -DVARIANT=LIBXSMM make @@ -77,12 +91,8 @@ ctest ``` ## Running tests automatically -The following [pipeline](Jenkinsfile) has been implemented to run the aforementioned tests automatically. As a regular user, you can see results of the last few runs of the pipeline [here](http://vmbungartz10.informatik.tu-muenchen.de/seissol/view/Yateto/job/yateto-codegen/). - -You can trigger the pipeline and thus run all tests if you a member of SeisSol in github. To achive this, please, perform the following steps: -- open this [page](http://vmbungartz10.informatik.tu-muenchen.de/seissol/view/Yateto/job/yateto-codegen/) -- click on `log in` button at the top right corner and follow the authentication procedure -- click on `Build with Parameters` button. 
You will be forwarded to the next page where you can adjust parameters. We do not recommend to make any changes in `AGENT` and `BUILD_ENV_IMAGE` fields -- click on `Build` to trigger the pipeline. -- After that, you will see a new flashing entry at the very top of `Build History` field. If you want to see a detail status information about all steps involved in the pipeline then click on a dropdown widget of the flashing entry and select `Console Output` \ No newline at end of file +See the `.github/workflows` folder for a +file which runs the tests automatically. +The local `Jenkinsfile` exists as well, +but is probably outdated as of now (early 2026). diff --git a/tests/code-gen/cmake/FindLIBXSMM.cmake b/tests/code-gen/cmake/FindLIBXSMM.cmake index ced8a8b..10a0870 100644 --- a/tests/code-gen/cmake/FindLIBXSMM.cmake +++ b/tests/code-gen/cmake/FindLIBXSMM.cmake @@ -34,4 +34,4 @@ else() endif() mark_as_advanced(LIBXSMM_INCLUDE_DIRS LIBXSMM_LIBRARIES) -find_package_handle_standard_args(LIBXSMM REQUIRED_VARS LIBXSMM_INCLUDE_DIRS LIBXSMM_LIBRARIES) \ No newline at end of file +find_package_handle_standard_args(LIBXSMM REQUIRED_VARS LIBXSMM_INCLUDE_DIRS LIBXSMM_LIBRARIES) diff --git a/tests/code-gen/cmake/FindLibxsmm_executable.cmake b/tests/code-gen/cmake/FindLibxsmm_executable.cmake index a3bb50b..00fdf7c 100644 --- a/tests/code-gen/cmake/FindLibxsmm_executable.cmake +++ b/tests/code-gen/cmake/FindLibxsmm_executable.cmake @@ -24,4 +24,4 @@ find_program(Libxsmm_executable_PROGRAM libxsmm_gemm_generator ) find_package_handle_standard_args(Libxsmm_executable - REQUIRED_VARS Libxsmm_executable_PROGRAM) \ No newline at end of file + REQUIRED_VARS Libxsmm_executable_PROGRAM) diff --git a/tests/code-gen/generator.py b/tests/code-gen/generator.py index 4cd21c3..2ec8f0a 100755 --- a/tests/code-gen/generator.py +++ b/tests/code-gen/generator.py @@ -62,4 +62,4 @@ for kernel in g.kernels(): d = os.path.join(outDir, kernel.name) os.makedirs(d, exist_ok=True) - 
PrintEquivalentSparsityPatterns(d).visit(kernel.ast) \ No newline at end of file + PrintEquivalentSparsityPatterns(d).visit(kernel.ast) diff --git a/tests/code-gen/stock.py b/tests/code-gen/stock.py index c50e644..e530d05 100755 --- a/tests/code-gen/stock.py +++ b/tests/code-gen/stock.py @@ -41,4 +41,3 @@ def add(g): stock = R['ijk'] <= S['xyz'] * XLTP['lx'] * XRTP['il'] * YL['ym'] * YR['mj'] * ZL['zn'] * ZR['nk'] g.add('stock{}_trans_pad'.format(pqx), stock) - diff --git a/tests/interface/CMakeLists.txt b/tests/interface/CMakeLists.txt index acadc58..80f00ad 100644 --- a/tests/interface/CMakeLists.txt +++ b/tests/interface/CMakeLists.txt @@ -9,7 +9,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) enable_testing() # generate and add an interface test - add_custom_command(COMMAND ${CXXTEST_PYTHON_TESTGEN_EXECUTABLE} + add_custom_command(COMMAND ${CXXTEST_PYTHON_TESTGEN_EXECUTABLE} --error-printer -o TensorView.t.cpp ${CMAKE_SOURCE_DIR}/TensorView.t.h OUTPUT TensorView.t.cpp COMMENT "Generating a test runner") diff --git a/tests/interface/TensorView.t.h b/tests/interface/TensorView.t.h index 6487c93..215bd0e 100644 --- a/tests/interface/TensorView.t.h +++ b/tests/interface/TensorView.t.h @@ -3,57 +3,52 @@ using namespace yateto; -class TensorViewTestSuite : public CxxTest::TestSuite -{ -private: +class TensorViewTestSuite : public CxxTest::TestSuite { + private: double m_data[12]; -public: - void setUp() - { + public: + void setUp() { for (int i = 0; i < 12; ++i) { - m_data[i] = static_cast(i+1); + m_data[i] = static_cast(i + 1); } } - void testAccess() - { - DenseTensorView<3, double> tensor(m_data, {3,2,2}); - TS_ASSERT_EQUALS(tensor(0,0,0), 1.0); - TS_ASSERT_EQUALS(tensor(1,1,0), 5.0); - TS_ASSERT_EQUALS(tensor(2,1,1), 12.0); + void testAccess() { + DenseTensorView<3, double> tensor(m_data, {3, 2, 2}); + TS_ASSERT_EQUALS(tensor(0, 0, 0), 1.0); + TS_ASSERT_EQUALS(tensor(1, 1, 0), 5.0); + TS_ASSERT_EQUALS(tensor(2, 1, 1), 12.0); } - void testSubtensor() - { - DenseTensorView<3, 
double> tensor(m_data, {3,2,2}); + void testSubtensor() { + DenseTensorView<3, double> tensor(m_data, {3, 2, 2}); auto sub = tensor.subtensor(1, slice<>(), slice<>()); - TS_ASSERT_EQUALS(sub(0,0), 2.0); - TS_ASSERT_EQUALS(sub(1,0), 5.0); - TS_ASSERT_EQUALS(sub(0,1), 8.0); - TS_ASSERT_EQUALS(sub(1,1), 11.0); + TS_ASSERT_EQUALS(sub(0, 0), 2.0); + TS_ASSERT_EQUALS(sub(1, 0), 5.0); + TS_ASSERT_EQUALS(sub(0, 1), 8.0); + TS_ASSERT_EQUALS(sub(1, 1), 11.0); auto sub2 = sub.subtensor(1, slice<>()); TS_ASSERT_EQUALS(sub2(0), 5.0); TS_ASSERT_EQUALS(sub2(1), 11.0); - auto sub3 = tensor.subtensor(slice<>(1,3), slice<>(), slice<>()); - TS_ASSERT_EQUALS(sub3(0,0,0), 2.0); - TS_ASSERT_EQUALS(sub3(0,1,0), 5.0); - TS_ASSERT_EQUALS(sub3(1,0,1), 9.0); - TS_ASSERT_EQUALS(sub3(1,1,1), 12.0); - } + auto sub3 = tensor.subtensor(slice<>(1, 3), slice<>(), slice<>()); + TS_ASSERT_EQUALS(sub3(0, 0, 0), 2.0); + TS_ASSERT_EQUALS(sub3(0, 1, 0), 5.0); + TS_ASSERT_EQUALS(sub3(1, 0, 1), 9.0); + TS_ASSERT_EQUALS(sub3(1, 1, 1), 12.0); + } - void testSetZero() - { - DenseTensorView<3, double> tensor(m_data, {3,2,2}); + void testSetZero() { + DenseTensorView<3, double> tensor(m_data, {3, 2, 2}); auto sub = tensor.subtensor(1, slice<>(), slice<>()); sub.setZero(); for (int i = 0; i < 12; ++i) { - if ((i-1) % 3 == 0) { + if ((i - 1) % 3 == 0) { TS_ASSERT_EQUALS(m_data[i], 0.0); } else { - TS_ASSERT_EQUALS(m_data[i], static_cast(i+1)); + TS_ASSERT_EQUALS(m_data[i], static_cast(i + 1)); } } } diff --git a/tests/spack.yaml b/tests/spack.yaml index 9e8d72a..0b328cc 100644 --- a/tests/spack.yaml +++ b/tests/spack.yaml @@ -1,11 +1,11 @@ spack: definitions: - - packages: + - packages: - eigen@3.3.7 - openblas@0.3.12 - libxsmm+generator+shared@master - cmake@3.16.0 - + specs: - matrix: - [\$packages] diff --git a/yateto/arch.py b/yateto/arch.py index 07b1b2b..ff99cbc 100644 --- a/yateto/arch.py +++ b/yateto/arch.py @@ -114,7 +114,7 @@ def formatConstant(self, constant): def onHeap(self, numReals): return 
(numReals * self.bytesPerReal) > self._tmpStackLimit - + def __eq__(self, other): return self.name == other.name @@ -194,7 +194,7 @@ def getHeterogeneousArchitectureIdentifiedBy(host_arch, device_arch, device_back if device_arch.startswith('sm_'): alignment = 64 - elif device_arch.startswith('gfx'): + elif device_arch.startswith('gfx'): alignment = 128 elif re.match(r"\d+_\d+_\d+", device_arch): alignment = 32 @@ -230,10 +230,10 @@ def deriveArchitecture(host_def: HostArchDefinition, device_def: Union[DeviceArc alignment = host_def.alignment if host_def.prefetch is not None: prefetch = host_def.prefetch - + if alignment is None: raise NotImplementedError(f'The architecture {host_def.archname} is unknown to Yateto, and no custom alignment was given') - + if prefetch is None: raise NotImplementedError(f'The architecture {host_def.archname} is unknown to Yateto, and no custom prefetching info was given') diff --git a/yateto/aspp.py b/yateto/aspp.py index 2b10d09..e2e21b5 100644 --- a/yateto/aspp.py +++ b/yateto/aspp.py @@ -41,7 +41,7 @@ def reshape(self, shape): @abstractmethod def transposed(self, shape): pass - + @abstractmethod def broadcast(self, shape): pass @@ -77,7 +77,7 @@ def reshape(self, shape): def transposed(self, perm): return type(self)(tuple(self.shape[p] for p in perm)) - + def broadcast(self, bcst): return type(self)(tuple(shp * bc for shp, bc in zip(self.shape, bcst))) @@ -99,7 +99,7 @@ def einsum(description, a1, a2): sz1 = {i: a1.shape[A.find(i)] for i in A} sz2 = {i: a2.shape[B.find(i)] for i in B} intersect = filter(lambda x: x in sz1, sz2.keys()) - assert all([sz1[i] == sz2[i] for i in intersect]) + assert all([sz1[i] == sz2[i] for i in intersect]) sz1.update(sz2) return dense(tuple(sz1[i] for i in C)) else: @@ -166,7 +166,7 @@ def reshape(self, shape): def transposed(self, perm): return type(self)(self.pattern.transpose(perm).copy(order=self.NUMPY_DEFAULT_ORDER)) - + def broadcast(self, bcst): return type(self)(np.tile(self.pattern, 
bcst).copy(order=self.NUMPY_DEFAULT_ORDER)) diff --git a/yateto/ast/__init__.py b/yateto/ast/__init__.py index 8b13789..e69de29 100644 --- a/yateto/ast/__init__.py +++ b/yateto/ast/__init__.py @@ -1 +0,0 @@ - diff --git a/yateto/ast/cost.py b/yateto/ast/cost.py index 7d6d83b..3d5dbfe 100644 --- a/yateto/ast/cost.py +++ b/yateto/ast/cost.py @@ -25,7 +25,7 @@ def estimate_Product(self, node): for size in node.shape(): cost *= size return cost - + def estimate_IndexSum(self, node): cost = node.sumIndex().shape()[0] - 1 for size in node.indices.shape(): @@ -36,7 +36,7 @@ def estimate_IndexSum(self, node): class CachedCostEstimator(CostEstimator): def __init__(self): self._cost = dict() - + def estimate(self, node): if node in self._cost: return self._cost[node] @@ -95,7 +95,7 @@ def __init__(self): super().__init__() self._lead_dim = 0 self._loaded_to_gpu_cache = {} - + def generic_estimate(self, node): result = super().generic_estimate(node) self._loaded_to_gpu_cache[node] = set() @@ -171,14 +171,14 @@ def __init__(self): def generic_estimate(self, node): self._cache[node] = node.eqspp() return 0 - + def estimate_Product(self, node): spp = node.computeSparsityPattern(self._cache[node.leftTerm()], self._cache[node.rightTerm()]) self._cache[node] = spp return spp.count_nonzero() - + def estimate_IndexSum(self, node): termSpp = self._cache[node.term()] spp = node.computeSparsityPattern(termSpp) - self._cache[node] = spp + self._cache[node] = spp return termSpp.count_nonzero() - spp.count_nonzero() diff --git a/yateto/ast/indices.py b/yateto/ast/indices.py index 6f2ab02..7ac7ed6 100644 --- a/yateto/ast/indices.py +++ b/yateto/ast/indices.py @@ -6,72 +6,72 @@ class Indices(object): def __init__(self, indexNames = '', shape = ()): self._indices = tuple(indexNames) self._size = dict() - + assert len(self._indices) == len(set(self._indices)), 'Repeated indices are not allowed ({}).'.format(indexNames) assert len(self._indices) == len(shape), 'Indices {} do not match tensor 
shape {}.'.format(str(self), shape) self._size = {self._indices[i]: size for i, size in enumerate(shape)} - + def tostring(self): return ''.join(self._indices) - + def extract(self, indexNames): return Indices(str(indexNames), self.subShape(indexNames)) - + def firstIndex(self): return self.extract(self._indices[0]) def shape(self): return self.subShape(self._indices) - + def subShape(self, indexNames): return tuple(self._size[index] for index in indexNames) def indexSize(self, index): return self._size[index] - + def permuted(self, indexNames): assert set(indexNames) == set(self) return Indices(indexNames, self.subShape(indexNames)) - + def find(self, index): assert len(index) == 1 return self._indices.index(index) - + def positions(self, I, sort=True): pos = [self.find(i) for i in I] if sort: return sorted(pos) return pos - + def __eq__(self, other): return other != None and self._indices == other._indices and self._size == other._size - + def __ne__(self, other): return other == None or self._indices != other._indices or self._size != other._size - + def __hash__(self): return hash((self._indices, self.shape())) - + def __iter__(self): return iter(self._indices) - + def __getitem__(self, key): return self._indices[key] - + def __len__(self): return len(self._indices) - + def __and__(self, other): return set(self) & set(other) - + def __rand__(self, other): return self & other - + def __le__(self, other): indexNamesContained = set(self._indices) <= set(other._indices) return indexNamesContained and all(self._size[index] == other._size[index] for index in self._indices) - + def __sub__(self, other): indexNames = [index for index in self._indices if index not in other] return Indices(indexNames, self.subShape(indexNames)) @@ -80,7 +80,7 @@ def merged(self, other): indexNames = self._indices + other._indices shape = self.subShape(self._indices) + other.subShape(other._indices) return Indices(indexNames, shape) - + def mergeStrict(self, other): indexNames = 
list(self._indices) shape = list(self.subShape(self._indices)) @@ -94,17 +94,17 @@ def mergeStrict(self, other): myShp = shape[myPos] assert myShp == shp, f"Index merge failed. {self} vs. {other} in {idx}: {myShp} vs. {shp}" return Indices(tuple(indexNames), tuple(shape)) - + def sorted(self): indexNames = sorted(self._indices) return Indices(indexNames, self.subShape(indexNames)) - + def __str__(self): return self.tostring() - + def __repr__(self): return '({})'.format(','.join('{}={}'.format(index, self._size[index]) for index in self._indices)) - + def size(self): return self._size @@ -112,31 +112,31 @@ class Range(object): def __init__(self, start, stop): self.start = start self.stop = stop - + def size(self): return self.stop - self.start - + def aligned(self, arch): return Range(arch.alignedLower(self.start), arch.alignedUpper(self.stop)) - + def __and__(self, other): return Range(max(self.start, other.start), min(self.stop, other.stop)) def __or__(self, other): return Range(min(self.start, other.start), max(self.stop, other.stop)) - + def __contains__(self, other): return self.start <= other.start and self.stop >= other.stop - + def __eq__(self, other): return self.start == other.start and self.stop == other.stop - + def __str__(self): return 'Range({}, {})'.format(self.start, self.stop) - + def __iter__(self): return iter(range(self.start, self.stop)) - + class BoundingBox(object): def __init__(self, listOfRanges): self._box = listOfRanges @@ -150,7 +150,7 @@ def size(self): for r in self._box: s *= r.size() return s - + def __contains__(self, entry): if len(entry) != len(self): return False @@ -159,24 +159,24 @@ def __contains__(self, entry): if isinstance(entry[0], Range): return all(e in self[i] for i,e in enumerate(entry)) return all(e >= self[i].start and e <= self[i].stop for i,e in enumerate(entry)) - + def __getitem__(self, key): return self._box[key] - + def __len__(self): return len(self._box) - + def __iter__(self): return iter(self._box) - + def 
__eq__(self, other): return all(s == o for s,o in zip(self,other)) - + def __str__(self): return '{}({})'.format(type(self).__name__, ', '.join(str(r) for r in self)) @functools.total_ordering -class LoGCost(object): +class LoGCost(object): def __init__(self, stride = sys.maxsize, leftTranspose = sys.maxsize, rightTranspose = sys.maxsize, fusedIndices = 0): """ stride (w.r.t. first dimension): 0 = unit stride, 1 non-unit stride (lower is better) @@ -187,15 +187,15 @@ def __init__(self, stride = sys.maxsize, leftTranspose = sys.maxsize, rightTrans self._leftTranspose = leftTranspose self._rightTranspose = rightTranspose self._fusedIndices = fusedIndices - + @staticmethod def addIdentity(): return LoGCost(0, 0, 0, 0) - + def _totuple(self): # minus sign before _fusedIndices as higher is better return (self._stride, self._leftTranspose + self._rightTranspose, -self._fusedIndices) - + def __lt__(self, other): s = self._totuple() o = other._totuple() @@ -205,9 +205,9 @@ def __lt__(self, other): def __eq__(self, other): return self._totuple() == other._totuple() and self._leftTranspose == other._leftTranspose - + def __add__(self, other): return LoGCost(self._stride + other._stride, self._leftTranspose + other._leftTranspose, self._rightTranspose + other._rightTranspose, self._fusedIndices + other._fusedIndices) - + def __repr__(self): return '{{stride: {}, left transpose: {}, right transpose: {}, fused indices: {}}}'.format(self._stride, self._leftTranspose, self._rightTranspose, self._fusedIndices) diff --git a/yateto/ast/log.py b/yateto/ast/log.py index c48f201..1f397fd 100644 --- a/yateto/ast/log.py +++ b/yateto/ast/log.py @@ -19,14 +19,14 @@ def fusedVariants(memLayout, I, P, M, prune = False): D = set(s for g in groupStrings for s in allSubstrings(g)) if prune: D = set(d for d in D if d[0] == M[0]) - D = set(d for d in D if memLayout.mayFuse(sorted(P[i] for i in d))) + D = set(d for d in D if memLayout.mayFuse(sorted(P[i] for i in d))) return D def LoG(contraction, 
Aperm = None, Bperm = None, Cperm = None): L = contraction.leftTerm() R = contraction.rightTerm() I = contraction - + if Aperm is not None: L = copy.copy(L) L.setIndexPermutation(Aperm, permuteEqspp=False) @@ -53,7 +53,7 @@ def LoG(contraction, Aperm = None, Bperm = None, Cperm = None): Im = (set(A) & set(C)) - Icommon In = (set(B) & set(C)) - Icommon Ik = (set(A) & set(B)) - Icommon - + PA = {idx: pos for pos, idx in enumerate(A)} PB = {idx: pos for pos, idx in enumerate(B)} PC = {idx: pos for pos, idx in enumerate(C)} @@ -64,7 +64,7 @@ def LoG(contraction, Aperm = None, Bperm = None, Cperm = None): AK = fusedVariants(L.memoryLayout(), Ik, PA, A) BK = fusedVariants(R.memoryLayout(), Ik, PB, B) BN = fusedVariants(R.memoryLayout(), In, PB, B) - + MC = CM & AM NC = CN & BN KC = AK & BK diff --git a/yateto/ast/node.py b/yateto/ast/node.py index 951ccfc..435ca9f 100644 --- a/yateto/ast/node.py +++ b/yateto/ast/node.py @@ -11,38 +11,38 @@ def __init__(self): self._children = [] self._eqspp = None self.prefetch = None - + def size(self): return self.indices.size() - + def shape(self): return self.indices.shape() - + @abstractmethod def nonZeroFlops(self): pass def __iter__(self): return iter(self._children) - + def __getitem__(self, key): return self._children[key] - + def __len__(self): return len(self._children) - + def setChildren(self, children): self._children = children def eqspp(self): return self._eqspp - + def setEqspp(self, spp): self._eqspp = spp def boundingBox(self): return BoundingBox.fromSpp(self._eqspp) - + @abstractmethod def memoryLayout(self): pass @@ -63,11 +63,11 @@ def setIndexPermutation(self, indices, permuteEqspp=True): def permute(self, indices, spp, strict=True): perm = tuple(indices.find(idx) for idx in self.indices if idx in indices or strict) return spp.transposed(perm) - + def reshape(self, indices, spp): rshp = [indices.indexSize(idx) if idx in indices else 1 for idx in self.indices] return spp.reshape(rshp) - + def broadcast(self, 
indices, spp): reshaped = self.reshape(indices, spp) bcst = [1 if idx in indices else self.indices.indexSize(idx) for idx in self.indices] @@ -102,34 +102,34 @@ def __mul__(self, other): other.setTerm(self * other.term()) return other return self._binOp(other, Einsum) - + def __rmul__(self, other): return self.__mul__(other) - + def __add__(self, other): if not isinstance(other, Node): raise ValueError('Unsupported operation: Cannot add {} to {}.'.format(self, other)) return self._binOp(other, Add) - + def __radd__(self, other): return self.__add__(other) - + def __neg__(self): self._checkMultipleScalarMults() return ScalarMultiplication(-1.0, self) def __sub__(self, other): return self._binOp(-other, Add) - + def __le__(self, other): return Assign(self, other) - + def subslice(self, index, start, end): return SliceView(self, index, start, end) - + def subselect(self, index, position): return SliceView(self, index, position, position + 1) - + def viewed(self): return self @@ -140,35 +140,35 @@ def __init__(self, subnode, index, start, end): self.index = index self.start = start self.end = end - + def name(self): return self.term().name() - + def viewed(self): return self.term().viewed() - + def term(self): return self[0] - + def nonZeroFlops(self): return 0 - + def setIndexPermutation(self, indices, permuteEqspp=True): assert str(indices) == str(self.indices) def memoryLayout(self): return self._memoryLayout - + def getMemoryLayout(self, memoryLayout): return memoryLayout.subslice(list(self.indices).index(self.index), self.start, self.end) def computeMemoryLayout(self): self._memoryLayout = self.getMemoryLayout(self.term().memoryLayout()) - + def computeSparsityPattern(self, *spps): assert len(spps) in (0, 1) spp = spps[0] if len(spps) == 1 else self.term().eqspp() - + if isinstance(spp, aspp.dense): nowshape = spp.shape subshape = tuple(self.end - self.start if self.indices[i] == self.index else nowshape[i] for i in range(spp.ndim)) @@ -177,7 +177,7 @@ def 
computeSparsityPattern(self, *spps): subslice = tuple(slice(self.start, self.end) if self.indices[i] == self.index else slice(None) for i in range(spp.ndim)) subarray = spp.as_ndarray()[subslice] return aspp.general(subarray) - + def __str__(self): return f'{type(self).__name__}[{self.index}: {self.start}..{self.end}]' @@ -186,19 +186,19 @@ def __init__(self, tensor, indexNames): super().__init__() self.tensor = tensor self.indices = Indices(indexNames, self.tensor.shape()) - + def nonZeroFlops(self): return 0 - + def setIndexPermutation(self, indices, permuteEqspp=True): assert str(indices) == str(self.indices) - + def spp(self, groupSpp=True): return self.tensor.spp(groupSpp) - + def name(self): return self.tensor.name() - + def memoryLayout(self): return self.tensor.memoryLayout() @@ -216,7 +216,7 @@ def __init__(self, *args): super().__init__() self._children = list(args) self._memoryLayout = None - + def memoryLayout(self): return self._memoryLayout @@ -255,17 +255,17 @@ def setIndexPermutation(self, indices, permuteEqspp=True): if self._memoryLayout is not None: self._memoryLayout = self._memoryLayout.permuted(p) self.indices = self.indices.permuted(indices) - + def __str__(self): return '{}[{}]'.format(type(self).__name__, self.indices if self.indices != None else '') - + def computeSparsityPattern(self, *spps): raise NotImplementedError class Einsum(Op): def nonZeroFlops(self): raise NotImplementedError - + class Add(Op): def computeSparsityPattern(self, *spps): if len(spps) == 0: @@ -276,7 +276,7 @@ def computeSparsityPattern(self, *spps): add_spp = permute_summand(i) spp = aspp.add(spp, add_spp) return spp - + def nonZeroFlops(self): nzFlops = 0 for child in self: @@ -296,7 +296,7 @@ def __init__(self, scalar, term): def fixedIndexPermutation(self): return self.term().fixedIndexPermutation() - + def setTerm(self, term): self._children[0] = term if self.fixedIndexPermutation(): @@ -312,7 +312,7 @@ def is_constant(self): def scalar(self): return 
self._scalar - + def computeSparsityPattern(self, *spps): if len(spps) == 0: return self.term().eqspp() @@ -323,20 +323,20 @@ def nonZeroFlops(self): if self._isConstant and self._scalar in [-1.0, 1.0]: return 0 return self.eqspp().count_nonzero() - + def __str__(self): return '{}: {}'.format(super().__str__(), str(self._scalar)) class BinOp(Op): def __init__(self, lTerm, rTerm): super().__init__(lTerm, rTerm) - + def leftTerm(self): return self._children[0] - + def rightTerm(self): return self._children[1] - + def setChildren(self, children): if len(children) != 2: raise ValueError('BinOp node must have exactly 2 children.') @@ -347,10 +347,10 @@ def setChildren(self, children): if not isinstance(children[0].viewed(), IndexedTensor): raise ValueError('First child of Assign node must be an IndexedTensor: ' + str(children[0].viewed())) super().setChildren(children) - + def nonZeroFlops(self): return 0 - + def computeSparsityPattern(self, *spps): spp = spps[1] if len(spps) == 2 else self.rightTerm().eqspp() return self.broadcast(self.rightTerm().indices, self.permute(self.rightTerm().indices, spp, False)) @@ -370,7 +370,7 @@ def computeSparsityPattern(self, *spps): assert len(spps) <= 1 spp = spps[0] if len(spps) == 1 else self.term().eqspp() return self.permute(self.term().indices, spp) - + @classmethod def subPermute(cls, term, indices): subIndexNames = [idx for idx in indices if idx in term.indices] @@ -408,10 +408,10 @@ def __init__(self, lTerm, rTerm): assert lTerm.indices.subShape(K) == rTerm.indices.subShape(K) self.indices = lTerm.indices.merged(rTerm.indices - K) - + def nonZeroFlops(self): return self.eqspp().count_nonzero() - + def computeSparsityPattern(self, *spps): if len(spps) == 0: spps = [node.eqspp() for node in self] @@ -423,13 +423,13 @@ def __init__(self, term, sumIndex): super().__init__(term) self.indices = term.indices - set([sumIndex]) self._sumIndex = term.indices.extract(sumIndex) - + def nonZeroFlops(self): return 
self.term().eqspp().count_nonzero() - self.eqspp().count_nonzero() - + def sumIndex(self): return self._sumIndex - + def computeSparsityPattern(self, *spps): assert len(spps) <= 1 spp = spps[0] if len(spps) == 1 else self.term().eqspp() @@ -446,7 +446,7 @@ def __init__(self, indices, lTerm, rTerm, sumIndices): def nonZeroFlops(self): raise NotImplementedError - + def computeSparsityPattern(self, *spps): if len(spps) == 0: spps = [node.eqspp() for node in self] @@ -486,13 +486,13 @@ def nonZeroFlops(self): p = Product(self.leftTerm(), self.rightTerm()) p.setEqspp( p.computeSparsityPattern() ) return 2*p.nonZeroFlops() - self.eqspp().count_nonzero() - + def computeSparsityPattern(self, *spps): if len(spps) == 0: spps = [node.eqspp() for node in self] assert len(spps) == 2 return _productContractionLoGSparsityPattern(self, *spps) - + def cost(self): A = self.leftTerm().indices B = self.rightTerm().indices @@ -500,13 +500,13 @@ def cost(self): BstrideOne = (B.find(self._k[0]) == 0) if not self._transB else (B.find(self._n[0]) == 0) cost = LoGCost(int(not AstrideOne) + int(not BstrideOne), int(self._transA), int(self._transB), len(self._m) + len(self._n) + len(self._k)) return cost - + def loopIndices(self): i1 = self.indices - (self._m + self._n) i2 = (self.leftTerm().indices - (self._m + self._k)) - i1 i3 = ((self.rightTerm().indices - (self._k + self._n)) - i1) - i2 return i1.merged(i2).merged(i3) - + def transA(self): return self._transA @@ -538,7 +538,7 @@ def indexString(name, fused, indices, transpose=False): if batchedIndices: indexStr = re.sub(r'([{}])'.format(''.join(batchedIndices)), r'[\1]', indexStr) return '{}{}_{{{}}}'.format(name, '^T' if transpose else '', indexStr) - + def __str__(self): Astr = self.indexString('A', [self._m, self._k], self.leftTerm().indices, self._transA) Bstr = self.indexString('B', [self._k, self._n], self.rightTerm().indices, self._transB) @@ -577,4 +577,4 @@ def nonZeroFlops(self): return nzFlops def is_empty(self): - return 
len(self._children) == 0 \ No newline at end of file + return len(self._children) == 0 diff --git a/yateto/ast/opt.py b/yateto/ast/opt.py index 54884f8..84034e1 100644 --- a/yateto/ast/opt.py +++ b/yateto/ast/opt.py @@ -3,11 +3,11 @@ def strengthReduction(terms, target_indices, cost_estimator, split = 0): n = len(terms) - + indexList = [index for term in terms for index in term.indices] uniqueIndices = set(indexList) summationIndices = set(index for index in uniqueIndices if indexList.count(index) == 1) - set(target_indices) - + while len(summationIndices) != 0: i = split while i < n: diff --git a/yateto/ast/transformer.py b/yateto/ast/transformer.py index 08c21ef..505e6e4 100644 --- a/yateto/ast/transformer.py +++ b/yateto/ast/transformer.py @@ -10,7 +10,7 @@ from .. import aspp # Similar as ast.NodeTransformer -class Transformer(Visitor): +class Transformer(Visitor): def generic_visit(self, node, **kwargs): newChildren = [self.visit(child, **kwargs) for child in node] node.setChildren(newChildren) @@ -20,7 +20,7 @@ class DeduceIndices(Transformer): def __init__(self, targetIndices: Union[str, Indices] = None): self._targetIndices = targetIndices self._indexSetVisitor = ComputeIndexSet() - + def visit(self, node, bound=None): forceIndices = bound is None and self._targetIndices is not None if bound is None: @@ -71,7 +71,7 @@ def visit_Einsum(self, node, bound): deduced = g - contractions node.indices = deduced.sorted() return node - + def visit_Add(self, node, bound): for child in node: self.visit(child, bound) @@ -97,7 +97,7 @@ def visit_ScalarMultiplication(self, node, bound): self.visit(node.term(), bound) node.indices = deepcopy(node.term().indices) return node - + def visit_SliceView(self, node, bound): self.visit(node.term(), bound) node.indices = Indices(node.term().indices, [shape if index != node.index else (node.end - node.start) for index, shape in zip(node.term().indices, node.term().shape())]) @@ -106,7 +106,7 @@ def visit_SliceView(self, node, 
bound): def visit_Assign(self, node, bound): lhs = node[0] rhs = node[1] - + lhsTensor = lhs.viewed() if not isinstance(lhsTensor, IndexedTensor): raise ValueError('Assign: Left-hand side must be of type IndexedTensor') @@ -213,12 +213,12 @@ def visit_ScalarMultiplication(self, node): self.generic_visit(node) node.setEqspp(node.term().eqspp()) return node - + def visit_Assign(self, node): self.generic_visit(node) node.setEqspp( node.computeSparsityPattern() ) return node - + def getEqspp(self, terms, targetIndices): # Shortcut if all terms have dense eqspps if all(term.eqspp().is_dense() for term in terms): @@ -230,19 +230,19 @@ def getEqspp(self, terms, targetIndices): minTree.setIndexPermutation(targetIndices) minTree = FindContractions().visit(minTree) return ComputeSparsityPattern(True).visit(minTree) - + def visit_Einsum(self, node): self.generic_visit(node) terms = list(node) node.setEqspp( self.getEqspp(terms, node.indices) ) - + for child in node: child.setEqspp( self.getEqspp(terms, child.indices) ) # TODO: Backtracking of equivalent sparsity pattern to children? 
return node - + def visit_SliceView(self, node): self.generic_visit(node) node.setEqspp(node.computeSparsityPattern()) @@ -263,6 +263,6 @@ def generic_visit(self, node): node.setEqspp( node.computeSparsityPattern() ) node.computeMemoryLayout() return node - + def visit_IndexedTensor(self, node): return node diff --git a/yateto/ast/visitor.py b/yateto/ast/visitor.py index 6b16221..c333d58 100644 --- a/yateto/ast/visitor.py +++ b/yateto/ast/visitor.py @@ -15,7 +15,7 @@ def visit(self, node, **kwargs): method = 'visit_' + node.__class__.__name__ visitor = getattr(self, method, self.generic_visit) return visitor(node, **kwargs) - + def generic_visit(self, node, **kwargs): for child in node: self.visit(child, **kwargs) @@ -37,7 +37,7 @@ def addIndent(string, indent): class PrettyPrinter(Visitor): def __init__(self): self._indent = 0 - + def generic_visit(self, node): print(' ' * self._indent + str(node)) self._indent = self._indent + 1 @@ -54,7 +54,7 @@ def generic_visit(self, node): else: spps = [self.visit(child) for child in node] return node.computeSparsityPattern(*spps) - + def visit_IndexedTensor(self, node): return node.eqspp() @@ -89,7 +89,7 @@ def findVariants(self, node): for child in node: permutationVariants.update( self.visit(child) ) return permutationVariants - + def variantsFixedRootPermutation(self, node, fixedPerm, permutationVariants): variants = dict() minCost = LoGCost.addIdentity() @@ -150,10 +150,10 @@ def visit_ScalarMultiplication(self, node): permutationVariants = self.visit(node.term()) permutationVariants[node] = {key: self.Variant(variant._cost, [key]) for key,variant in permutationVariants[node.term()].items()} return permutationVariants - + def visit_Product(self, node): return self.allPermutationsNoCostBinaryOp(node) - + def visit_IndexSum(self, node): permutationVariants = self.findVariants(node) tV = permutationVariants[node.term()] @@ -171,7 +171,7 @@ def visit_IndexSum(self, node): def visit_Contraction(self, node): 
permutationVariants = self.findVariants(node) - + variants = dict() iterator = itertools.permutations(node.indices) for Cs in iterator: @@ -228,7 +228,7 @@ def __init__(self, directory): self._directory = directory self._cmap = self.colors.ListedColormap(['white', 'black']) self._norm = self.colors.BoundaryNorm([0.0, 0.5, 1.0], 2, clip=True) - + def generic_visit(self, node): nameFun = getattr(node, 'name', None) name = nameFun() if nameFun else '_result' diff --git a/yateto/codegen/__init__.py b/yateto/codegen/__init__.py index 8b13789..e69de29 100644 --- a/yateto/codegen/__init__.py +++ b/yateto/codegen/__init__.py @@ -1 +0,0 @@ - diff --git a/yateto/codegen/cache.py b/yateto/codegen/cache.py index d2af0e2..8369ebb 100644 --- a/yateto/codegen/cache.py +++ b/yateto/codegen/cache.py @@ -3,14 +3,14 @@ class RoutineGenerator(object): def __call__(self, routineName, fileName): pass - + def target(self): return 'cpu' class GpuRoutineGenerator(object): def __call__(self, routineName, fileName): pass - + def target(self): return 'gpu' @@ -18,17 +18,17 @@ class RoutineCache(object): def __init__(self): self._routines = dict() self._generators = dict() - + def addRoutine(self, name, generator): if name in self._routines and not self._routines[name] == generator: raise RuntimeError(f'`{name}` is already in RoutineCache but the generator is not equal. 
' f'(That is, a name was given twice for different routines.)') self._routines[name] = generator - + generatorName = type(generator).__name__ if generatorName not in self._generators: self._generators[generatorName] = generator - + def generate(self, header, cppFileName, gpuFileName): with Cpp(gpuFileName) as gpucpp: with Cpp(cppFileName) as cpp: diff --git a/yateto/codegen/code.py b/yateto/codegen/code.py index da6f93c..31e7713 100644 --- a/yateto/codegen/code.py +++ b/yateto/codegen/code.py @@ -45,13 +45,13 @@ def __enter__(self): def __exit__(self, type, value, traceback): pass - + class Block: def __init__(self, writer, argument, foot = ''): self.writer = writer self.argument = argument self.foot = foot - + def __enter__(self): space = ' ' if self.argument else '' self.writer(self.argument + space + '{') @@ -69,36 +69,36 @@ def __init__(self, writer, arguments, foot=None): self.foot = [''] * len(self.arguments) else: self.foot = foot - + def __enter__(self): for arg in self.arguments: self.writer(arg + ' {') self.writer.indent += 1 - + def __exit__(self, type, value, traceback): # Blocks are closed in reverse order, thus reverse footer for arg, foot in zip(self.arguments, reversed(self.foot)): self.writer.indent -= 1 self.writer('}' + foot) - + class HeaderGuard: def __init__(self, writer, name): self.writer = writer self.name = name - + def __enter__(self): self.writer('#ifndef ' + self.name) self.writer('#define ' + self.name) def __exit__(self, type, value, traceback): self.writer('#endif') - + class PPIfBlock: def __init__(self, writer, name, typ): self.writer = writer self.name = name self.typ = typ - + def __enter__(self): self.writer('#{} {}'.format(self.typ, self.name)) @@ -109,33 +109,33 @@ class Cpp: def __init__(self, streamOrFileName = sys.stdout): self.fileHandle = streamOrFileName self.indent = 0 - + def __enter__(self): self.out = open(self.fileHandle, 'w+') if isinstance(self.fileHandle, str) else self.fileHandle return self - + def 
__exit__(self, type, value, traceback): if self.out is not sys.stdout: self.out.close() self.out = None - + def __call__(self, code): indentSpace = self.indent * ' ' for line in code.splitlines(): self.out.write(indentSpace + line + '\n') - + def emptyline(self): self.out.write('\n') - + def If(self, argument): return Block(self, 'if ({})'.format(argument)) - + def For(self, argument): return Block(self, 'for ({})'.format(argument)) def ForRange(self, variable, range): return self.For(f'int {variable} = {range.start}; {variable} < {range.end}; ++{variable}') - + def Namespace(self, name): if len(name) == 0: return NoScope() @@ -149,47 +149,47 @@ def Namespace(self, name): def AnonymousScope(self): return Block(self, '') - + def Function(self, name, arguments = '', returnType = 'void', const = False): if returnType: returnType += ' ' return Block(self, '{}{}({}){}'.format(returnType, name, arguments, ' const' if const else '')) - + def functionDeclaration(self, name, arguments = '', returnType = 'void'): return self.__call__('{} {}({});'.format(returnType, name, arguments)) def Class(self, name): return Block(self, 'class ' + name, foot=';') - + def classDeclaration(self, name): return self.__call__('class {};'.format(name)) - + def forwardStruct(self, name): self.__call__('struct {};'.format(name)) def Struct(self, name): return Block(self, 'struct ' + name, foot=';') - + def HeaderGuard(self, name): return HeaderGuard(self, name) - + def PPIfndef(self, name): return PPIfBlock(self, name, 'ifndef') - + def PPIf(self, name): return PPIfBlock(self, name, 'if') - + def label(self, name): self.indent -= 1 self.__call__(name + ':') self.indent += 1 - + def includeSys(self, header): self.__call__('#include <{}>'.format(header)) def include(self, header): self.__call__('#include "{}"'.format(header)) - + def includes(self, header_list): for header in header_list: self.include(header) diff --git a/yateto/codegen/common.py b/yateto/codegen/common.py index 8f36725..626bbb8 
100644 --- a/yateto/codegen/common.py +++ b/yateto/codegen/common.py @@ -27,7 +27,7 @@ def __init__(self, name, memoryLayout, eqspp, is_compute_constant=False, is_temp self.values = values self.datatype = datatype self.addressing = addressing - + @classmethod def fromNode(cls, name, node): return cls(name, node.memoryLayout(), node.eqspp()) @@ -83,7 +83,7 @@ def forLoops(cpp, indexNames, ranges, body, pragmaSimd=True, prefix='_', indexNo flops = forLoops(cpp, indexNames, ranges, body, pragmaSimd, prefix, indexNo-1) flops = flops * rng.size() return flops - + def loopRanges(term: IndexedTensorDescription, loopIndices): overlap = set(loopIndices) & set(term.indices) bbox = BoundingBox.fromSpp(term.eqspp) @@ -92,7 +92,7 @@ def loopRanges(term: IndexedTensorDescription, loopIndices): def testLoopRangesEqual(A, B): overlap = A.keys() & B.keys() return all([A[index] == B[index] for index in overlap]) - + def testLoopRangesAContainedInB(A, B): overlap = A.keys() & B.keys() return all([A[index] in B[index] for index in overlap]) @@ -188,7 +188,7 @@ def __init__(self, kernel: Function, arguments: list[TinytcKernelArgument | Tiny hasher = hashlib.sha512() hasher.update(self.source.encode('utf-8')) self.name = f'tinytc_wrapper_{hasher.hexdigest()}' - + self.wrapper_args = [f'long {BatchedOperationsAux.NUM_ELEMENTS_NAME}', f'void* {BatchedOperationsAux.STREAM_PTR_NAME}'] self.wrapper_call_args = [] self.call_args = [] @@ -207,7 +207,7 @@ def __init__(self, kernel: Function, arguments: list[TinytcKernelArgument | Tiny if not arg.constant: self.wrapper_call_args.append(BatchedOperationsAux.NUM_ELEMENTS_NAME) if not arg.temporary and not arg.constant: - offset_name = f'{BatchedOperationsAux.EXTRA_OFFSET_NAME}_{arg.name}' + offset_name = f'{BatchedOperationsAux.EXTRA_OFFSET_NAME}_{arg.name}' self.wrapper_args.append(f'long {offset_name}') self.wrapper_call_args.append(offset_name) self.call_args.append(f'{BatchedOperationsAux.EXTRA_OFFSET_NAME}_{arg.call_expr}') diff --git 
a/yateto/codegen/copyscaleadd/factory.py b/yateto/codegen/copyscaleadd/factory.py index 5e30735..8c936f6 100644 --- a/yateto/codegen/copyscaleadd/factory.py +++ b/yateto/codegen/copyscaleadd/factory.py @@ -21,7 +21,7 @@ def __init__(self, alpha, beta, result: IndexedTensorDescription, term: IndexedT self.beta = beta self.result = result self.term = term - + assert self.alpha != 0.0, 'copyscaleadd does not support alpha=0.0 at the moment.' assert self.beta == 1.0 or self.beta == 0.0, 'copyscaleadd supports only beta=0.0 or beta=1.0 at the moment.' @@ -37,7 +37,7 @@ def __init__(self, alpha, beta, result: IndexedTensorDescription, term: IndexedT for idx in rB: if idx not in rA: rAB[idx] = rB[idx] - + self.loopRanges = rAB diff --git a/yateto/codegen/copyscaleadd/generic.py b/yateto/codegen/copyscaleadd/generic.py index 44bf704..078199d 100644 --- a/yateto/codegen/copyscaleadd/generic.py +++ b/yateto/codegen/copyscaleadd/generic.py @@ -4,7 +4,7 @@ class Generic(object): def __init__(self, arch, descr): self._arch = arch self._descr = descr - + def _formatTerm(self, alpha, term): prefix = '' if alpha == 0.0: diff --git a/yateto/codegen/factory.py b/yateto/codegen/factory.py index 5999542..bff1654 100644 --- a/yateto/codegen/factory.py +++ b/yateto/codegen/factory.py @@ -13,12 +13,12 @@ def __init__(self, cpp, arch, target): self._arch = arch self._freeList = list() self._target = target - + def create(self, node, *args): method = 'create_' + node.__class__.__name__ factory = getattr(self, method, self.generic_create) return factory(node, *args) - + def generic_create(self, node, *args): raise NotImplementedError @@ -59,7 +59,7 @@ def temporary(self, bufname, size, iniZero=False, memory=list()): def allocateTemporary(self): return True - + def post_generate(self, routine_cache): pass @@ -118,7 +118,7 @@ def create_FusedGEMMs(self, node, result, arguments, add, scalar, prefetchName, description = fused_gemms.Description(node, result, arguments, add, scalar) generator = 
fused_gemms.generator(self._arch, description, gemm_cfg, self._target) return generator.generate(self._cpp, routineCache, gemm_cfg) - + def create_IndexSum(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg): assert len(arguments) == 1 description = indexsum.Description( @@ -129,7 +129,7 @@ def create_IndexSum(self, node, result, arguments, add, scalar, prefetchName, ro ) generator = indexsum.generator(self._arch, description, self._target) return generator.generate(self._cpp, routineCache) - + def create_Product(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg): assert len(arguments) == 2 description = product.Description( @@ -146,17 +146,17 @@ def create_Permute(self, node, result, arguments, add, scalar, prefetchName, rou result = IndexedTensorDescription.fromNode(result, node) term = IndexedTensorDescription.fromNode(arguments[0], node.term()) return self._csa(result, term, add, scalar, routineCache, gemm_cfg) - + def create_Broadcast(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg): result = IndexedTensorDescription.fromNode(result, node) term = IndexedTensorDescription.fromNode(arguments[0], node.term()) return self._csa(result, term, add, scalar, routineCache, gemm_cfg) - + def simple(self, result, term, add, scalar, routineCache, gemm_cfg): result = IndexedTensorDescription.fromVar(result, self._indices(result)) term = IndexedTensorDescription.fromVar(term, self._indices(term)) return self._csa(result, term, add, scalar, routineCache, gemm_cfg) - + def _csa(self, result, term, add, scalar, routineCache, gemm_cfg): description = copyscaleadd.Description( alpha = scalar, @@ -177,30 +177,30 @@ def __init__(self, cpp, arch, nameFun, testFramework): def _formatTerm(self, var, indices): address = var.memoryLayout().addressString(indices) return '{}[{}]'.format(self._name(var), address) - + def create_Einsum(self, node, result, arguments, add, scalar, prefetchName, 
routineCache, gemm_cfg): g = node.indices for child in node: g = g.merged(child.indices - g) - + ranges = {idx: Range(0, g.indexSize(idx)) for idx in g} - + resultTerm = self._formatTerm(result, node.indices) terms = [self._formatTerm(arguments[i], child.indices) for i,child in enumerate(node)] - + if scalar and scalar != 1.0: terms.insert(0, str(scalar)) - + if not add: self._cpp.memset(self._name(result), result.memoryLayout().requiredReals(), self._arch.typename) - + class EinsumBody(object): def __call__(s): self._cpp( '{} += {};'.format(resultTerm, ' * '.join(terms)) ) return len(terms) return forLoops(self._cpp, g, ranges, EinsumBody(), pragmaSimd=False) - + def create_ScalarMultiplication(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg): return self.simple(result, arguments[0], add, scalar, routineCache) @@ -209,7 +209,7 @@ def create_Permute(self, node, result, arguments, add, scalar, prefetchName, rou resultTerm = self._formatTerm(result, node.indices) termTerm = self._formatTerm(arguments[0], node.term().indices) return self._simpleBody(resultTerm, termTerm, add, scalar, node.indices) - + def create_Broadcast(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg): assert node.term().indices <= node.indices resultTerm = self._formatTerm(result, node.indices) @@ -282,10 +282,10 @@ class ExportGenerator: def __init__(self, arch): self.arch = arch - + def generate(self, cpp, cache): pass - + def add_linear_operation(self, dest, ops, target, permute, add): pass @@ -297,25 +297,25 @@ def makeFactory(cls, generator): def __init__(self, generator, cpp, arch, target): super().__init__(cpp, arch, target) self.generator = generator - + def post_generate(self, routine_cache): self.generator.generate(self._cpp, routine_cache) def allocateTemporary(self): return False - + def create_LoopOverGEMM(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg): assert len(arguments) == 2 
makeNode = IndexedTensorDescription.fromNode argnodes = [makeNode(arguments[0], node.leftTerm()), makeNode(arguments[1], node.rightTerm())] return self.handleLinear(makeNode(result, node), argnodes, add, scalar, node.transA(), node.transB()) - + def create_IndexSum(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg): assert len(arguments) == 1 makeNode = IndexedTensorDescription.fromNode argnodes = [makeNode(arguments[0], node.term())] return self.handleLinear(makeNode(result, node), argnodes, add, scalar, False, False) - + def create_Product(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg): assert len(arguments) == 2 makeNode = IndexedTensorDescription.fromNode @@ -329,7 +329,7 @@ def create_Permute(self, node, result, arguments, add, scalar, prefetchName, rou def create_Broadcast(self, node, result, arguments, add, scalar, prefetchName, routineCache, gemm_cfg): term = arguments[0] return self.handleLinear(IndexedTensorDescription.fromVar(result, node.indices), [IndexedTensorDescription.fromVar(term, node.term().indices)], add, scalar, False, False) - + def simple(self, result, term, add, scalar, routineCache, gemm_cfg): return self.handleLinear(IndexedTensorDescription.fromVar(result, self._indices(result)), [IndexedTensorDescription.fromVar(term, self._indices(term))], add, scalar, False, False) @@ -357,10 +357,10 @@ def handleLinear(self, dest, ops, add, scalar, transposeA, transposeB): # convert indices to loop numbers target, permute = self.getIndices(dest, ops) - + if not (scalar == 1 or scalar == 1.0): ops += [scalar] target += [[]] permute += [[]] - + return self.generator.add_linear_operation(dest, ops, target, permute, add) diff --git a/yateto/codegen/gemm/factory.py b/yateto/codegen/gemm/factory.py index 98c177c..0f38428 100644 --- a/yateto/codegen/gemm/factory.py +++ b/yateto/codegen/gemm/factory.py @@ -26,17 +26,17 @@ def __init__(self, self.alpha = alpha self.beta = beta self.prefetchName 
= prefetchName - + self.isACsc = self.leftTerm.memoryLayout.isCSC() self.isBCsc = self.rightTerm.memoryLayout.isCSC() - + bbA = BoundingBox.fromSpp(self.leftTerm.eqspp) bbB = BoundingBox.fromSpp(self.rightTerm.eqspp) bbC = BoundingBox.fromSpp(self.result.eqspp) - + kA = 1 if not transA else 0 kB = 0 if not transB else 1 - + k = bbA[kA] & bbB[kB] m = bbA[1-kA] n = bbB[1-kB] @@ -46,19 +46,19 @@ def __init__(self, self.alignedA = alignedStartA and not transA and self.leftTerm.memoryLayout.alignedStride() self.alignedC = alignedStartC and self.result.memoryLayout.alignedStride() - + if self.alignedA and self.alignedC: m = m.aligned(arch) else: mStartAligned = arch.checkAlignment(m.start) self.alignedA = self.alignedA & mStartAligned self.alignedC = self.alignedC & mStartAligned - + self._mnk = (m, n, k) def mnk(self): return self._mnk - + def setBeta(self, beta): self.beta = beta diff --git a/yateto/codegen/gemm/gemmgen.py b/yateto/codegen/gemm/gemmgen.py index 35b0b95..5510d18 100644 --- a/yateto/codegen/gemm/gemmgen.py +++ b/yateto/codegen/gemm/gemmgen.py @@ -86,14 +86,14 @@ def _pointer(self, term, offset2, transpose): if o > 0: return '{} + {}'.format(term.name, o) return term.name - + def generate(self, cpp, routineCache): d = self._descr m, n, k = d.mnk() ldA = 0 if d.isACsc else d.leftTerm.memoryLayout.stridei(1) ldB = 0 if d.isBCsc else d.rightTerm.memoryLayout.stridei(1) ldC = d.result.memoryLayout.stridei(1) - + assert (d.transA and (k,m) in d.leftTerm.memoryLayout) or (not d.transA and (m,k) in d.leftTerm.memoryLayout) assert (d.transB and (n,k) in d.rightTerm.memoryLayout) or (not d.transB and (k,n) in d.rightTerm.memoryLayout) assert (m,n) in d.result.memoryLayout @@ -109,7 +109,7 @@ def generate(self, cpp, routineCache): if d.isBCsc: sppB = d.rightTerm.memoryLayout.entries(k, n) sppBRows = d.rightTerm.memoryLayout.shape()[0] - + if d.isACsc and d.isBCsc: # count the flops by splitting into outer products (i.e. 
partition by k) # for each outer product, we need to compute all-by-all nonzero entries for m and n @@ -124,7 +124,7 @@ def generate(self, cpp, routineCache): flops = 2 * m.size() * len(sppB) else: flops = 2 * m.size() * n.size() * k.size() - + if isinstance(self._gemm_cfg, BLASlike): ptr_a = self._pointer(term=d.leftTerm, offset2=(m.start, k.start), transpose=d.transA) ptr_b = self._pointer(term=d.rightTerm, offset2=(k.start, n.start), transpose=d.transB) @@ -293,13 +293,13 @@ def __init__(self, arch, gemmDescr, sppA, sppARows, sppB, sppBRows, gemm_cfg): self._mode = gemm_cfg.operation_name self._cmd = gemm_cfg.cmd self._blockSize = gemm_cfg.blockSize(gemmDescr['M'], gemmDescr['N'], gemmDescr['K']) if hasattr(gemm_cfg, 'blockSize') else dict() - + def __eq__(self, other): return self._arch == other._arch and \ self._gemmDescr == other._gemmDescr and \ self._sppA == other._sppA and \ self._sppB == other._sppB - + def header(self, cpp): with cpp.PPIfndef('NDEBUG'): cpp('extern long long libxsmm_num_total_flops;') @@ -319,7 +319,7 @@ def _callGenerator(self, argList): Given command: {' '.join(strcmd)} Stdout: {result.stdout} Stderr: {result.stderr}""") - + def __call__(self, routineName, fileName): cpu_arch = self._arch.host_name @@ -400,7 +400,7 @@ def __init__(self, shape, spp): self._shape = shape self._spp = spp self._temp = None - + def __enter__(self): if self._spp is not None: self._temp = tempfile.NamedTemporaryFile() @@ -413,11 +413,11 @@ def __enter__(self): self._temp.flush() return self._temp.name return None - + def __exit__(self, exc_type, exc_val, exc_tb): if self._spp is not None: self._temp.__exit__(exc_type, exc_val, exc_tb) - + with SparsityWrapper((self._gemmDescr['M'], self._gemmDescr['K']), self._sppA) as afile: with SparsityWrapper((self._sppBRows if self._mode=='pspamm' else self._gemmDescr['K'], self._gemmDescr['N']), self._sppB) as bfile: if self._mode == 'libxsmm': @@ -533,7 +533,7 @@ def _kernel(self, routine_name): {alpha}, // alpha 
{beta}, // beta {prefetch_flag} // prefetch -); +); """.format(kernel_var_name=kernel_var_name, prec=self._arch.typename, M=M, N=N, K=K, ldA=ldA, ldB=ldB, ldC=ldC, diff --git a/yateto/codegen/gemm/generic.py b/yateto/codegen/gemm/generic.py index 7d9d92d..c161976 100644 --- a/yateto/codegen/gemm/generic.py +++ b/yateto/codegen/gemm/generic.py @@ -103,7 +103,7 @@ def _generateSparseSparse(self, cpp): def _generateSparseDense(self, cpp): d = self._descr m, n, k = d.mnk() - + assert d.isACsc != d.isBCsc Aaccess = self._accessFun(d.leftTerm, (m.start, k.start), d.isACsc, d.transA) @@ -153,11 +153,11 @@ def _generateSparseDense(self, cpp): def _generateDenseDense(self, cpp): d = self._descr m, n, k = d.mnk() - + Aaccess = self._accessFun(d.leftTerm, (m.start, k.start), False, d.transA) Baccess = self._accessFun(d.rightTerm, (k.start, n.start), False, d.transB) Caccess = self._accessFun(d.result, (m.start, n.start), False, False) - + with cpp.For('int n = 0; n < {0}; ++n'.format(n.size())): if d.beta != 1.0: with cpp.For('int m = 0; m < {0}; ++m'.format(m.size())): diff --git a/yateto/codegen/indexsum/factory.py b/yateto/codegen/indexsum/factory.py index 48e3423..a3422a3 100644 --- a/yateto/codegen/indexsum/factory.py +++ b/yateto/codegen/indexsum/factory.py @@ -7,21 +7,21 @@ def __init__(self, alpha, add: bool, result: IndexedTensorDescription, term: Ind self.add = add self.result = result self.term = term - + rA = loopRanges(self.term, self.result.indices) rB = loopRanges(self.result, self.result.indices) assert testLoopRangesAContainedInB(rA, rB) - + self.loopRanges = rA - + self.sumIndex = self.term.indices - self.result.indices assert len(self.sumIndex) == 1 self.sumLoopRange = loopRanges(self.term, self.sumIndex)[str(self.sumIndex)] - + def generator(arch, descr, target): if target == 'cpu': return Generic(arch, descr) elif target == 'gpu': - raise RuntimeError("IndexSum operation has not been implemented for GPU-like architectures") \ No newline at end of file + 
raise RuntimeError("IndexSum operation has not been implemented for GPU-like architectures") diff --git a/yateto/codegen/indexsum/generic.py b/yateto/codegen/indexsum/generic.py index 36df9f5..6f072ab 100644 --- a/yateto/codegen/indexsum/generic.py +++ b/yateto/codegen/indexsum/generic.py @@ -7,11 +7,11 @@ def __init__(self, arch, descr): def generate(self, cpp, routineCache): d = self._descr - + if not d.add: writeBB = boundingBoxFromLoopRanges(d.result.indices, d.loopRanges) initializeWithZero(cpp, self._arch, d.result, writeBB) - + sumIndex = d.term.indices - d.result.indices assert len(sumIndex) == 1 class IndexSumBody(object): @@ -23,7 +23,7 @@ def __call__(s): cpp( 'sum += {}[{}];'.format(d.term.name, d.term.memoryLayout.addressString(d.term.indices)) ) mult = '{} * '.format(d.alpha) if d.alpha != 1.0 else '' cpp( '{} = {}sum;'.format(target, mult) ) - + flop = 1 if d.alpha != 1.0 else 0 return d.sumLoopRange.size() + flop diff --git a/yateto/codegen/log/factory.py b/yateto/codegen/log/factory.py index 0233228..3571114 100644 --- a/yateto/codegen/log/factory.py +++ b/yateto/codegen/log/factory.py @@ -22,22 +22,22 @@ def __init__(self, self.transA = transA self.transB = transB self.prefetchName = prefetchName - + rA = loopRanges(self.leftTerm, self.loopIndices) rB = loopRanges(self.rightTerm, self.loopIndices) rC = loopRanges(self.result, self.loopIndices) assert testLoopRangesEqual(rA, rB) assert testLoopRangesAContainedInB(rA, rC) assert testLoopRangesAContainedInB(rB, rC) - + rC.update(rA) rC.update(rB) self.loopRanges = rC - + self.innerLoopIndices = self.loopIndices - self.result.indices self.outerLoopIndices = self.loopIndices - self.innerLoopIndices - + self.assignLoopRanges = copy.deepcopy(self.loopRanges) self.addLoopRanges = copy.deepcopy(self.loopRanges) @@ -52,8 +52,7 @@ def __init__(self, self.addLoopRanges[peelOffIndex].start = self.loopRanges[peelOffIndex].start+1 else: self.assignLoopRanges = None - + def generator(arch, descr, target): return 
Generic(arch, descr, target) - diff --git a/yateto/codegen/log/generic.py b/yateto/codegen/log/generic.py index 357f363..a239e8c 100644 --- a/yateto/codegen/log/generic.py +++ b/yateto/codegen/log/generic.py @@ -8,7 +8,7 @@ def __init__(self, arch, descr, target): self._arch = arch self._descr = descr self._target = target - + def _pointer(self, cpp, targetName, baseName, term, loopIndices, const=True): indices = term.indices & loopIndices addressStr = term.memoryLayout.addressString(term.indices, indices) if len(indices) > 0 else '' @@ -18,7 +18,7 @@ def _pointer(self, cpp, targetName, baseName, term, loopIndices, const=True): def _alignedStart(self, term, loopIndices): return term.memoryLayout.isAlignedAddressString(term.indices, term.indices & loopIndices) - + def _memLayout(self, term, I, J): if len(I) == 0 and len(J) == 0: return DenseMemoryLayout((1,1)) @@ -34,7 +34,7 @@ def _memLayout(self, term, I, J): def _reduce(self, term, subset, memLayout): return reduceSpp(term.eqspp, term.indices, subset).reshape(memLayout.shape()) - + def _defuse(self, fusedRange, term, I): if len(I) == 1: return {next(iter(I)): fusedRange} @@ -42,14 +42,14 @@ def _defuse(self, fusedRange, term, I): def generate(self, cpp, routineCache, gemm_cfg): d = self._descr - + A = d.leftTerm.indices - d.loopIndices B = d.rightTerm.indices - d.loopIndices C = d.result.indices - d.loopIndices Im = set(A) & set(C) In = set(B) & set(C) Ik = set(A) & set(B) - + hasOuterLoops = len(d.outerLoopIndices) > 0 if hasOuterLoops and self._target == 'gpu': @@ -60,13 +60,13 @@ def generate(self, cpp, routineCache, gemm_cfg): outerBname = '_B' if hasOuterLoops else d.rightTerm.name outerCname = '_C' if hasOuterLoops else d.result.name outerPrefetchName = '_Cprefetch' if hasOuterLoops and d.prefetchName is not None else d.prefetchName - + hasInnerLoops = len(d.innerLoopIndices) > 0 innerAname = '_Ain' if hasInnerLoops else outerAname innerBname = '_Bin' if hasInnerLoops else outerBname innerCname = '_Cin' if 
hasInnerLoops else outerCname innerPrefetchName = '_Cprefetchin' if hasInnerLoops and outerPrefetchName is not None else outerPrefetchName - + AmemLayout = self._memLayout(d.leftTerm, Im, Ik) BmemLayout = self._memLayout(d.rightTerm, Ik, In) CmemLayout = self._memLayout(d.result, Im, In) @@ -88,7 +88,7 @@ def generate(self, cpp, routineCache, gemm_cfg): alignedStartC = self._alignedStart(d.result, d.outerLoopIndices) and self._alignedStart(d.result, d.innerLoopIndices), prefetchName = innerPrefetchName ) - + if not d.add: lr = dict() m, n, k = gemmDescr.mnk() @@ -97,7 +97,7 @@ def generate(self, cpp, routineCache, gemm_cfg): lr.update( self._defuse(n, d.rightTerm, In) ) writeBB = boundingBoxFromLoopRanges(d.result.indices, lr) initializeWithZero(cpp, self._arch, d.result, writeBB) - + class LoGBody(object): def __call__(s): if hasInnerLoops: @@ -127,4 +127,3 @@ def __call__(s): return flops return forLoops(cpp, d.outerLoopIndices, d.loopRanges, InnerLoopBody(), pragmaSimd=False) - diff --git a/yateto/codegen/product/factory.py b/yateto/codegen/product/factory.py index 16c027f..a2e3059 100644 --- a/yateto/codegen/product/factory.py +++ b/yateto/codegen/product/factory.py @@ -11,21 +11,20 @@ def __init__(self, alpha, add: bool, result: IndexedTensorDescription, leftTerm: self.isACsc = self.leftTerm.memoryLayout.isCSC() self.isBCsc = self.rightTerm.memoryLayout.isCSC() - + rA = loopRanges(self.leftTerm, self.result.indices) rB = loopRanges(self.rightTerm, self.result.indices) rC = loopRanges(self.result, self.result.indices) assert testLoopRangesEqual(rA, rB) assert testLoopRangesAContainedInB(rA, rC) assert testLoopRangesAContainedInB(rB, rC) - + rA.update(rB) - self.loopRanges = rA + self.loopRanges = rA def generator(arch, descr, target): if target == 'cpu': return Generic(arch, descr) elif target == 'gpu': raise RuntimeError("Product operation has not been implemented for GPU-like architectures") - diff --git a/yateto/codegen/visitor.py b/yateto/codegen/visitor.py 
index a7bc4a4..2e1c6d9 100644 --- a/yateto/codegen/visitor.py +++ b/yateto/codegen/visitor.py @@ -109,7 +109,7 @@ class OptimizedKernelGenerator(KernelGenerator): TEMP_MEM_REQUIRED_NAME = 'TmpMemRequiredInBytes' TEMP_MAX_MEM_REQUIRED_NAME = 'TmpMaxMemRequiredInBytes' - + def __init__(self, arch, routineCache, routine_exporters): super().__init__(arch) self._routineCache = routineCache @@ -122,7 +122,7 @@ def __init__(self, arch, routineCache, routine_exporters): for entry in routine_exporters: self._routine_factories[entry] = ExportFactory.makeFactory(routine_exporters[entry]) - + class KernelOutline(object): def __init__(self, nonZeroFlops, @@ -158,7 +158,7 @@ def _addTensor(cls, tensor, tensors): tensors[base_name] = tensors[base_name] | {group} else: tensors[base_name] = {group} - + def generateKernelOutline(self, nonZeroFlops, cfg, gemm_cfg, target): scalarsP = ScalarsSet().visit(cfg) variables = SortedGlobalsList().visit(cfg) @@ -194,7 +194,7 @@ def generateKernelOutline(self, nonZeroFlops, cfg, gemm_cfg, target): factory.freeTmp() factory.reset_stream() factory.reset_flags() - function = functionIO.getvalue() + function = functionIO.getvalue() return self.KernelOutline(nonZeroFlops, hwFlops, tensors, @@ -213,7 +213,7 @@ def _addFromKO(cls, koEntries, entries): entries[key] = value else: entries[key] = entries[key] | value - + def generate(self, cpp, header, name, kernelOutlines, familyStride=None): tensors = collections.OrderedDict() @@ -295,7 +295,7 @@ def kernelArgs(base_name_with_namespace, groups, writable, is_constant, target): header(f'{class_name}::{container_type} {base_name};') else: header(f'{typ}{ptr_type} {base_name}{{}};') - + def scalarArgs(base_name_with_namespace, groups): prefix, base_name = Tensor.splitBasename(base_name_with_namespace) typ = self._arch.typename @@ -419,7 +419,7 @@ class UnitTestGenerator(KernelGenerator): STREAM = '_stream' TMP_MEM = '_tmpMem' TMP_SIZE = 128 * 8 - + def __init__(self, arch): super().__init__(arch) @@ -475,7 
+475,7 @@ def _name(cls, var): def _viewName(self, var): return '_view_' + self._name(var) - + def _groupStr(self, var): group = var.group() return ','.join([str(g) for g in group]) @@ -487,7 +487,7 @@ def _groupTemplate(self, var): def _groupIndex(self, var): gstr = self._groupStr(var) return '({})'.format(gstr) if gstr else '' - + def generate(self, cpp, namespace, testName, kernelClass, cfg, target, gemm_cfg, testFramework, index=None): if target == 'gpu': if self._arch.backend in ['oneapi', 'acpp', 'hipsycl']: @@ -534,7 +534,7 @@ def generate(self, cpp, namespace, testName, kernelClass, cfg, target, gemm_cfg, for var in variables: factory.tensor(var.tensor, self._tensorName(var)) factory.temporary(self._name(var), var.memoryLayout().requiredReals(), iniZero=True) - + shape = var.memoryLayout().shape() cpp('{supportNS}::DenseTensorView<{dim},{arch.typename},{arch.uintTypename}> {viewName}({utName}, {{{shape}}}, {{{start}}}, {{{stop}}});'.format( supportNS = SUPPORT_LIBRARY_NAMESPACE, @@ -632,33 +632,33 @@ class InitializerGenerator(object): VIEW_FUN_NAME = 'create' VIEW_TYPE_NAME = 'type' VIEW_TYPE_NAME_CONST = 'type_const' - + class TensorView(object): ARGUMENT_NAME = 'values' def typename(self, dim, arch, const): constStr = 'true' if const else 'false' return '::{}::{}<{},{},{},{}>'.format(SUPPORT_LIBRARY_NAMESPACE, type(self).__name__, dim, arch.typename, arch.uintTypename, constStr) - + @classmethod def arguments(cls, arch, const): if const: return '{} const* {}'.format(arch.typename, cls.ARGUMENT_NAME) else: return '{} * {}'.format(arch.typename, cls.ARGUMENT_NAME) - + def generate(cpp, group, memLayout, arch, index, const): raise NotImplementedError - + def listToInitializerList(self, lst): return '{{{}}}'.format(', '.join([str(l) for l in lst])) - + def formatArray(self, numberType, name, values, declarationOnly): lhs = '{} {}[]'.format(numberType, name) if declarationOnly: return '' return '{} {} = {};'.format(MODIFIERS, lhs, 
self.listToInitializerList(values)) - + class DenseTensorView(TensorView): START_NAME = 'Start' STOP_NAME = 'Stop' @@ -680,7 +680,7 @@ def arrays(self, cpp, memLayout, arch, namespace, index, numberType, declaration class CSCMatrixView(TensorView): ROWIND_NAME = 'RowInd' COLPTR_NAME = 'ColPtr' - + def typename(self, dim, arch, const): constStr = 'true' if const else 'false' return '::{}::{}<{},{},{}>'.format(SUPPORT_LIBRARY_NAMESPACE, type(self).__name__, arch.typename, arch.uintTypename, constStr) @@ -736,14 +736,14 @@ def __init__(self, arch, tensors, scalars): self._groupSize = {baseName: tuple(map(lambda x: x+1, mi)) for baseName, mi in maxIndex.items()} maxIndexScalar = {baseName: tuple(map(max, *groups.keys())) if len(groups) > 1 else next(iter(groups.keys())) for baseName, groups in self._scalarCollect.items()} self._groupSizeScalar = {baseName: tuple(map(lambda x: x+1, mi)) for baseName, mi in maxIndexScalar.items()} - + def _tensorViewGenerator(self, memoryLayout): memLayoutMap = { 'DenseMemoryLayout': self.DenseTensorView, 'CSCMemoryLayout': self.CSCMatrixView } return memLayoutMap[type(memoryLayout).__name__]() - + def iterate_collect(self): cur_namespace = '' cur_dict = collections.OrderedDict() @@ -783,7 +783,7 @@ def iterate_collect_scalar(self): def generateTensorsH(self, header): for namespace, tensor_dict in self.iterate_collect(): with header.Namespace(namespace), header.Namespace(self.TENSOR_NAMESPACE): - for (baseName, baseNameWithoutNamespace), tensors in tensor_dict.items(): + for (baseName, baseNameWithoutNamespace), tensors in tensor_dict.items(): with header.Struct(baseNameWithoutNamespace): groupSize = self._groupSize[baseName] self._tensor(header, '', tensors, groupSize, False) @@ -809,7 +809,7 @@ def generateTensorsH(self, header): header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) for namespace, scalar_dict in self.iterate_collect_scalar(): with header.Namespace(namespace), 
header.Namespace(self.TENSOR_NAMESPACE): - for (baseName, baseNameWithoutNamespace), scalars in scalar_dict.items(): + for (baseName, baseNameWithoutNamespace), scalars in scalar_dict.items(): with header.Struct(baseNameWithoutNamespace): groupSize = self._groupSizeScalar[baseName] args = ndargs(len(groupSize)) @@ -827,13 +827,13 @@ def generateTensorsH(self, header): header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) with header.Function('operator()', typedArgs, '{} T const&'.format(INLINE), const=True): header('return {}[{}({})];'.format(self.CONTAINER_DATA_NAME, self.INDEX_FUN_NAME, ', '.join(args))) - + def generateTensorsCpp(self, cpp): for namespace, tensor_dict in self.iterate_collect(): with cpp.Namespace(namespace): for (base_name, base_name_without_namespace), tensors in tensor_dict.items(): self._tensor(cpp, '::'.join([self.TENSOR_NAMESPACE, base_name_without_namespace, '']), tensors, self._groupSize[base_name], True) - + def generateInitH(self, header): for namespace, tensor_dict in self.iterate_collect(): with header.Namespace(namespace), header.Namespace(self.INIT_NAMESPACE): @@ -931,7 +931,7 @@ def _init(self, cpp, baseName, baseNameWithoutNamespace, name, tensors, declarat tv.generate(cpp, ml, self._arch, index(group), False) with cpp.Function(self.VIEW_FUN_NAME, arguments=viewArgsConst, returnType='{} {}'.format(STATIC_INLINE, self.VIEW_TYPE_NAME_CONST)): tv.generate(cpp, ml, self._arch, index(group), True) - + def _array(self, cpp, typ, name, content, groupSize, declarationOnly=False, alwaysArray=True, constexpr=True, static=True): cexpr = CONSTEXPR + ' ' if constexpr else '' stat = STATIC + ' ' if static else '' @@ -964,7 +964,5 @@ def _array(self, cpp, typ, name, content, groupSize, declarationOnly=False, alwa initStr = ', '.join(init) if isGroup: initStr = '{{{}}}'.format(initStr) - - cpp('{}{}{} {}{}{} = {};'.format(cexpr, stat, typ, name, groupIndices, arrayIndices, initStr)) - + cpp('{}{}{} 
{}{}{} = {};'.format(cexpr, stat, typ, name, groupIndices, arrayIndices, initStr)) diff --git a/yateto/controlflow/__init__.py b/yateto/controlflow/__init__.py index 8b13789..e69de29 100644 --- a/yateto/controlflow/__init__.py +++ b/yateto/controlflow/__init__.py @@ -1 +0,0 @@ - diff --git a/yateto/controlflow/fused_gemm_automata/__init__.py b/yateto/controlflow/fused_gemm_automata/__init__.py index 3939969..6df79ed 100644 --- a/yateto/controlflow/fused_gemm_automata/__init__.py +++ b/yateto/controlflow/fused_gemm_automata/__init__.py @@ -1 +1 @@ -from .automata import Context \ No newline at end of file +from .automata import Context diff --git a/yateto/controlflow/graph.py b/yateto/controlflow/graph.py index e1b1420..a5af954 100644 --- a/yateto/controlflow/graph.py +++ b/yateto/controlflow/graph.py @@ -17,7 +17,7 @@ def variables(self): def maySubstitute(self, when, by): return self.substituted(when, by).memoryLayout().isCompatible(self.eqspp()) - + def substituted(self, when, by, memoryLayout=None): return by if self == when else self @@ -38,13 +38,13 @@ def eqspp(self): def __hash__(self): return hash(self.name) - + def __str__(self): return self.name - + def __repr__(self): return str(self) - + def __eq__(self, other): isEq = self.name == other.viewed().name # and self._memoryLayout == other._memoryLayout assert not isEq or (self.writable == other.viewed().writable and self._memoryLayout == other.viewed()._memoryLayout) @@ -53,7 +53,7 @@ def __eq__(self, other): def setWritable(self, name): if self.name == name: self.writable = True - + def viewed(self): return self @@ -66,18 +66,18 @@ def __init__(self, variable, memoryLayout, eqspp): @property def tensor(self): return self.variable.tensor - + @property def writable(self): return self.variable.writable - + @property def is_temporary(self): return self.variable.is_temporary - + def maySubstitute(self, when, by): return self.substituted(when, by).memoryLayout().isCompatible(self.eqspp()) - + def 
substituted(self, when, by, memoryLayout=None): return by if self == when else self @@ -104,13 +104,13 @@ def eqspp(self): def __hash__(self): return hash(self.variable.name) - + def __str__(self): return f'{self.variable.name}' - + def __repr__(self): return str(self) - + def __eq__(self, other): isEq = self.variable == other.viewed() and self._memoryLayout == other._memoryLayout return isEq @@ -170,10 +170,10 @@ def isRHSExpression(self): def isRHSVariable(self): return not self.isRHSExpression() - + def isCompound(self): return self.add - + def hasTrivialScalar(self): return self.scalar is None or self.scalar == 1.0 diff --git a/yateto/controlflow/transformer.py b/yateto/controlflow/transformer.py index 3500a7d..5d28db8 100644 --- a/yateto/controlflow/transformer.py +++ b/yateto/controlflow/transformer.py @@ -4,7 +4,7 @@ from .fused_gemm_automata import Context as FusedGemmsContext -class MergeScalarMultiplications(object): +class MergeScalarMultiplications(object): def visit(self, cfg): n = len(cfg)-1 i = 1 @@ -34,7 +34,7 @@ def visit(self, cfg): for i in range(n): ua = cfg[i].action v = cfg[i+1] - + if not ua.isCompound() \ and ua.isRHSVariable() \ and ua.term.writable \ @@ -49,7 +49,7 @@ def visit(self, cfg): for j in range(i, n): cfg[j].action = cfg[j].action.substituted(when, by) cfg = LivenessAnalysis().visit(cfg) - + return cfg class SubstituteBackward(object): diff --git a/yateto/controlflow/visitor.py b/yateto/controlflow/visitor.py index 8841e06..1c111a2 100644 --- a/yateto/controlflow/visitor.py +++ b/yateto/controlflow/visitor.py @@ -6,19 +6,19 @@ class AST2ControlFlow(Visitor): TEMPORARY_RESULT = '_tmp' - + def __init__(self, simpleMemoryLayout=False): self._tmp = 0 self._cfg = [] self._writable = set() self._simpleMemoryLayout = simpleMemoryLayout - + def cfg(self): return self._cfg + [ProgramPoint(None)] def _ml(self, node): return DenseMemoryLayout(node.shape()) if self._simpleMemoryLayout else node.memoryLayout() - + def _addTransformOp(self, 
permute, variable): if not self._simpleMemoryLayout: permute.setEqspp( permute.computeSparsityPattern() ) @@ -44,7 +44,7 @@ def _addPermuteIfRequired(self, indices, term, variable): # permute needed, run before broadcast inode = Permute.subPermute(term, indices) intermediate = self._addTransformOp(inode, variable) - + result = intermediate if len(term.indices) != len(indices): # broadcast needed, more output than input indices @@ -54,13 +54,13 @@ def _addPermuteIfRequired(self, indices, term, variable): def generic_visit(self, node): variables = [self.visit(child) for child in node] - + result = self._nextTemporary(node) action = ProgramAction(result, Expression(node, self._ml(node), variables), False) self._addAction(action) - + return result - + def visit_SliceView(self, node): var = self.visit(node.term()) ml = node.getMemoryLayout(var.memoryLayout()) @@ -79,18 +79,18 @@ def visit_Add(self, node): action = ProgramAction(tmp, rhs, add) self._addAction(action) add = True - + return tmp - + def visit_ScalarMultiplication(self, node): variable = self.visit(node.term()) result = self._nextTemporary(node) action = ProgramAction(result, variable, False, node.scalar()) self._addAction(action) - + return result - + def visit_Assign(self, node): self.updateWritable(node[0].name()) variables = [self.visit(child) for child in node] @@ -98,9 +98,9 @@ def visit_Assign(self, node): rhs = self._addPermuteIfRequired(node.indices, node.rightTerm(), variables[1]) action = ProgramAction(variables[0], rhs, False) self._addAction(action) - + return variables[0] - + def visit_IndexedTensor(self, node): return Variable(node.name(), node.name() in self._writable, self._ml(node), node.eqspp(), node.tensor, is_temporary=node.tensor.temporary) diff --git a/yateto/gemm_configuration.py b/yateto/gemm_configuration.py index 366a6c8..6eed1ca 100644 --- a/yateto/gemm_configuration.py +++ b/yateto/gemm_configuration.py @@ -13,7 +13,7 @@ class GemmTool(ABC): def __init__(self, operation_name: 
str, includes: List[str] = []): self.operation_name = operation_name self.includes = includes - + def archSupported(self): return True @@ -57,7 +57,7 @@ class MKL(BLASlike): def __init__(self, arch): self._arch = arch super().__init__('cblas_{}gemm'.format(arch.precision.lower()), ['mkl_cblas.h']) - + def archSupported(self): return self._arch.host_name.lower() in {'snb', 'hsw', 'skx', 'knl'} or self._arch.host_name.lower().startswith('avx') diff --git a/yateto/generator.py b/yateto/generator.py index ce65cd9..4087395 100644 --- a/yateto/generator.py +++ b/yateto/generator.py @@ -63,7 +63,7 @@ def prepareUntilUnitTest(self): ast2cf.visit(ast) self.cfg = ast2cf.cfg() self.cfg = LivenessAnalysis().visit(self.cfg) - + def prepareUntilCodeGen(self, cost_estimator, enableFusedGemm: bool): self.nonZeroFlops = 0 for a in self.ast: @@ -117,34 +117,34 @@ def __init__(self, namespace=None): self.namespace = namespace else: self.namespace = '' - + def items(self): return self._kernels.items() - + def __len__(self): return max(self._kernels.keys()) + 1 - - @classmethod + + @classmethod def baseName(self, name): return re.match(Kernel.BASE_NAME, name).group(0) - + @classmethod def isValidName(cls, name): return re.match(cls.VALID_NAME, name) is not None - + @classmethod def group(cls, name): m = re.search(cls.GROUP_INDEX, name) return int(m.group(1)) - + def setStride(self, stride): self._stride = stride - + def stride(self): if self._stride is not None: return self._stride return (1,) - + @classmethod def linear(cls, stride, group): assert len(stride) == len(group) @@ -158,7 +158,7 @@ def add(self, name, ast, prefetch=None, namespace=None, target='cpu'): if not self.name: self.name = baseName assert baseName == self.name - + group = self.group(name) internalName = '_{}_{}'.format(baseName, group) self._kernels[group] = Kernel(internalName, ast, prefetch, namespace, target) @@ -174,7 +174,7 @@ def kernels(self): def prepareUntilUnitTest(self): for kernel in 
self._kernels.values(): kernel.prepareUntilUnitTest() - + def prepareUntilCodeGen(self, costEstimator, enableFusedGemm: bool): for kernel in self._kernels.values(): kernel.prepareUntilCodeGen(costEstimator, enableFusedGemm) @@ -189,10 +189,10 @@ class GlobalRoutineCache: def __init__(self): self.cache = RoutineCache() self.dirs = [] - + def register(self, outputDir): self.dirs += [outputDir] - + def generate(self, outputDir, namespace='yateto'): print('Calling external code generators...') fRoutines = Generator.FileNames(outputDir, Generator.ROUTINES_FILE_NAME) @@ -200,7 +200,7 @@ def generate(self, outputDir, namespace='yateto'): with Cpp(fRoutines.h) as header: with header.HeaderGuard(Generator._headerGuardName(namespace, Generator.ROUTINES_FILE_NAME)): self.cache.generate(header, fRoutines.cpp, fGpulikeRoutines.cpp) - + for subdir in self.dirs: relpath = os.path.relpath(outputDir, subdir) rfRoutines = Generator.FileNames(subdir, Generator.ROUTINES_FILE_NAME) @@ -217,7 +217,7 @@ class Generator(object): DOCTEST_FILE_NAME = 'test-kernel' HEADER_GUARD_SUFFIX = 'H_' SUPPORT_LIBRARY_HEADER = 'yateto.h' - + class FileNames(object): HEADER = 'h' CPP = 'cpp' @@ -227,7 +227,7 @@ def __init__(self, outputDir, name): self.cppName = '{}.{}'.format(name, self.CPP) self.h = os.path.join(outputDir, self.hName) self.cpp = os.path.join(outputDir, self.cppName) - + def __init__(self, arch): self._kernels = list() self._kernelFamilies = dict() @@ -242,7 +242,7 @@ def add(self, name: str, ast: Node, prefetch=None, namespace=None, target='cpu') if baseName not in self._kernelFamilies: self._kernelFamilies[baseName] = KernelFamily() self._kernelFamilies[baseName].add(name, ast, prefetch, namespace, target) - else: + else: if not Kernel.isValidName(name): raise ValueError(f'Kernel name invalid (must match regexp {Kernel.VALID_NAME}): {name}') kernel = Kernel(name, ast, prefetch, namespace=namespace, target=target) @@ -273,7 +273,7 @@ def addFamily(self, ast = astGenerator(*p) prefetch 
= prefetchGenerator(*p) if prefetchGenerator is not None else None family.add(indexedName, ast, prefetch, namespace, target=target) - + @classmethod def _headerGuardName(self, namespace, fileBaseName): partlist = namespace.upper().split('::') + [fileBaseName.upper(), self.HEADER_GUARD_SUFFIX] @@ -310,7 +310,7 @@ def generate(self, fTensors = self.FileNames(outputDir, self.TENSORS_FILE_NAME) fInit = self.FileNames(outputDir, self.INIT_FILE_NAME) fRoutines = self.FileNames(outputDir, self.ROUTINES_FILE_NAME) - fGpulikeRoutines = self.FileNames(outputDir, self.GPULIKE_ROUTINES_FILE_NAME) + fGpulikeRoutines = self.FileNames(outputDir, self.GPULIKE_ROUTINES_FILE_NAME) print('Generating unit tests...') def unit_test_body(cpp, testFramework): diff --git a/yateto/input.py b/yateto/input.py index 3200215..2867c20 100644 --- a/yateto/input.py +++ b/yateto/input.py @@ -83,9 +83,9 @@ def openMaybeCompressed(basefilename): def parseXMLMatrixFile(xmlFile, clones=dict(), transpose=lambda name: False, alignStride=lambda name: False, namespace=None): with openMaybeCompressed(xmlFile) as file: root = etree.fromstring(file.read()) - + matrices = dict() - + for node in root: if node.tag == 'matrix': name = node.get('name') @@ -131,7 +131,7 @@ def parseJSONMatrixFile(jsonFile, clones=dict(), transpose=lambda name: False, a def memoryLayoutFromFile(xmlFile, db, clones, strict=False): with openMaybeCompressed(xmlFile) as file: root = etree.fromstring(file.read()) - + strtobool = ['yes', 'true', '1'] groups = dict() diff --git a/yateto/memory.py b/yateto/memory.py index fa3ea9a..11a4497 100644 --- a/yateto/memory.py +++ b/yateto/memory.py @@ -13,11 +13,11 @@ def __init__(self, shape): def shape(self): return self._shape - + @abstractmethod def address(self, entry): pass - + @abstractmethod def subtensorOffset(self, topLeftEntry): pass @@ -32,7 +32,7 @@ def mayVectorizeDim(self, dim): def mayFuse(self, positions): return len(positions) == 1 - + @classmethod @abstractmethod def fromSpp(cls, 
spp, **kwargs): @@ -61,7 +61,7 @@ class DenseMemoryLayout(MemoryLayout): @classmethod def setAlignmentArch(cls, arch): cls.ALIGNMENT_ARCH = arch - + def __init__(self, shape, boundingBox=None, stride=None, alignStride=False): super().__init__(shape) @@ -78,13 +78,13 @@ def __init__(self, shape, boundingBox=None, stride=None, alignStride=False): self._stride = stride else: self._computeStride() - + def _computeStride(self): stride = [1] for i in range(len(self._bbox)-1): stride.append(stride[i] * self._bbox[i].size()) self._stride = tuple(stride) - + def _alignBB(self): if self.ALIGNMENT_ARCH is not None: self._range0 = self._bbox[0] @@ -92,7 +92,7 @@ def _alignBB(self): self._bbox = BoundingBox([rnew] + self._bbox[1:]) else: warnings.warn('Set architecture with DenseMemoryLayout.setAlignmentArch(arch) if you want to use the align stride feature.', UserWarning) - + def alignedStride(self): if self.ALIGNMENT_ARCH is None: return False @@ -120,7 +120,7 @@ def __contains__(self, entry): def permuted(self, permutation): newShape = tuple([self._shape[p] for p in permutation]) - + originalBB = BoundingBox([self._range0] + self._bbox[1:]) if self._range0 else self._bbox newBB = BoundingBox([copy.copy(originalBB[p]) for p in permutation]) return DenseMemoryLayout(newShape, newBB, alignStride=self._range0 is not None) @@ -131,7 +131,7 @@ def address(self, entry): def subtensorOffset(self, topLeftEntry): return self.address(topLeftEntry) - + def notWrittenAddresses(self, writeBB): if writeBB == self._bbox: return [] @@ -143,10 +143,10 @@ def notWrittenAddresses(self, writeBB): def stride(self): return self._stride - + def stridei(self, dim): return self._stride[dim] - + def bbox(self): return self._bbox @@ -158,7 +158,7 @@ def requiredReals(self): return 1 size = self._bbox[-1].size() * self._stride[-1] return size - + def addressString(self, indices, I = None, prefix='_', offsets=()): if len(self._bbox) == 0: return '0' @@ -189,13 +189,13 @@ def isAlignedAddressString(self, 
indices, I = None): def mayFuse(self, positions): return all( [self._stride[j] == self._shape[i]*self._stride[i] for i,j in zip(positions[:-1], positions[1:])] ) - + def _subShape(self, positions): sub = 1 for p in positions: sub *= self._shape[p] return sub - + def _subRange(self, positions): start = 0 stop = 0 @@ -205,7 +205,7 @@ def _subRange(self, positions): stop += s * (self._bbox[p].stop-1) s *= self._shape[p] return Range(start, stop+1) - + def _firstStride(self, positions): return self._stride[ positions[0] ] @@ -238,7 +238,7 @@ def unfold(self, indices, I, J): stride = (self._firstStride(positionsI), self._firstStride(positionsJ)) return DenseMemoryLayout(shape, bbox, stride) - + def defuse(self, fusedRange, indices, I): positions = indices.positions(I) s = self._subShape(positions) @@ -256,7 +256,7 @@ def defuse(self, fusedRange, indices, I): def isCompatible(self, spp): return BoundingBox.fromSpp(spp) in self.bbox() - + def subslice(self, index, start, end): return MemoryLayoutView(self, index, start, end) @@ -265,16 +265,16 @@ def __eq__(self, other): def __str__(self): return '{}(shape: {}, bounding box: {}, stride: {})'.format(type(self).__name__, self._shape, self._bbox, self._stride) - + def isCSC(self): return False - + def spp(self): raise NotImplementedError() - + def storage(self): return self - + def alignmentOffset(self, dim): return 0 @@ -287,7 +287,7 @@ def __init__(self, spp, alignStride=False): self.aligned = alignStride self._spp = spp - + if len(self._shape) != 2: raise ValueError('CSCMemoryLayout may only be used for matrices.') @@ -296,7 +296,7 @@ def __init__(self, spp, alignStride=False): range0 = self._bbox[0] rnew = Range( DenseMemoryLayout.ALIGNMENT_ARCH.alignedLower(range0.start), DenseMemoryLayout.ALIGNMENT_ARCH.alignedUpper(range0.stop) ) self._bbox = BoundingBox([rnew] + self._bbox[1:]) - + nonzeros = spp.nonzero() nonzeros = sorted(zip(nonzeros[0], nonzeros[1]), key=lambda x: (x[1], x[0])) @@ -309,13 +309,13 @@ def 
__init__(self, spp, alignStride=False): for i in range(lower, upper): nonzeros_pre.add((np.int64(i), nonzero[1])) - + nonzeros = list(nonzeros_pre) nonzeros = sorted(zip([nonzero[0] for nonzero in nonzeros], [nonzero[1] for nonzero in nonzeros]), key=lambda x: (x[1], x[0])) - + self._rowIndex = np.ndarray(shape=(len(nonzeros),), dtype=int) self._colPtr = np.ndarray(shape=(self._shape[1]+1,), dtype=int) - + lastCol = 0 self._colPtr[0] = 0 for i,entry in enumerate(nonzeros): @@ -335,31 +335,31 @@ def bbox(self): def bboxi(self, dim): return self._bbox[dim] - + def rowIndex(self): return self._rowIndex - + def colPointer(self): return self._colPtr - + def isAlignedAddressString(self, indices, I = None): if I is None: I = set(indices) positions = indices.positions(I) return len(positions) == 0 or (positions[0] == 0 and all(p != 0 for p in positions[1:])) - + def address(self, entry): assert entry in self._bbox start = self._colPtr[ entry[1] ] stop = self._colPtr[ entry[1]+1 ] subRowInd = self._rowIndex[start:stop] - + find = np.where(subRowInd == entry[0])[0] assert len(find) == 1 return start + find[0] - + def subtensorOffset(self, topLeftEntry): assert topLeftEntry in self._bbox assert topLeftEntry[0] <= self._bbox[0].start @@ -399,16 +399,16 @@ def isCompatible(self, spp): def __eq__(self, other): return self._bbox == other._bbox and np.array_equal(self._rowIndex, other._rowIndex) and np.array_equal(self._colPtr, other._colPtr) - + def subslice(self, index, start, end): return MemoryLayoutView(self, index, start, end) - + def spp(self): return self._spp - + def storage(self): return self - + def alignmentOffset(self, dim): return 0 @@ -427,19 +427,19 @@ def __init__(self, base, index, start, end): self.index = index self.start = start self.end = end - + def relidx(self, index): return tuple(index[i] if i != self.index else index[i] + self.start for i in range(len(self._shape))) - + def relbox(self, bbox): return BoundingBox([Range(max(bbox[i].start + self.start, 
self.start), min(bbox[i].stop + self.start, self.end)) if i == self.index else bbox[i] for i in range(len(self._shape))]) - + def relspp(self, spp): subslice = tuple(slice(self.start, self.end) if i == self.index else slice(None) for i in range(spp.ndim)) superarray = np.zeros(tuple(self.base.shape()), dtype=bool) superarray[subslice] = spp.as_ndarray() return aspp.general(superarray) - + def relranges(self): starts, ends = self.base.relranges() starts[self.index] = max(starts[self.index], self.start) @@ -448,23 +448,23 @@ def relranges(self): def __contains__(self, bbox): return self.base.__contains__(self.relbox(bbox)) - + def __eq__(self, other): return self.storage() == other.storage() and self.relranges() == other.relranges() - + def address(self, entry): return self.base.address(self.relidx(entry)) - + def subtensorOffset(self, topLeftEntry): return self.base.subtensorOffset(self.relidx(topLeftEntry)) - + def alignedStride(self): return self.base.alignedStride() and (self.index != 0 or DenseMemoryLayout.ALIGNMENT_ARCH.checkAlignment(self.end - self.start)) - + def fromSpp(self): # cannot be implemented. Call should result in error. raise NotImplementedError() - + def isCompatible(self, spp): # only a rough criterion. Can possibly be refined. 
if spp.as_ndarray().shape != tuple(self.shape()): @@ -474,19 +474,19 @@ def isCompatible(self, spp): def mayVectorizeDim(self, dim): return self.base.mayVectorizeDim(dim) - + def isAlignedAddressString(self, indices, I = None): return self.base.isAlignedAddressString(indices, I) - + def addressString(self, indices, I = None, prefix='_', offsets=()): if len(offsets) == 0: offsets = [0] * len(self._shape) newOffsets = tuple(offsets[i] if self.index != i else offsets[i] + self.start for i in range(len(self._shape))) return self.base.addressString(indices, I, prefix, newOffsets) - + def subslice(self, index, start, end): return MemoryLayoutView(self, index, start, end) - + def unfold(self, indices, I, J): positionsI = indices.positions(I) positionsJ = indices.positions(J) @@ -504,7 +504,7 @@ def unfold(self, indices, I, J): scale *= shape[p] return MemoryLayoutView(self.base.unfold(indices, I, J), newIndex, self.start * scale, self.end * scale) - + def withDummyDimension(self): return MemoryLayoutView(self.base.withDummyDimension(), self.index, self.start, self.end) @@ -520,11 +520,11 @@ def defuse(self, fusedRange, indices, I): return self.base.defuse(newFusedRange, indices, I) else: return self.base.defuse(fusedRange, indices, I) - + def stride(self): # pass through return self.base.stride() - + def stridei(self, dim): # pass through return self.base.stridei(dim) @@ -533,16 +533,16 @@ def notWrittenAddresses(self, writeBB): # focus only on the subview outside = set(self.base.notWrittenAddresses(self.bbox())) return list(set(self.base.notWrittenAddresses(self.relbox(writeBB))) - outside) - + def bbox(self): return self.relbox(self.base.bbox()) - + def storage(self): return self.base.storage() - + def permuted(self, permutation): return MemoryLayoutView(self.base.permuted(permutation), permutation[self.index], self.start, self.end) - + def entries(self, rowRange, colRange): if self.index == 0: return self.base.entries(Range(rowRange.start + self.start, rowRange.stop + 
self.start), colRange) @@ -550,13 +550,13 @@ def entries(self, rowRange, colRange): return self.base.entries(rowRange, Range(colRange.start + self.start, colRange.stop + self.start)) else: raise NotImplementedError() - + def mayFuse(self, positions): return (self.index not in positions or positions[-1] == self.index) and self.base.mayFuse(positions) def __repr__(self): return f'MemoryLayoutView(index: {self.index}; range: [{self.start},{self.end}); base: {self.base})' - + def alignmentOffset(self, dim): val = self.base.alignmentOffset(dim) if self.index == dim: diff --git a/yateto/type.py b/yateto/type.py index e186a44..874645e 100644 --- a/yateto/type.py +++ b/yateto/type.py @@ -8,7 +8,7 @@ class AbstractType(object): @classmethod def isValidName(cls, name): return re.match(cls.VALID_NAME, name) is not None - + def name(self): return self._name @@ -21,12 +21,12 @@ class IdentifiedType(AbstractType): def __init__(self, name, namespace=None): if not self.isValidName(name): raise ValueError('Invalid name (must match regexp {}): {}'.format(self.VALID_NAME, name)) - + self._name = name self.namespace = namespace self.datatype = None # TODO - + def __str__(self): return self._name @@ -39,14 +39,14 @@ def getGroup(cls, name): def group(self): return self.getGroup(self._name) - + @classmethod def getBaseName(cls, name): return re.match(cls.BASE_NAME, name).group(0) - + def baseName(self): return self.getBaseName(self._name) - + @classmethod def splitBasename(cls, base_name_with_namespace): name_parts = base_name_with_namespace.rsplit('::', 1) @@ -56,23 +56,23 @@ def splitBasename(cls, base_name_with_namespace): prefix = '' base_name = name_parts[-1] return prefix, base_name - + def prefix(self): return '{}::'.format(self.namespace) if self.namespace else '' - + def baseNameWithNamespace(self): return '{}{}'.format(self.prefix(), self.baseName()) def nameWithNamespace(self): return '{}{}'.format(self.prefix(), self.name()) - + def __hash__(self): return hash(self._name) 
-class Scalar(IdentifiedType): +class Scalar(IdentifiedType): def __init__(self, name, namespace=None): super().__init__(name, namespace=namespace) - + def __hash__(self): return hash(self._name) @@ -88,10 +88,10 @@ def __init__(self, super().__init__(name, namespace=namespace) if not isinstance(shape, tuple): raise ValueError('shape must be a tuple') - + if any(x < 1 for x in shape): raise ValueError('shape must not contain entries smaller than 1') - + if not self.isValidName(name): raise ValueError('Tensor name invalid (must match regexp {}): {}'.format(self.VALID_NAME, name)) @@ -125,7 +125,7 @@ def __init__(self, else: self._spp = aspp.dense(shape) self._groupSpp = self._spp - + self.setMemoryLayout(memoryLayoutClass, alignStride) def __hash__(self): @@ -148,16 +148,16 @@ def setGroupSpp(self, spp): def __getitem__(self, indexNames): return IndexedTensor(self, indexNames) - + def shape(self): return self._shape - + def memoryLayout(self): return self._memoryLayout - + def spp(self, groupSpp=True): return self._groupSpp if groupSpp else self._spp - + def values(self): return self._values @@ -186,7 +186,7 @@ def __eq__(self, other): if equal: assert self._shape == other._shape and aspp.array_equal(self._spp, other._spp) and self._memoryLayout == other._memoryLayout return equal - + def __str__(self): return '{}: {}'.format(self._name, self._shape) @@ -196,13 +196,13 @@ def update(self, collection): def __getitem__(self, key): return self.__dict__[key] - + def __setitem__(self, key, value): self.__dict__[key] = value def __contains__(self, key): return key in self.__dict__ - + @classmethod def group(cls, name): group = Tensor.getGroup(name)