From 938b0bab55034b28d382c1f27aab1d85184b7248 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 29 Jul 2025 19:09:20 +0200 Subject: [PATCH 1/9] Rename all modules `pspamm` -> `pypspamm` --- README.md | 2 +- pspamm.py | 4 +-- pspamm/codegen/architectures/__init__.py | 15 ---------- pspamm/cursors/__init__.py | 6 ---- pyproject.toml | 4 +-- {pspamm => pypspamm}/VERSION | 0 {pspamm => pypspamm}/__init__.py | 0 {pspamm => pypspamm}/architecture.py | 0 {pspamm => pypspamm}/cli.py | 10 +++---- {pspamm => pypspamm}/codegen/analysis.py | 4 +-- pypspamm/codegen/architectures/__init__.py | 15 ++++++++++ .../codegen/architectures/arm/blocksize.py | 0 .../codegen/architectures/arm/generator.py | 12 ++++---- .../architectures/arm/inlineprinter.py | 8 ++--- .../codegen/architectures/arm/operands.py | 2 +- .../architectures/arm_sve/blocksize.py | 0 .../architectures/arm_sve/generator.py | 12 ++++---- .../architectures/arm_sve/inlineprinter.py | 8 ++--- .../codegen/architectures/arm_sve/operands.py | 2 +- .../codegen/architectures/hsw/blocksize.py | 0 .../codegen/architectures/hsw/generator.py | 16 +++++----- .../architectures/hsw/inlineprinter.py | 8 ++--- .../codegen/architectures/hsw/operands.py | 2 +- .../codegen/architectures/knl/blocksize.py | 0 .../codegen/architectures/knl/generator.py | 16 +++++----- .../architectures/knl/inlineprinter.py | 8 ++--- .../codegen/architectures/knl/operands.py | 2 +- .../codegen/architectures/lsx/blocksize.py | 0 .../codegen/architectures/lsx/generator.py | 16 +++++----- .../architectures/lsx/inlineprinter.py | 8 ++--- .../codegen/architectures/lsx/operands.py | 2 +- .../codegen/architectures/rvv/blocksize.py | 0 .../codegen/architectures/rvv/generator.py | 12 ++++---- .../architectures/rvv/inlineprinter.py | 8 ++--- .../codegen/architectures/rvv/operands.py | 2 +- {pspamm => pypspamm}/codegen/ast.py | 4 +-- {pspamm => pypspamm}/codegen/ccode.py | 10 +++---- {pspamm => pypspamm}/codegen/forms.py | 2 +- {pspamm => 
pypspamm}/codegen/generator.py | 6 ++-- {pspamm => pypspamm}/codegen/operands.py | 0 {pspamm => pypspamm}/codegen/precision.py | 0 {pspamm => pypspamm}/codegen/prune.py | 0 {pspamm => pypspamm}/codegen/regcache.py | 0 {pspamm => pypspamm}/codegen/schedule.py | 0 {pspamm => pypspamm}/codegen/sugar.py | 26 ++++++++-------- {pspamm => pypspamm}/codegen/virtual.py | 0 {pspamm => pypspamm}/codegen/visitor.py | 2 +- pypspamm/cursors/__init__.py | 6 ++++ .../cursors/abstractcursor.py | 8 ++--- {pspamm => pypspamm}/cursors/blockcursor.py | 10 +++---- {pspamm => pypspamm}/cursors/coords.py | 0 {pspamm => pypspamm}/cursors/densecursor.py | 6 ++-- {pspamm => pypspamm}/cursors/matrix.py | 0 {pspamm => pypspamm}/matmul.py | 30 +++++++++---------- {pspamm => pypspamm}/metagen/arm.py | 0 {pspamm => pypspamm}/metagen/metagen.py | 4 +-- tests/testsuite_generator.py | 2 +- tests/unit_test.py | 4 +-- 58 files changed, 162 insertions(+), 162 deletions(-) delete mode 100644 pspamm/codegen/architectures/__init__.py delete mode 100644 pspamm/cursors/__init__.py rename {pspamm => pypspamm}/VERSION (100%) rename {pspamm => pypspamm}/__init__.py (100%) rename {pspamm => pypspamm}/architecture.py (100%) rename {pspamm => pypspamm}/cli.py (93%) rename {pspamm => pypspamm}/codegen/analysis.py (82%) create mode 100644 pypspamm/codegen/architectures/__init__.py rename {pspamm => pypspamm}/codegen/architectures/arm/blocksize.py (100%) rename {pspamm => pypspamm}/codegen/architectures/arm/generator.py (98%) rename {pspamm => pypspamm}/codegen/architectures/arm/inlineprinter.py (97%) rename {pspamm => pypspamm}/codegen/architectures/arm/operands.py (98%) rename {pspamm => pypspamm}/codegen/architectures/arm_sve/blocksize.py (100%) rename {pspamm => pypspamm}/codegen/architectures/arm_sve/generator.py (98%) rename {pspamm => pypspamm}/codegen/architectures/arm_sve/inlineprinter.py (98%) rename {pspamm => pypspamm}/codegen/architectures/arm_sve/operands.py (98%) rename {pspamm => 
pypspamm}/codegen/architectures/hsw/blocksize.py (100%) rename {pspamm => pypspamm}/codegen/architectures/hsw/generator.py (97%) rename {pspamm => pypspamm}/codegen/architectures/hsw/inlineprinter.py (97%) rename {pspamm => pypspamm}/codegen/architectures/hsw/operands.py (97%) rename {pspamm => pypspamm}/codegen/architectures/knl/blocksize.py (100%) rename {pspamm => pypspamm}/codegen/architectures/knl/generator.py (97%) rename {pspamm => pypspamm}/codegen/architectures/knl/inlineprinter.py (97%) rename {pspamm => pypspamm}/codegen/architectures/knl/operands.py (98%) rename {pspamm => pypspamm}/codegen/architectures/lsx/blocksize.py (100%) rename {pspamm => pypspamm}/codegen/architectures/lsx/generator.py (97%) rename {pspamm => pypspamm}/codegen/architectures/lsx/inlineprinter.py (98%) rename {pspamm => pypspamm}/codegen/architectures/lsx/operands.py (97%) rename {pspamm => pypspamm}/codegen/architectures/rvv/blocksize.py (100%) rename {pspamm => pypspamm}/codegen/architectures/rvv/generator.py (98%) rename {pspamm => pypspamm}/codegen/architectures/rvv/inlineprinter.py (98%) rename {pspamm => pypspamm}/codegen/architectures/rvv/operands.py (97%) rename {pspamm => pypspamm}/codegen/ast.py (98%) rename {pspamm => pypspamm}/codegen/ccode.py (76%) rename {pspamm => pypspamm}/codegen/forms.py (98%) rename {pspamm => pypspamm}/codegen/generator.py (95%) rename {pspamm => pypspamm}/codegen/operands.py (100%) rename {pspamm => pypspamm}/codegen/precision.py (100%) rename {pspamm => pypspamm}/codegen/prune.py (100%) rename {pspamm => pypspamm}/codegen/regcache.py (100%) rename {pspamm => pypspamm}/codegen/schedule.py (100%) rename {pspamm => pypspamm}/codegen/sugar.py (84%) rename {pspamm => pypspamm}/codegen/virtual.py (100%) rename {pspamm => pypspamm}/codegen/visitor.py (97%) create mode 100644 pypspamm/cursors/__init__.py rename {pspamm => pypspamm}/cursors/abstractcursor.py (90%) rename {pspamm => pypspamm}/cursors/blockcursor.py (96%) rename {pspamm => 
pypspamm}/cursors/coords.py (100%) rename {pspamm => pypspamm}/cursors/densecursor.py (96%) rename {pspamm => pypspamm}/cursors/matrix.py (100%) rename {pspamm => pypspamm}/matmul.py (95%) rename {pspamm => pypspamm}/metagen/arm.py (100%) rename {pspamm => pypspamm}/metagen/metagen.py (96%) diff --git a/README.md b/README.md index 198b3b4..2f793c6 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Currently supported: ## Installation -PspaMM is a Python package. I.e. after cloning, may install it via pip. +PSpaMM is a Python package. I.e. after cloning, may install it via pip. Alternatively, you can install it directly by running diff --git a/pspamm.py b/pspamm.py index c6feffc..75b62ce 100755 --- a/pspamm.py +++ b/pspamm.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 if __name__=='__main__': - import pspamm.cli - pspamm.cli.main() + import pypspamm.cli + pypspamm.cli.main() diff --git a/pspamm/codegen/architectures/__init__.py b/pspamm/codegen/architectures/__init__.py deleted file mode 100644 index 6962d5a..0000000 --- a/pspamm/codegen/architectures/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from pspamm.codegen.architectures.arm_sve.generator import * -from pspamm.codegen.architectures.arm_sve.inlineprinter import * -from pspamm.codegen.architectures.arm_sve.operands import * - -from pspamm.codegen.architectures.arm.generator import * -from pspamm.codegen.architectures.arm.inlineprinter import * -from pspamm.codegen.architectures.arm.operands import * - -from pspamm.codegen.architectures.knl.generator import * -from pspamm.codegen.architectures.knl.inlineprinter import * -from pspamm.codegen.architectures.knl.operands import * - -from pspamm.codegen.architectures.hsw.generator import * -from pspamm.codegen.architectures.hsw.inlineprinter import * -from pspamm.codegen.architectures.hsw.operands import * diff --git a/pspamm/cursors/__init__.py b/pspamm/cursors/__init__.py deleted file mode 100644 index 5805965..0000000 --- a/pspamm/cursors/__init__.py +++ 
/dev/null @@ -1,6 +0,0 @@ -from pspamm.cursors.matrix import Matrix -from pspamm.cursors.coords import Coords -from pspamm.cursors.abstractcursor import BlockInfo, CursorLocation, Cursor -from pspamm.cursors.blockcursor import BlockCursor, sparse_mask -from pspamm.cursors.densecursor import DenseCursor - diff --git a/pyproject.toml b/pyproject.toml index 7c04566..4b1419a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,8 +27,8 @@ dynamic = ["version", "readme", "dependencies"] [tool.setuptools.dynamic] readme = {file = ["README.md"], content-type = "text/markdown"} -version = {file = ["pspamm/VERSION"]} +version = {file = ["pypspamm/VERSION"]} dependencies = {file = ["requirements.txt"]} [project.scripts] -pspamm-generator = "pspamm.cli:main" +pspamm-generator = "pypspamm.cli:main" diff --git a/pspamm/VERSION b/pypspamm/VERSION similarity index 100% rename from pspamm/VERSION rename to pypspamm/VERSION diff --git a/pspamm/__init__.py b/pypspamm/__init__.py similarity index 100% rename from pspamm/__init__.py rename to pypspamm/__init__.py diff --git a/pspamm/architecture.py b/pypspamm/architecture.py similarity index 100% rename from pspamm/architecture.py rename to pypspamm/architecture.py diff --git a/pspamm/cli.py b/pypspamm/cli.py similarity index 93% rename from pspamm/cli.py rename to pypspamm/cli.py index ab594f1..31cf9b8 100755 --- a/pspamm/cli.py +++ b/pypspamm/cli.py @@ -2,14 +2,14 @@ import argparse -import pspamm.architecture +import pypspamm.architecture -from pspamm.matmul import * +from pypspamm.matmul import * -from pspamm.codegen.ccode import * -from pspamm.codegen.architectures import * +from pypspamm.codegen.ccode import * +from pypspamm.codegen.architectures import * -from pspamm.metagen.metagen import * +from pypspamm.metagen.metagen import * mtx_formats = ['any','csc','csr','bsc','bsr','bcsc','bcsr'] diff --git a/pspamm/codegen/analysis.py b/pypspamm/codegen/analysis.py similarity index 82% rename from pspamm/codegen/analysis.py rename 
to pypspamm/codegen/analysis.py index 0157744..7ed70b8 100644 --- a/pspamm/codegen/analysis.py +++ b/pypspamm/codegen/analysis.py @@ -1,5 +1,5 @@ -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.sugar import * +from pypspamm.codegen.visitor import Visitor +from pypspamm.codegen.sugar import * from typing import List, Set diff --git a/pypspamm/codegen/architectures/__init__.py b/pypspamm/codegen/architectures/__init__.py new file mode 100644 index 0000000..317a44c --- /dev/null +++ b/pypspamm/codegen/architectures/__init__.py @@ -0,0 +1,15 @@ +from pypspamm.codegen.architectures.arm_sve.generator import * +from pypspamm.codegen.architectures.arm_sve.inlineprinter import * +from pypspamm.codegen.architectures.arm_sve.operands import * + +from pypspamm.codegen.architectures.arm.generator import * +from pypspamm.codegen.architectures.arm.inlineprinter import * +from pypspamm.codegen.architectures.arm.operands import * + +from pypspamm.codegen.architectures.knl.generator import * +from pypspamm.codegen.architectures.knl.inlineprinter import * +from pypspamm.codegen.architectures.knl.operands import * + +from pypspamm.codegen.architectures.hsw.generator import * +from pypspamm.codegen.architectures.hsw.inlineprinter import * +from pypspamm.codegen.architectures.hsw.operands import * diff --git a/pspamm/codegen/architectures/arm/blocksize.py b/pypspamm/codegen/architectures/arm/blocksize.py similarity index 100% rename from pspamm/codegen/architectures/arm/blocksize.py rename to pypspamm/codegen/architectures/arm/blocksize.py diff --git a/pspamm/codegen/architectures/arm/generator.py b/pypspamm/codegen/architectures/arm/generator.py similarity index 98% rename from pspamm/codegen/architectures/arm/generator.py rename to pypspamm/codegen/architectures/arm/generator.py index ebeb71c..2919ff9 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pypspamm/codegen/architectures/arm/generator.py @@ -1,10 +1,10 @@ -from pspamm.cursors import * +from 
pypspamm.cursors import * -from pspamm.codegen.architectures.arm.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * +from pypspamm.codegen.architectures.arm.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.sugar import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * class Generator(AbstractGenerator): diff --git a/pspamm/codegen/architectures/arm/inlineprinter.py b/pypspamm/codegen/architectures/arm/inlineprinter.py similarity index 97% rename from pspamm/codegen/architectures/arm/inlineprinter.py rename to pypspamm/codegen/architectures/arm/inlineprinter.py index a8a8bc5..9704ffc 100644 --- a/pspamm/codegen/architectures/arm/inlineprinter.py +++ b/pypspamm/codegen/architectures/arm/inlineprinter.py @@ -1,8 +1,8 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.visitor import Visitor +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * class InlinePrinter(Visitor): diff --git a/pspamm/codegen/architectures/arm/operands.py b/pypspamm/codegen/architectures/arm/operands.py similarity index 98% rename from pspamm/codegen/architectures/arm/operands.py rename to pypspamm/codegen/architectures/arm/operands.py index 9efa435..481f9fa 100644 --- a/pspamm/codegen/architectures/arm/operands.py +++ b/pypspamm/codegen/architectures/arm/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_ARM: diff --git a/pspamm/codegen/architectures/arm_sve/blocksize.py b/pypspamm/codegen/architectures/arm_sve/blocksize.py similarity index 100% rename from pspamm/codegen/architectures/arm_sve/blocksize.py rename to 
pypspamm/codegen/architectures/arm_sve/blocksize.py diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pypspamm/codegen/architectures/arm_sve/generator.py similarity index 98% rename from pspamm/codegen/architectures/arm_sve/generator.py rename to pypspamm/codegen/architectures/arm_sve/generator.py index bafa122..50b8c23 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pypspamm/codegen/architectures/arm_sve/generator.py @@ -1,10 +1,10 @@ -from pspamm.cursors import * +from pypspamm.cursors import * -from pspamm.codegen.architectures.arm_sve.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * +from pypspamm.codegen.architectures.arm_sve.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.sugar import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * class Generator(AbstractGenerator): diff --git a/pspamm/codegen/architectures/arm_sve/inlineprinter.py b/pypspamm/codegen/architectures/arm_sve/inlineprinter.py similarity index 98% rename from pspamm/codegen/architectures/arm_sve/inlineprinter.py rename to pypspamm/codegen/architectures/arm_sve/inlineprinter.py index 27d3dc8..33cb70a 100644 --- a/pspamm/codegen/architectures/arm_sve/inlineprinter.py +++ b/pypspamm/codegen/architectures/arm_sve/inlineprinter.py @@ -1,8 +1,8 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.visitor import Visitor +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * class InlinePrinter(Visitor): diff --git a/pspamm/codegen/architectures/arm_sve/operands.py b/pypspamm/codegen/architectures/arm_sve/operands.py similarity index 98% rename from 
pspamm/codegen/architectures/arm_sve/operands.py rename to pypspamm/codegen/architectures/arm_sve/operands.py index 90328ff..e64f523 100644 --- a/pspamm/codegen/architectures/arm_sve/operands.py +++ b/pypspamm/codegen/architectures/arm_sve/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_ARM: diff --git a/pspamm/codegen/architectures/hsw/blocksize.py b/pypspamm/codegen/architectures/hsw/blocksize.py similarity index 100% rename from pspamm/codegen/architectures/hsw/blocksize.py rename to pypspamm/codegen/architectures/hsw/blocksize.py diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pypspamm/codegen/architectures/hsw/generator.py similarity index 97% rename from pspamm/codegen/architectures/hsw/generator.py rename to pypspamm/codegen/architectures/hsw/generator.py index d1caba2..3bacf69 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pypspamm/codegen/architectures/hsw/generator.py @@ -1,11 +1,11 @@ -from pspamm.cursors import * - -from pspamm.codegen.architectures.hsw.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * -from pspamm.codegen.regcache import * +from pypspamm.cursors import * + +from pypspamm.codegen.architectures.hsw.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.sugar import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.regcache import * class Generator(AbstractGenerator): template = """ diff --git a/pspamm/codegen/architectures/hsw/inlineprinter.py b/pypspamm/codegen/architectures/hsw/inlineprinter.py similarity index 97% rename from pspamm/codegen/architectures/hsw/inlineprinter.py rename to pypspamm/codegen/architectures/hsw/inlineprinter.py index a1d6fc2..3141e51 100644 --- a/pspamm/codegen/architectures/hsw/inlineprinter.py +++ 
b/pypspamm/codegen/architectures/hsw/inlineprinter.py @@ -1,8 +1,8 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.visitor import Visitor +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * class InlinePrinter(Visitor): diff --git a/pspamm/codegen/architectures/hsw/operands.py b/pypspamm/codegen/architectures/hsw/operands.py similarity index 97% rename from pspamm/codegen/architectures/hsw/operands.py rename to pypspamm/codegen/architectures/hsw/operands.py index d26ba47..84a693f 100644 --- a/pspamm/codegen/architectures/hsw/operands.py +++ b/pypspamm/codegen/architectures/hsw/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_HSW: diff --git a/pspamm/codegen/architectures/knl/blocksize.py b/pypspamm/codegen/architectures/knl/blocksize.py similarity index 100% rename from pspamm/codegen/architectures/knl/blocksize.py rename to pypspamm/codegen/architectures/knl/blocksize.py diff --git a/pspamm/codegen/architectures/knl/generator.py b/pypspamm/codegen/architectures/knl/generator.py similarity index 97% rename from pspamm/codegen/architectures/knl/generator.py rename to pypspamm/codegen/architectures/knl/generator.py index f039f80..4269365 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pypspamm/codegen/architectures/knl/generator.py @@ -1,11 +1,11 @@ -from pspamm.cursors import * - -from pspamm.codegen.architectures.knl.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * -from pspamm.codegen.regcache import * +from pypspamm.cursors import * + +from pypspamm.codegen.architectures.knl.operands import * +from pypspamm.codegen.ast import * +from 
pypspamm.codegen.sugar import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.regcache import * class Generator(AbstractGenerator): template = """ diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pypspamm/codegen/architectures/knl/inlineprinter.py similarity index 97% rename from pspamm/codegen/architectures/knl/inlineprinter.py rename to pypspamm/codegen/architectures/knl/inlineprinter.py index 3de0659..22fc81a 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pypspamm/codegen/architectures/knl/inlineprinter.py @@ -1,8 +1,8 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.visitor import Visitor +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * class InlinePrinter(Visitor): diff --git a/pspamm/codegen/architectures/knl/operands.py b/pypspamm/codegen/architectures/knl/operands.py similarity index 98% rename from pspamm/codegen/architectures/knl/operands.py rename to pypspamm/codegen/architectures/knl/operands.py index c9ed45e..03b6613 100644 --- a/pspamm/codegen/architectures/knl/operands.py +++ b/pypspamm/codegen/architectures/knl/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_KNL: diff --git a/pspamm/codegen/architectures/lsx/blocksize.py b/pypspamm/codegen/architectures/lsx/blocksize.py similarity index 100% rename from pspamm/codegen/architectures/lsx/blocksize.py rename to pypspamm/codegen/architectures/lsx/blocksize.py diff --git a/pspamm/codegen/architectures/lsx/generator.py b/pypspamm/codegen/architectures/lsx/generator.py similarity index 97% rename from pspamm/codegen/architectures/lsx/generator.py rename to pypspamm/codegen/architectures/lsx/generator.py index 
836277d..a3625b8 100644 --- a/pspamm/codegen/architectures/lsx/generator.py +++ b/pypspamm/codegen/architectures/lsx/generator.py @@ -1,11 +1,11 @@ -from pspamm.cursors import * - -from pspamm.codegen.architectures.lsx.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * -from pspamm.codegen.regcache import * +from pypspamm.cursors import * + +from pypspamm.codegen.architectures.lsx.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.sugar import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.regcache import * class Generator(AbstractGenerator): template = """ diff --git a/pspamm/codegen/architectures/lsx/inlineprinter.py b/pypspamm/codegen/architectures/lsx/inlineprinter.py similarity index 98% rename from pspamm/codegen/architectures/lsx/inlineprinter.py rename to pypspamm/codegen/architectures/lsx/inlineprinter.py index fe13715..0926e79 100644 --- a/pspamm/codegen/architectures/lsx/inlineprinter.py +++ b/pypspamm/codegen/architectures/lsx/inlineprinter.py @@ -1,8 +1,8 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.visitor import Visitor +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * class InlinePrinter(Visitor): diff --git a/pspamm/codegen/architectures/lsx/operands.py b/pypspamm/codegen/architectures/lsx/operands.py similarity index 97% rename from pspamm/codegen/architectures/lsx/operands.py rename to pypspamm/codegen/architectures/lsx/operands.py index 5267726..4921c9c 100644 --- a/pspamm/codegen/architectures/lsx/operands.py +++ b/pypspamm/codegen/architectures/lsx/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * 
+from pypspamm.codegen.operands import * class Operand_LSX: diff --git a/pspamm/codegen/architectures/rvv/blocksize.py b/pypspamm/codegen/architectures/rvv/blocksize.py similarity index 100% rename from pspamm/codegen/architectures/rvv/blocksize.py rename to pypspamm/codegen/architectures/rvv/blocksize.py diff --git a/pspamm/codegen/architectures/rvv/generator.py b/pypspamm/codegen/architectures/rvv/generator.py similarity index 98% rename from pspamm/codegen/architectures/rvv/generator.py rename to pypspamm/codegen/architectures/rvv/generator.py index 1efe040..51503a4 100644 --- a/pspamm/codegen/architectures/rvv/generator.py +++ b/pypspamm/codegen/architectures/rvv/generator.py @@ -1,10 +1,10 @@ -from pspamm.cursors import * +from pypspamm.cursors import * -from pspamm.codegen.architectures.rvv.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * +from pypspamm.codegen.architectures.rvv.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.sugar import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * class Generator(AbstractGenerator): diff --git a/pspamm/codegen/architectures/rvv/inlineprinter.py b/pypspamm/codegen/architectures/rvv/inlineprinter.py similarity index 98% rename from pspamm/codegen/architectures/rvv/inlineprinter.py rename to pypspamm/codegen/architectures/rvv/inlineprinter.py index 527eac8..8c0351f 100644 --- a/pspamm/codegen/architectures/rvv/inlineprinter.py +++ b/pypspamm/codegen/architectures/rvv/inlineprinter.py @@ -1,8 +1,8 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.visitor import Visitor +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * 
class InlinePrinter(Visitor): diff --git a/pspamm/codegen/architectures/rvv/operands.py b/pypspamm/codegen/architectures/rvv/operands.py similarity index 97% rename from pspamm/codegen/architectures/rvv/operands.py rename to pypspamm/codegen/architectures/rvv/operands.py index 54f4bda..603b227 100644 --- a/pspamm/codegen/architectures/rvv/operands.py +++ b/pypspamm/codegen/architectures/rvv/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_RV: diff --git a/pspamm/codegen/ast.py b/pypspamm/codegen/ast.py similarity index 98% rename from pspamm/codegen/ast.py rename to pypspamm/codegen/ast.py index ba743b4..8044a3b 100644 --- a/pspamm/codegen/ast.py +++ b/pypspamm/codegen/ast.py @@ -1,9 +1,9 @@ from typing import List, TYPE_CHECKING -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * if TYPE_CHECKING: - from pspamm.codegen.arm.visitors import Visitor + from pypspamm.codegen.arm.visitors import Visitor class AsmStmt: diff --git a/pspamm/codegen/ccode.py b/pypspamm/codegen/ccode.py similarity index 76% rename from pspamm/codegen/ccode.py rename to pypspamm/codegen/ccode.py index 7ca2b6d..ee14125 100644 --- a/pspamm/codegen/ccode.py +++ b/pypspamm/codegen/ccode.py @@ -1,12 +1,12 @@ -from pspamm.codegen.ast import * -from pspamm.codegen.analysis import * -from pspamm.codegen.precision import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.analysis import * +from pypspamm.codegen.precision import * -import pspamm.architecture +import pypspamm.architecture def make_cfunc(funcName:str, template:str, body:Block, flop:int, starting_regs:List[Register], precision: Precision) -> str: - Printer_class = pspamm.architecture.get_class("pspamm.codegen.architectures." + pspamm.architecture.arch + ".inlineprinter").InlinePrinter + Printer_class = pypspamm.architecture.get_class("pypspamm.codegen.architectures." 
+ pypspamm.architecture.arch + ".inlineprinter").InlinePrinter printer = Printer_class(precision) printer.lmargin = 4 diff --git a/pspamm/codegen/forms.py b/pypspamm/codegen/forms.py similarity index 98% rename from pspamm/codegen/forms.py rename to pypspamm/codegen/forms.py index 1b8c44a..dd9e50a 100644 --- a/pspamm/codegen/forms.py +++ b/pypspamm/codegen/forms.py @@ -1,6 +1,6 @@ from typing import List -from pspamm.codegen.sugar import * +from pypspamm.codegen.sugar import * # TODO: We might eventually want to make this part of our syntax tree # in order to do unrolls and other fancy stuff with it diff --git a/pspamm/codegen/generator.py b/pypspamm/codegen/generator.py similarity index 95% rename from pspamm/codegen/generator.py rename to pypspamm/codegen/generator.py index 1eeeca5..70e782b 100644 --- a/pspamm/codegen/generator.py +++ b/pypspamm/codegen/generator.py @@ -1,6 +1,6 @@ -from pspamm.cursors import * -from pspamm.codegen.ast import * -from pspamm.codegen.precision import * +from pypspamm.cursors import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.precision import * from abc import ABC, abstractmethod class AbstractGenerator(ABC): diff --git a/pspamm/codegen/operands.py b/pypspamm/codegen/operands.py similarity index 100% rename from pspamm/codegen/operands.py rename to pypspamm/codegen/operands.py diff --git a/pspamm/codegen/precision.py b/pypspamm/codegen/precision.py similarity index 100% rename from pspamm/codegen/precision.py rename to pypspamm/codegen/precision.py diff --git a/pspamm/codegen/prune.py b/pypspamm/codegen/prune.py similarity index 100% rename from pspamm/codegen/prune.py rename to pypspamm/codegen/prune.py diff --git a/pspamm/codegen/regcache.py b/pypspamm/codegen/regcache.py similarity index 100% rename from pspamm/codegen/regcache.py rename to pypspamm/codegen/regcache.py diff --git a/pspamm/codegen/schedule.py b/pypspamm/codegen/schedule.py similarity index 100% rename from pspamm/codegen/schedule.py rename to 
pypspamm/codegen/schedule.py diff --git a/pspamm/codegen/sugar.py b/pypspamm/codegen/sugar.py similarity index 84% rename from pspamm/codegen/sugar.py rename to pypspamm/codegen/sugar.py index 178c75e..5f90ccd 100644 --- a/pspamm/codegen/sugar.py +++ b/pypspamm/codegen/sugar.py @@ -1,14 +1,14 @@ from typing import Union -from pspamm.codegen.ast import * -from pspamm.codegen.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.operands import * -import pspamm.architecture +import pypspamm.architecture # Convenient statement constructors def add(src: Union[Operand, int], dest: Register, comment: str = None, additional: Register = None): stmt = AddStmt() - stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) + stmt.src = src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) stmt.dest = dest stmt.comment = comment stmt.additional = additional @@ -16,7 +16,7 @@ def add(src: Union[Operand, int], dest: Register, comment: str = None, additiona def label(name: str): stmt = LabelStmt() - stmt.label = pspamm.architecture.operands.l(name) + stmt.label = pypspamm.architecture.operands.l(name) return stmt def fma(bcast_src: Register, mult_src: Register, add_dest: Register, comment: str = None, bcast: Union[int, None] = None, pred: Register = None, sub=False): @@ -50,19 +50,19 @@ def bcst(bcast_src: Register, dest: Register, comment: str = None): def cmp(lhs: Union[Operand, int], rhs: Union[Operand, int]): stmt = CmpStmt() - stmt.lhs = lhs if isinstance(lhs, Operand) else pspamm.architecture.operands.c(lhs) - stmt.rhs = rhs if isinstance(rhs, Operand) else pspamm.architecture.operands.c(rhs) + stmt.lhs = lhs if isinstance(lhs, Operand) else pypspamm.architecture.operands.c(lhs) + stmt.rhs = rhs if isinstance(rhs, Operand) else pypspamm.architecture.operands.c(rhs) return stmt def jump(label: str, cmpreg = None, backwards=True): stmt = JumpStmt() - stmt.destination = pspamm.architecture.operands.l(label) + 
stmt.destination = pypspamm.architecture.operands.l(label) stmt.cmpreg = cmpreg return stmt def mov(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, pred = None, expand=None, temp=None): stmt = MovStmt() - stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) + stmt.src = src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) stmt.dest = dest stmt.comment = comment stmt.pred = pred @@ -86,7 +86,7 @@ def lea(src: Register, dest: Operand, offset: int, comment:str = None): def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, dest2: Operand = None, pred: Register = None, is_B: bool = False, scalar_offs: bool = False, add_reg: AsmType.i64 = None, sub128: bool = False, expand=None, dest3: Operand = None, dest4: Operand = None): stmt = LoadStmt() - stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) + stmt.src = src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) stmt.dest = dest stmt.dest2 = dest2 stmt.dest3 = dest3 @@ -112,7 +112,7 @@ def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None def st(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, src2: Operand = None, pred: Register = None, scalar_offs: bool = False, add_reg: AsmType.i64 = None, expand=None, src3: Operand=None, src4: Operand=None): stmt = StoreStmt() - stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) + stmt.src = src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) stmt.src2 = src2 stmt.src3 = src3 stmt.src4 = src4 @@ -146,14 +146,14 @@ def prefetch(dest: Operand, comment:str = None, pred: Register = None, precision def data(value: Union[Operand, int], asmType=AsmType.i64): stmt = DataStmt() - stmt.value = value if isinstance(value, Operand) else pspamm.architecture.operands.c(value) + stmt.value = value if isinstance(value, Operand) else 
pypspamm.architecture.operands.c(value) stmt.asmType = asmType return stmt def rvsetvl(actual: Register, requested: Union[Register, int]): stmt = RVSetVLStmt() stmt.actual = actual - stmt.requested = requested if isinstance(requested, Operand) else pspamm.architecture.operands.c(requested) + stmt.requested = requested if isinstance(requested, Operand) else pypspamm.architecture.operands.c(requested) return stmt # Fluent interface diff --git a/pspamm/codegen/virtual.py b/pypspamm/codegen/virtual.py similarity index 100% rename from pspamm/codegen/virtual.py rename to pypspamm/codegen/virtual.py diff --git a/pspamm/codegen/visitor.py b/pypspamm/codegen/visitor.py similarity index 97% rename from pspamm/codegen/visitor.py rename to pypspamm/codegen/visitor.py index 7e6f941..8ae6a72 100644 --- a/pspamm/codegen/visitor.py +++ b/pypspamm/codegen/visitor.py @@ -1,4 +1,4 @@ -from pspamm.codegen.ast import * +from pypspamm.codegen.ast import * class Visitor: diff --git a/pypspamm/cursors/__init__.py b/pypspamm/cursors/__init__.py new file mode 100644 index 0000000..a21a3a4 --- /dev/null +++ b/pypspamm/cursors/__init__.py @@ -0,0 +1,6 @@ +from pypspamm.cursors.matrix import Matrix +from pypspamm.cursors.coords import Coords +from pypspamm.cursors.abstractcursor import BlockInfo, CursorLocation, Cursor +from pypspamm.cursors.blockcursor import BlockCursor, sparse_mask +from pypspamm.cursors.densecursor import DenseCursor + diff --git a/pspamm/cursors/abstractcursor.py b/pypspamm/cursors/abstractcursor.py similarity index 90% rename from pspamm/cursors/abstractcursor.py rename to pypspamm/cursors/abstractcursor.py index 3481830..8f43283 100644 --- a/pspamm/cursors/abstractcursor.py +++ b/pypspamm/cursors/abstractcursor.py @@ -1,8 +1,8 @@ -from pspamm.cursors.matrix import Matrix -from pspamm.cursors.coords import Coords +from pypspamm.cursors.matrix import Matrix +from pypspamm.cursors.coords import Coords -from pspamm.codegen.operands import * -from pspamm.codegen.ast import 
AsmStmt, Command +from pypspamm.codegen.operands import * +from pypspamm.codegen.ast import AsmStmt, Command from typing import List, Tuple diff --git a/pspamm/cursors/blockcursor.py b/pypspamm/cursors/blockcursor.py similarity index 96% rename from pspamm/cursors/blockcursor.py rename to pypspamm/cursors/blockcursor.py index b9109b4..8126117 100644 --- a/pspamm/cursors/blockcursor.py +++ b/pypspamm/cursors/blockcursor.py @@ -1,8 +1,8 @@ -from pspamm.cursors.abstractcursor import * -from pspamm.cursors.matrix import Matrix -from pspamm.cursors.coords import Coords +from pypspamm.cursors.abstractcursor import * +from pypspamm.cursors.matrix import Matrix +from pypspamm.cursors.coords import Coords -from pspamm.codegen.sugar import * +from pypspamm.codegen.sugar import * from typing import cast class BlockCursor(Cursor): @@ -106,7 +106,7 @@ def look(self, offset_bytes = self.offset(src_loc, dest_loc) * self.scalar_bytes comment = f"{self.name}[{dest_block.down},{dest_block.right}][{dest_cell.down},{dest_cell.right}]" - addr = pspamm.architecture.operands.mem(self.base_ptr, offset_bytes) + addr = pypspamm.architecture.operands.mem(self.base_ptr, offset_bytes) return (addr, comment) diff --git a/pspamm/cursors/coords.py b/pypspamm/cursors/coords.py similarity index 100% rename from pspamm/cursors/coords.py rename to pypspamm/cursors/coords.py diff --git a/pspamm/cursors/densecursor.py b/pypspamm/cursors/densecursor.py similarity index 96% rename from pspamm/cursors/densecursor.py rename to pypspamm/cursors/densecursor.py index 7a59e6a..92f0384 100644 --- a/pspamm/cursors/densecursor.py +++ b/pypspamm/cursors/densecursor.py @@ -1,7 +1,7 @@ from typing import List, Tuple, cast -from pspamm.codegen.sugar import * -from pspamm.cursors import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * @@ -82,7 +82,7 @@ def look(self, dest_offset_abs = self.offset(src.current_block, dest_block, dest_cell) rel_offset = self.scalar_bytes * (dest_offset_abs - 
src_offset_abs) - addr = pspamm.architecture.operands.mem(self.base_ptr, rel_offset) + addr = pypspamm.architecture.operands.mem(self.base_ptr, rel_offset) return (addr, comment) diff --git a/pspamm/cursors/matrix.py b/pypspamm/cursors/matrix.py similarity index 100% rename from pspamm/cursors/matrix.py rename to pypspamm/cursors/matrix.py diff --git a/pspamm/matmul.py b/pypspamm/matmul.py similarity index 95% rename from pspamm/matmul.py rename to pypspamm/matmul.py index e856ba6..995ebef 100644 --- a/pspamm/matmul.py +++ b/pypspamm/matmul.py @@ -1,16 +1,16 @@ from typing import Tuple -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.forms import * -from pspamm.codegen.precision import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.sugar import * +from pypspamm.codegen.forms import * +from pypspamm.codegen.precision import * -from pspamm.cursors import * +from pypspamm.cursors import * -from pspamm.codegen.virtual import * -from pspamm.codegen.prune import * +from pypspamm.codegen.virtual import * +from pypspamm.codegen.prune import * -import pspamm.architecture +import pypspamm.architecture import numpy @@ -168,13 +168,13 @@ def __init__(self, 'bf16' : Precision.BFLOAT16 }[precision.lower()] - pspamm.architecture.init() - pspamm.architecture.arch = arch - pspamm.architecture.Generator = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".generator").Generator - pspamm.architecture.operands = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".operands") - pspamm.architecture.blocksize = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".blocksize").Default + pypspamm.architecture.init() + pypspamm.architecture.arch = arch + pypspamm.architecture.Generator = pypspamm.architecture.get_class("pypspamm.codegen.architectures." 
+ arch + ".generator").Generator + pypspamm.architecture.operands = pypspamm.architecture.get_class("pypspamm.codegen.architectures." + arch + ".operands") + pypspamm.architecture.blocksize = pypspamm.architecture.get_class("pypspamm.codegen.architectures." + arch + ".blocksize").Default - self.generator = pspamm.architecture.Generator(self.precision) + self.generator = pypspamm.architecture.Generator(self.precision) # flag that determines if a matmul kernel uses sve instructions -> needed for sve predicates self.masks = self.generator.has_masks() @@ -189,7 +189,7 @@ def __init__(self, bk = 2 if arch == 'knl' else 1 if bm == None or bn == None: - (self.bm, self.bn, self.bk) = pspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) + (self.bm, self.bn, self.bk) = pypspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) else: self.bm = bm self.bn = bn diff --git a/pspamm/metagen/arm.py b/pypspamm/metagen/arm.py similarity index 100% rename from pspamm/metagen/arm.py rename to pypspamm/metagen/arm.py diff --git a/pspamm/metagen/metagen.py b/pypspamm/metagen/metagen.py similarity index 96% rename from pspamm/metagen/metagen.py rename to pypspamm/metagen/metagen.py index 1b4408a..e4416f6 100644 --- a/pspamm/metagen/metagen.py +++ b/pypspamm/metagen/metagen.py @@ -1,5 +1,5 @@ -from pspamm.matmul import MatMul -from pspamm.codegen.ccode import * +from pypspamm.matmul import MatMul +from pypspamm.codegen.ccode import * class MetaGenerator: def __init__(self): diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index 45d0376..3ba455b 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -4,7 +4,7 @@ import random import sys import os.path -from pspamm.codegen.precision import * +from pypspamm.codegen.precision import * BASEDIR = 'build' diff --git a/tests/unit_test.py b/tests/unit_test.py index 5bf40c6..12575d3 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -3,7 
+3,7 @@ import testsuite_generator as generator from importlib import import_module -from pspamm.codegen.precision import * +from pypspamm.codegen.precision import * import sys import re @@ -16,7 +16,7 @@ archname = parsedarch.group('name') archprec = parsedarch.group('prec') -blocksize = import_module("pspamm.codegen.architectures." + archname + ".blocksize") +blocksize = import_module("pypspamm.codegen.architectures." + archname + ".blocksize") scripts = { "arm": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxK, blocksize.Cube], From 191d2ac98328a7b85943dc6133ad9608051e0c97 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 29 Jul 2025 19:12:26 +0200 Subject: [PATCH 2/9] Add setup.py; restrict authors to significant contributors --- LICENSE | 1 + pyproject.toml | 2 -- setup.py | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 setup.py diff --git a/LICENSE b/LICENSE index 4a67232..5b240a5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,6 @@ BSD 3-Clause License +Copyright (c) 2018-2025 SeisSol Group Copyright (c) 2018, Peter Wauligmann, Nathan Brei All rights reserved. 
diff --git a/pyproject.toml b/pyproject.toml index 4b1419a..2a439aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,8 +12,6 @@ authors = [ { name = "Nathan Brei", email = "nathan.w.brei@gmail.com" }, { name = "Alex Puscas", email = "alex-puscas@gmx.de" }, { name = "David Schneller", email = "david.schneller@tum.de" }, - { name = "Lukas Krenz", email = "lukas@krenz.land" }, - { name = "Carsten Uphoff", email = "uphoff@in.tum.de" }, ] maintainers = [ { name = "David Schneller", email = "david.schneller@tum.de" }, diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..71ce9a5 --- /dev/null +++ b/setup.py @@ -0,0 +1,36 @@ +import setuptools + +with open("pypspamm/VERSION", "r") as fh: + current_version = fh.read().strip() + +with open("README.md", "r") as fh: + long_description = fh.read() + +with open("requirements.txt", "r") as fh: + install_requires = [s.strip() for s in fh.readlines() if s.strip() != ''] + +setuptools.setup( + name="PspaMM", + version=current_version, + license="BSD-3-Clause", + author="Peter Wauligmann, Nathan Brei, Alex Puscas, David Schneller", + author_email="david.schneller@tum.de", + description="An inline assembly generator for sparse matrix multiplications", + long_description=long_description, + long_description_content_type="text/markdown", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + ], + url="https://github.com/pspamm/pspamm", + python_requires='>=3.7', + install_requires=install_requires, + include_package_data=True, + entry_points={ + "console_scripts": [ + "pspamm-generator = pypspamm.cli:main", + ] + } +) From 2967da7bbc1eaea042b0dab0f4f58c683348f68c Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 29 Jul 2025 19:14:40 +0200 Subject: [PATCH 3/9] Apply black+isort --- pspamm.py | 3 +- pypspamm/architecture.py | 4 +- pypspamm/cli.py | 55 +- pypspamm/codegen/analysis.py 
| 5 +- pypspamm/codegen/architectures/__init__.py | 15 +- .../codegen/architectures/arm/blocksize.py | 50 +- .../codegen/architectures/arm/generator.py | 312 ++- .../architectures/arm/inlineprinter.py | 53 +- .../codegen/architectures/arm/operands.py | 19 +- .../architectures/arm_sve/blocksize.py | 31 +- .../architectures/arm_sve/generator.py | 519 +++-- .../architectures/arm_sve/inlineprinter.py | 59 +- .../codegen/architectures/arm_sve/operands.py | 11 +- .../codegen/architectures/hsw/blocksize.py | 33 +- .../codegen/architectures/hsw/generator.py | 326 ++- .../architectures/hsw/inlineprinter.py | 61 +- .../codegen/architectures/hsw/operands.py | 33 +- .../codegen/architectures/knl/blocksize.py | 42 +- .../codegen/architectures/knl/generator.py | 247 ++- .../architectures/knl/inlineprinter.py | 58 +- .../codegen/architectures/knl/operands.py | 35 +- .../codegen/architectures/lsx/blocksize.py | 15 +- .../codegen/architectures/lsx/generator.py | 255 ++- .../architectures/lsx/inlineprinter.py | 94 +- .../codegen/architectures/lsx/operands.py | 34 +- .../codegen/architectures/rvv/blocksize.py | 16 +- .../codegen/architectures/rvv/generator.py | 330 ++- .../architectures/rvv/inlineprinter.py | 62 +- .../codegen/architectures/rvv/operands.py | 16 +- pypspamm/codegen/ast.py | 203 +- pypspamm/codegen/ccode.py | 41 +- pypspamm/codegen/forms.py | 95 +- pypspamm/codegen/generator.py | 66 +- pypspamm/codegen/operands.py | 60 +- pypspamm/codegen/precision.py | 48 +- pypspamm/codegen/prune.py | 15 +- pypspamm/codegen/regcache.py | 3 +- pypspamm/codegen/schedule.py | 56 +- pypspamm/codegen/sugar.py | 129 +- pypspamm/codegen/virtual.py | 40 +- pypspamm/codegen/visitor.py | 1 + pypspamm/cursors/__init__.py | 7 +- pypspamm/cursors/abstractcursor.py | 53 +- pypspamm/cursors/blockcursor.py | 176 +- pypspamm/cursors/coords.py | 23 +- pypspamm/cursors/densecursor.py | 101 +- pypspamm/cursors/matrix.py | 40 +- pypspamm/matmul.py | 725 ++++-- pypspamm/metagen/arm.py | 13 +- 
pypspamm/metagen/metagen.py | 52 +- setup.py | 52 +- tests/testsuite_generator.py | 217 +- tests/unit_test.py | 1976 +++++++++++++++-- 53 files changed, 5117 insertions(+), 1868 deletions(-) diff --git a/pspamm.py b/pspamm.py index 75b62ce..bbf1583 100755 --- a/pspamm.py +++ b/pspamm.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 -if __name__=='__main__': +if __name__ == "__main__": import pypspamm.cli + pypspamm.cli.main() diff --git a/pypspamm/architecture.py b/pypspamm/architecture.py index c6f8f35..03e55b7 100755 --- a/pypspamm/architecture.py +++ b/pypspamm/architecture.py @@ -1,5 +1,6 @@ from importlib import import_module + def init(): global arch global generator @@ -8,5 +9,6 @@ def init(): generator = None operands = None -def get_class( kls ): + +def get_class(kls): return import_module(kls) diff --git a/pypspamm/cli.py b/pypspamm/cli.py index 31cf9b8..1255acb 100755 --- a/pypspamm/cli.py +++ b/pypspamm/cli.py @@ -3,16 +3,12 @@ import argparse import pypspamm.architecture - -from pypspamm.matmul import * - -from pypspamm.codegen.ccode import * from pypspamm.codegen.architectures import * - +from pypspamm.codegen.ccode import * +from pypspamm.matmul import * from pypspamm.metagen.metagen import * - -mtx_formats = ['any','csc','csr','bsc','bsr','bcsc','bcsr'] +mtx_formats = ["any", "csc", "csr", "bsc", "bsr", "bcsc", "bcsr"] def generate(alg: MatMul) -> None: @@ -29,16 +25,21 @@ def generate(alg: MatMul) -> None: f.write(text) - def main() -> None: - parser = argparse.ArgumentParser(description='Generate a sparse matrix multiplication algorithm for C = alpha * A * B + beta * C.') + parser = argparse.ArgumentParser( + description="Generate a sparse matrix multiplication algorithm for C = alpha * A * B + beta * C." 
+ ) parser.add_argument("m", type=int, help="Number of rows of A and C") parser.add_argument("n", type=int, help="Number of cols of B and C") parser.add_argument("k", type=int, help="Number of cols of A, rows of B") - parser.add_argument("lda", type=int, help="Leading dimension of A (zero if A is sparse)") - parser.add_argument("ldb", type=int, help="Leading dimension of B (zero if B is sparse)") + parser.add_argument( + "lda", type=int, help="Leading dimension of A (zero if A is sparse)" + ) + parser.add_argument( + "ldb", type=int, help="Leading dimension of B (zero if B is sparse)" + ) parser.add_argument("ldc", type=int, help="Leading dimension of C") parser.add_argument("alpha", type=str, help="alpha, 1.0 or generic") @@ -50,23 +51,41 @@ def main() -> None: parser.add_argument("--bk", type=int, help="Size of k-blocks") parser.add_argument("--arch", help="Architecture", default="knl") - parser.add_argument("--precision", help="Precision of the matrix multiplication, either half (h), single (s), or double (d)", default="d") + parser.add_argument( + "--precision", + help="Precision of the matrix multiplication, either half (h), single (s), or double (d)", + default="d", + ) parser.add_argument("--prefetching", help="Prefetching") - parser.add_argument("--mtx_filename", help="Path to MTX file describing the sparse matrix") - parser.add_argument("--mtx_format", help="Constraint on sparsity pattern", choices=mtx_formats, default="Any") - - parser.add_argument("--amtx_filename", help="Path to MTX file describing the sparse matrix") - parser.add_argument("--bmtx_filename", help="Path to MTX file describing the sparse matrix") + parser.add_argument( + "--mtx_filename", help="Path to MTX file describing the sparse matrix" + ) + parser.add_argument( + "--mtx_format", + help="Constraint on sparsity pattern", + choices=mtx_formats, + default="Any", + ) + + parser.add_argument( + "--amtx_filename", help="Path to MTX file describing the sparse matrix" + ) + 
parser.add_argument( + "--bmtx_filename", help="Path to MTX file describing the sparse matrix" + ) parser.add_argument("--output_funcname", help="Name for generated C++ function") parser.add_argument("--output_filename", help="Path to destination C++ file") - parser.add_argument("--output_overwrite", action="store_true", help="Overwrite output file") + parser.add_argument( + "--output_overwrite", action="store_true", help="Overwrite output file" + ) args = parser.parse_args() alg = MatMul(**args.__dict__) generate(alg) + if __name__ == "__main__": main() diff --git a/pypspamm/codegen/analysis.py b/pypspamm/codegen/analysis.py index 7ed70b8..5ae84ca 100644 --- a/pypspamm/codegen/analysis.py +++ b/pypspamm/codegen/analysis.py @@ -1,7 +1,8 @@ -from pypspamm.codegen.visitor import Visitor +from typing import List, Set + from pypspamm.codegen.sugar import * +from pypspamm.codegen.visitor import Visitor -from typing import List, Set class Analyzer: def __init__(self, starting_regs: List[Register] = None): diff --git a/pypspamm/codegen/architectures/__init__.py b/pypspamm/codegen/architectures/__init__.py index 317a44c..340b8ea 100644 --- a/pypspamm/codegen/architectures/__init__.py +++ b/pypspamm/codegen/architectures/__init__.py @@ -1,15 +1,12 @@ -from pypspamm.codegen.architectures.arm_sve.generator import * -from pypspamm.codegen.architectures.arm_sve.inlineprinter import * -from pypspamm.codegen.architectures.arm_sve.operands import * - from pypspamm.codegen.architectures.arm.generator import * from pypspamm.codegen.architectures.arm.inlineprinter import * from pypspamm.codegen.architectures.arm.operands import * - -from pypspamm.codegen.architectures.knl.generator import * -from pypspamm.codegen.architectures.knl.inlineprinter import * -from pypspamm.codegen.architectures.knl.operands import * - +from pypspamm.codegen.architectures.arm_sve.generator import * +from pypspamm.codegen.architectures.arm_sve.inlineprinter import * +from 
pypspamm.codegen.architectures.arm_sve.operands import * from pypspamm.codegen.architectures.hsw.generator import * from pypspamm.codegen.architectures.hsw.inlineprinter import * from pypspamm.codegen.architectures.hsw.operands import * +from pypspamm.codegen.architectures.knl.generator import * +from pypspamm.codegen.architectures.knl.inlineprinter import * +from pypspamm.codegen.architectures.knl.operands import * diff --git a/pypspamm/codegen/architectures/arm/blocksize.py b/pypspamm/codegen/architectures/arm/blocksize.py index c8e3740..2d98520 100644 --- a/pypspamm/codegen/architectures/arm/blocksize.py +++ b/pypspamm/codegen/architectures/arm/blocksize.py @@ -1,20 +1,19 @@ - class Old: @classmethod - def getBlocksize(cls, m , n, bk, v_size, prec): + def getBlocksize(cls, m, n, bk, v_size, prec): bm = m bn = n - + if cls.ARM_condition(bm, bn, bk, v_size): - while cls.ARM_condition(bm, bn, bk+1, v_size): + while cls.ARM_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) while not cls.ARM_condition(bm, bn, bk, v_size): bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) - while cls.ARM_condition(bm, bn, bk+1, v_size): + while cls.ARM_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -36,7 +35,8 @@ def lowerToNextDiv(cls, m, n, bm, bn, v_size): def ARM_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm + bn*bk <= 32 + return (bn + bk) * vm + bn * bk <= 32 + class Max: @classmethod @@ -45,25 +45,25 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = 1 maxval = 0 - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): if cls.ARM_condition(i, j, bk, v_size): - if i*j > maxval: - maxval = i*j + if i * j > maxval: + maxval = i * j bm = i bn = j - while cls.ARM_condition(bm, bn, bk+1, v_size): + while cls.ARM_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) - @classmethod def ARM_condition(cls, bm, bn, 
bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm + bn*bk <= 32 + return (bn + bk) * vm + bn * bk <= 32 + class MaxK: @classmethod @@ -74,15 +74,15 @@ def getBlocksize(cls, m, n, bk, v_size, prec): elem128 = 16 // prec.size() - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): if cls.ARM_condition(i, j, bk, v_size, elem128): - if i*j > maxval: - maxval = i*j + if i * j > maxval: + maxval = i * j bm = i bn = j - while cls.ARM_condition(bm, bn, bk+1, v_size, elem128): + while cls.ARM_condition(bm, bn, bk + 1, v_size, elem128): bk += 1 return (bm, bn, bk) @@ -92,7 +92,8 @@ def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division vm = -(bm // -v_size) vk = -(bk // -elem128) - return (bn+bk) * vm + bn*vk <= 32 + return (bn + bk) * vm + bn * vk <= 32 + class Cube: @classmethod @@ -103,12 +104,12 @@ def getBlocksize(cls, m, n, bk, v_size, prec): elem128 = 16 // prec.size() - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): for k in range(1, 200): if cls.ARM_condition(i, j, k, v_size, elem128): - if i*j*k > maxval: - maxval = i*j*k + if i * j * k > maxval: + maxval = i * j * k bm = i bn = j bk = k @@ -120,6 +121,7 @@ def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division vm = -(bm // -v_size) vk = -(bk // -elem128) - return (bn+bk) * vm + bn*vk <= 32 + return (bn + bk) * vm + bn * vk <= 32 + Default = MaxK diff --git a/pypspamm/codegen/architectures/arm/generator.py b/pypspamm/codegen/architectures/arm/generator.py index 2919ff9..bd81201 100644 --- a/pypspamm/codegen/architectures/arm/generator.py +++ b/pypspamm/codegen/architectures/arm/generator.py @@ -1,10 +1,9 @@ -from pypspamm.cursors import * - from pypspamm.codegen.architectures.arm.operands import * from pypspamm.codegen.ast import * -from pypspamm.codegen.sugar import * from 
pypspamm.codegen.generator import * from pypspamm.codegen.precision import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * class Generator(AbstractGenerator): @@ -35,27 +34,38 @@ def use_broadcast(self): def has_masks(self): return False - + def init_mask(self, m, bm, v_size, tempreg, maskregs): return block("") - + def make_argument_load(self, starting_regs, prefetch): asm = block("Load arguments") - asm.add(ld(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - asm.add(ld(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(ld(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) - asm.add(ld(InputOperand(f'3', 'm', 'alpha'), starting_regs[3], False)) - asm.add(ld(InputOperand(f'4', 'm', 'beta'), starting_regs[4], False)) + asm.add(ld(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(ld(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(ld(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(ld(InputOperand(f"3", "m", "alpha"), starting_regs[3], False)) + asm.add(ld(InputOperand(f"4", "m", "beta"), starting_regs[4], False)) if prefetch: - asm.add(ld(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) + asm.add(ld(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) return asm - def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int, prefetch: str): - assert(bm % v_size == 0) - vm = bm//v_size + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): + assert bm % v_size == 0 + vm = bm // v_size elem128 = 16 // self.get_precision().size() vk = -(bk // -elem128) - assert((bn+bk) * vm + bn * vk <= 32) # Needs to fit in NEON v registers + assert (bn + bk) * vm + bn * vk <= 32 # Needs to fit in NEON v registers prec = { Precision.DOUBLE: "2d", @@ -63,17 +73,19 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, 
n: Precision.HALF: "8h", }[self.get_precision()] - A_regs = Matrix([[v(vm*c + r, prec) for c in range(bk)] for r in range(vm)]) - B_regs = Matrix([[v(vm*bk + bn * r + c, prec) for c in range(bn)] for r in range(vk)]) - C_regs = Matrix([[v(32 - vm*bn + vm*c + r, prec) for c in range(bn)] - for r in range(vm)]) + A_regs = Matrix([[v(vm * c + r, prec) for c in range(bk)] for r in range(vm)]) + B_regs = Matrix( + [[v(vm * bk + bn * r + c, prec) for c in range(bn)] for r in range(vk)] + ) + C_regs = Matrix( + [[v(32 - vm * bn + vm * c + r, prec) for c in range(bn)] for r in range(vm)] + ) # get vector register number of the first vector in B_regs - b_reg = vm*bk + b_reg = vm * bk alpha_reg = [v(b_reg, prec), v(b_reg, prec)] beta_reg = [v(b_reg + 1, prec), v(b_reg + 1, prec)] - starting_regs = [r(0), r(1), r(2), r(3), r(4), r(5), r(11)] additional_regs = [r(8), xzr, r(10)] @@ -82,19 +94,27 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: prefetch_reg = prefetch is not None - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [], prefetch_reg - - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + [], + prefetch_reg, + ) + + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: asm = block("No register based scaling") return asm def init_block(self, size): return block("") - + class LoadStoreLocation: def __init__(self, addr, register, comment, pfaddr=None): self.addr = addr @@ -102,49 +122,67 @@ def __init__(self, addr, register, comment, pfaddr=None): self.comment = comment self.pfaddr = pfaddr - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = 
False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None - ) -> Block: + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + ) -> Block: rows, cols = registers.shape locations = [] for ic in range(cols): for ir in range(rows): - if (mask is None) or (mask[ir,ic]): - all_coords = [Coords(down=ir*v_size+i,right=ic) for i in range(v_size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] + if (mask is None) or (mask[ir, ic]): + all_coords = [ + Coords(down=ir * v_size + i, right=ic) for i in range(v_size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] if not any(has_nonzero): continue elif any(has_nonzero) and not all(has_nonzero): - raise NotImplementedError("Element-wise sparsity in A is not yet implemented.") + raise NotImplementedError( + "Element-wise sparsity in A is not yet implemented." 
+ ) - cell_offset = Coords(down=ir*v_size, right=ic) + cell_offset = Coords(down=ir * v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) addr.disp += self.precision.size() * load_offset if prefetching: - pfaddr, _ = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) + pfaddr, _ = pf_cursor.look( + pf_cursor_ptr, block_offset, cell_offset + ) pfaddr.disp += self.precision.size() * load_offset else: pfaddr = None - locations += [self.LoadStoreLocation(addr, registers[ir,ic], comment, pfaddr)] + locations += [ + self.LoadStoreLocation(addr, registers[ir, ic], comment, pfaddr) + ] - return self.fuse_loadstore_block(locations, store, cursor.name, block_offset, additional_regs) + return self.fuse_loadstore_block( + locations, store, cursor.name, block_offset, additional_regs + ) - def fuse_loadstore_block(self, locations, store, name, block_offset, additional_regs): - offsets = list(sorted([(location.addr.disp,location) for location in locations])) + def fuse_loadstore_block( + self, locations, store, name, block_offset, additional_regs + ): + offsets = list( + sorted([(location.addr.disp, location) for location in locations]) + ) action = "Store" if store else "Load" asm = block(f"{action} {name} register block @ {block_offset}") @@ -152,6 +190,7 @@ def fuse_loadstore_block(self, locations, store, name, block_offset, additional_ curpf = 0 cur11 = -1000 fuse_cache = [] + def try_flush_cache(force, cur11): if len(fuse_cache) == 0: return @@ -165,10 +204,13 @@ def try_flush_cache(force, cur11): max_offset = [65520, 1008, 48, 64][len(fuse_cache) - 1] div_offset = [16, 16, 24, 32][len(fuse_cache) - 1] - comment = f'{op1.comment}' - if op2 is not None: comment += f', {op2.comment}' - if op3 is not None: comment += f', {op3.comment}' - if op4 is not None: comment += f', {op4.comment}' + comment = f"{op1.comment}" + if op2 is not None: + comment += f", {op2.comment}" + if op3 is not None: + comment += f", {op3.comment}" + if op4 is not 
None: + comment += f", {op4.comment}" offset = op1.addr.disp - cur11 if cur11 >= 0 else op1.addr.disp @@ -185,22 +227,42 @@ def try_flush_cache(force, cur11): cur11 += offset op1.addr.disp = 0 op1.addr.base = additional_regs[0] - + op1r = op1.register op2r = op2.register if op2 is not None else None op3r = op3.register if op3 is not None else None op4r = op4.register if op4 is not None else None if store: - asm.add(st(op1r, op1.addr, True, comment, src2=op2r, src3=op3r, src4=op4r)) + asm.add( + st( + op1r, + op1.addr, + True, + comment, + src2=op2r, + src3=op3r, + src4=op4r, + ) + ) else: - asm.add(ld(op1.addr, op1r, True, comment, dest2=op2r, dest3=op3r, dest4=op4r)) - + asm.add( + ld( + op1.addr, + op1r, + True, + comment, + dest2=op2r, + dest3=op3r, + dest4=op4r, + ) + ) + fuse_cache.clear() - + return cur11 - for _,location in offsets: + for _, location in offsets: if len(fuse_cache) > 0: can_fuse = location.addr.disp == fuse_cache[-1].addr.disp + 16 @@ -213,7 +275,14 @@ def try_flush_cache(force, cur11): if location.pfaddr is not None: if location.pfaddr.disp - curpf >= 32768: - asm.add(add(location.pfaddr.disp, additional_regs[2], "increment the prefetch register", location.pfaddr.base)) + asm.add( + add( + location.pfaddr.disp, + additional_regs[2], + "increment the prefetch register", + location.pfaddr.base, + ) + ) curpf = location.pfaddr.disp if curpf > 0: reg = additional_regs[2] @@ -221,10 +290,18 @@ def try_flush_cache(force, cur11): else: reg = location.pfaddr.base disp = location.pfaddr.disp - asm.add(prefetch(mem(reg, disp), "", access_type="LD", closeness="L2", temporality="KEEP")) + asm.add( + prefetch( + mem(reg, disp), + "", + access_type="LD", + closeness="L2", + temporality="KEEP", + ) + ) cur11 = try_flush_cache(True, cur11) - + return asm def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: @@ -233,41 +310,44 @@ def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block asm = block("zero 
registers") for ic in range(cols): - for ir in range(rows): - asm.add(mov(additional_regs[1], registers[ir,ic], True)) + for ir in range(rows): + asm.add(mov(additional_regs[1], registers[ir, ic], True)) return asm - - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size:int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. 
""" asm = block("Block GEMM microkernel") - bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) - bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) - assert(bm % v_size == 0) + bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) + bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) + assert bm % v_size == 0 mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) + asm.add( + self.move_register_block( + A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False + ) + ) elem128 = 16 // self.get_precision().size() vk = -(bk // -elem128) @@ -276,37 +356,57 @@ def make_microkernel(self, bs = [] firstloc = {} locations = [] - for Vmi in range(bm//v_size): - for bni in range(bn): # inside this n-block - for bki in range(bk): # inside this k-block + for Vmi in range(bm // v_size): + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block bki_reg = bki // elem128 to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) + to_acell = Coords(down=Vmi * v_size, right=bki) if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): if (bki_reg, bni) not in firstloc: B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) - firstloc[(bki_reg, bni)] = self.LoadStoreLocation(B_cell_addr, B_regs[bki_reg, bni], B_comment) - if A.has_nonzero_cell(A_ptr, to_A_block, to_acell) and B_regs[bki_reg, bni] not in bs: + firstloc[(bki_reg, bni)] = self.LoadStoreLocation( + B_cell_addr, B_regs[bki_reg, bni], B_comment + ) + if ( + A.has_nonzero_cell(A_ptr, to_A_block, to_acell) + and B_regs[bki_reg, bni] not in bs + ): locations += [firstloc[(bki_reg, bni)]] bs.append(B_regs[bki_reg, bni]) - asm.add(self.fuse_loadstore_block(locations, False, B.name, to_B_block, additional_regs)) + asm.add( + self.fuse_loadstore_block( + locations, False, B.name, to_B_block, additional_regs + ) 
+ ) cell_indices = {} - for bki in range(bk): # inside this k-block + for bki in range(bk): # inside this k-block # TODO: refactor cell_indices into the cursors/blocks - for Vmi in range(bm//v_size): - for bni in range(bn): # inside this n-block + for Vmi in range(bm // v_size): + for bni in range(bn): # inside this n-block to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) + to_acell = Coords(down=Vmi * v_size, right=bki) bki_reg = bki // elem128 if (Vmi, bki_reg, bni) not in cell_indices: cell_indices[(Vmi, bki_reg, bni)] = 0 - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): _, B_comment = B.look(B_ptr, to_B_block, to_bcell) comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" - asm.add(fma(B_regs[bki_reg, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=cell_indices[(Vmi, bki_reg, bni)], sub=sub)) - + asm.add( + fma( + B_regs[bki_reg, bni], + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + bcast=cell_indices[(Vmi, bki_reg, bni)], + sub=sub, + ) + ) + if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): cell_indices[(Vmi, bki_reg, bni)] += 1 diff --git a/pypspamm/codegen/architectures/arm/inlineprinter.py b/pypspamm/codegen/architectures/arm/inlineprinter.py index 9704ffc..6a83336 100644 --- a/pypspamm/codegen/architectures/arm/inlineprinter.py +++ b/pypspamm/codegen/architectures/arm/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List + from pypspamm.codegen.ast import * -from pypspamm.codegen.visitor import Visitor from pypspamm.codegen.operands import * from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -16,7 +17,6 @@ class InlinePrinter(Visitor): output = None stack = None - def __init__(self, 
precision: Precision): self.output = [] self.stack = [] @@ -26,10 +26,9 @@ def __init__(self, precision: Precision): def show(self): print("\n".join(self.output)) - def addLine(self, stmt: str, comment: str): - line = " "*self.lmargin + self.indent*self.depth + line = " " * self.lmargin + self.indent * self.depth if stmt is not None and comment is not None and self.show_comments: stmt = '"' + stmt + '\\r\\n"' @@ -43,8 +42,6 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) - - def visitFma(self, stmt: FmaStmt): b = stmt.bcast_src.ugly m = stmt.mult_src.ugly @@ -65,7 +62,11 @@ def visitMul(self, stmt: MulStmt): self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): - b = stmt.bcast_src.ugly if self.precision == Precision.DOUBLE else stmt.bcast_src.ugly_b32 + b = ( + stmt.bcast_src.ugly + if self.precision == Precision.DOUBLE + else stmt.bcast_src.ugly_b32 + ) a = stmt.dest.ugly s = f"dup {a}, {b}" self.addLine(s, stmt.comment) @@ -74,25 +75,35 @@ def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: # avoid 0 instructions return - if isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): + if isinstance(stmt.src, Constant) and ( + stmt.src.value > 4095 or stmt.src.value < -4095 + ): if (stmt.src.value >> 16) & 0xFFFF > 0 and stmt.src.value < 0: s = "mov x11, #-1" val1 = (stmt.src.value) & 0xFFFF s1 = f"movk x11, #{val1}" - val2 = ((stmt.src.value >> 16) & 0xFFFF) + val2 = (stmt.src.value >> 16) & 0xFFFF s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s, "") - self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") - self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") + self.addLine( + s1, "load lower 16 bit of immediate that requires more than 16 bit" + ) + self.addLine( + s2, "load upper 16 bit of immediate that requires more than 16 bit" + ) elif (stmt.src.value >> 16) & 0xFFFF: val1 = (stmt.src.value) & 
0xFFFF s1 = f"mov x11, #{val1}" - val2 = ((stmt.src.value >> 16) & 0xFFFF) + val2 = (stmt.src.value >> 16) & 0xFFFF s2 = f"movk x11, #{val2}, lsl #16" - self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") - self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") + self.addLine( + s1, "load lower 16 bit of immediate that requires more than 16 bit" + ) + self.addLine( + s2, "load upper 16 bit of immediate that requires more than 16 bit" + ) else: s = f"mov x11, {stmt.src.ugly}" self.addLine(s, "load lower 16 bit of immediate ") @@ -131,7 +142,6 @@ def visitMov(self, stmt: MovStmt): s = f"mov {stmt.dest.ugly}, {src_str}" self.addLine(s, stmt.comment) - def visitLoad(self, stmt: LoadStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly @@ -142,10 +152,10 @@ def visitLoad(self, stmt: LoadStmt): s = f"ldr {stmt.dest.ugly}, {src_str}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if stmt.dest4 is not None: - dispadd = '' if stmt.src.disp == 0 else f', {stmt.src.disp}' + dispadd = "" if stmt.src.disp == 0 else f", {stmt.src.disp}" s = f"ld1 {{ {stmt.dest.ugly},{stmt.dest2.ugly},{stmt.dest3.ugly},{stmt.dest4.ugly} }}, {stmt.src.ugly_base}{dispadd}" elif stmt.dest3 is not None: - dispadd = '' if stmt.src.disp == 0 else f', {stmt.src.disp}' + dispadd = "" if stmt.src.disp == 0 else f", {stmt.src.disp}" s = f"ld1 {{ {stmt.dest.ugly},{stmt.dest2.ugly},{stmt.dest3.ugly} }}, {stmt.src.ugly_base}{dispadd}" elif stmt.dest2 is not None: s = f"ldp {stmt.dest.ugly_scalar}, {stmt.dest2.ugly_scalar}, {src_str}" @@ -155,7 +165,6 @@ def visitLoad(self, stmt: LoadStmt): raise NotImplementedError() self.addLine(s, stmt.comment) - def visitStore(self, stmt: StoreStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly @@ -166,10 +175,10 @@ def visitStore(self, stmt: StoreStmt): s = f"str {src_str}, {stmt.dest.ugly}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if stmt.src4 is not None: - dispadd = '' if 
stmt.dest.disp == 0 else f', {stmt.dest.disp}' + dispadd = "" if stmt.dest.disp == 0 else f", {stmt.dest.disp}" s = f"ld1 {{ {stmt.src.ugly},{stmt.src2.ugly},{stmt.src3.ugly},{stmt.src4.ugly} }}, {stmt.dest.ugly_base}{dispadd}" elif stmt.src3 is not None: - dispadd = '' if stmt.dest.disp == 0 else f', {stmt.dest.disp}' + dispadd = "" if stmt.dest.disp == 0 else f", {stmt.dest.disp}" s = f"ld1 {{ {stmt.src.ugly},{stmt.src2.ugly},{stmt.src3.ugly} }}, {stmt.dest.ugly_base}{dispadd}" elif stmt.src2 is not None: s = f"stp {stmt.src.ugly_scalar}, {stmt.src2.ugly_scalar}, {stmt.dest.ugly}" @@ -178,13 +187,13 @@ def visitStore(self, stmt: StoreStmt): else: raise NotImplementedError() self.addLine(s, stmt.comment) - + def visitPrefetch(self, stmt: PrefetchStmt): cache_level = stmt.closeness temporality = stmt.temporality src_string = stmt.dest.ugly - s = f'prfm P{stmt.access_type}{cache_level}{temporality}, {src_string}' + s = f"prfm P{stmt.access_type}{cache_level}{temporality}, {src_string}" self.addLine(s, stmt.comment) def visitBlock(self, block: Block): diff --git a/pypspamm/codegen/architectures/arm/operands.py b/pypspamm/codegen/architectures/arm/operands.py index 481f9fa..405194e 100644 --- a/pypspamm/codegen/architectures/arm/operands.py +++ b/pypspamm/codegen/architectures/arm/operands.py @@ -48,25 +48,21 @@ class Register_ARM(Register): @property def ugly(self): return self.value - + @property def ugly_precision(self): return self.value.split(".")[1] - + @property def ugly_lsl_shift(self): - return { - "d": 3, - "s": 2, - "h": 1 - }[self.ugly_precision] + return {"d": 3, "s": 2, "h": 1}[self.ugly_precision] @property def clobbered(self): if self.value == "xzr": return None # removed [this comment should stay here for now---in case there's some compiler expecting it]: .replace("x", "r") - return (self.value.split(".")[0]) + return self.value.split(".")[0] @property def ugly_scalar(self): @@ -75,7 +71,7 @@ def ugly_scalar(self): @property def ugly_scalar_1d(self): 
return (self.value.split(".")[0]).replace("v", "d") - + @property def ugly_b32(self): return (self.value.split(".")[0]).replace("x", "w") @@ -91,15 +87,16 @@ class MemoryAddress_ARM(MemoryAddress): @property def ugly(self): return f"[{self.base.ugly}, {self.disp}]" - + @property def ugly_base(self): return f"[{self.base.ugly}]" - + @property def ugly_offset(self): # TODO: is this already dynamic? -> if precision is single, we need LSL #2 return str(self.disp) + def mem(base, offset): return MemoryAddress_ARM(base, offset) diff --git a/pypspamm/codegen/architectures/arm_sve/blocksize.py b/pypspamm/codegen/architectures/arm_sve/blocksize.py index 3a46568..6b7ae65 100644 --- a/pypspamm/codegen/architectures/arm_sve/blocksize.py +++ b/pypspamm/codegen/architectures/arm_sve/blocksize.py @@ -18,9 +18,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = j if maxval == 0: - raise RuntimeError("Could not find an appropriate block size. We suggest padding the matrix dimensions") + raise RuntimeError( + "Could not find an appropriate block size. We suggest padding the matrix dimensions" + ) - while cls.ARM_condition(bm, bn, bk+1, v_size): + while cls.ARM_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -28,13 +30,14 @@ def getBlocksize(cls, m, n, bk, v_size, prec): @classmethod def ARM_condition(cls, bm, bn, bk, v_size): # ceiling division - vm = -(bm // -v_size) - return (bn + bk) * vm + bn*bk <= 32 + vm = -(bm // -v_size) + return (bn + bk) * vm + bn * bk <= 32 @classmethod def tileable(cls, m, bm): return m % bm == 0 + class MaxK: @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): @@ -55,9 +58,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = j if maxval == 0: - raise RuntimeError("Could not find an appropriate block size. We suggest padding the matrix dimensions") + raise RuntimeError( + "Could not find an appropriate block size. 
We suggest padding the matrix dimensions" + ) - while cls.ARM_condition(bm, bn, bk+1, v_size, elem128): + while cls.ARM_condition(bm, bn, bk + 1, v_size, elem128): bk += 1 return (bm, bn, bk) @@ -66,15 +71,16 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division vkext = -(bk // -elem128) - isvkext = bn*vkext <= 16 if elem128 == 2 else bn*vkext <= 8 + isvkext = bn * vkext <= 16 if elem128 == 2 else bn * vkext <= 8 vm = -(bm // -v_size) vk = vkext if isvkext else bk - return (bn + bk) * vm + bn*vk <= 32 + return (bn + bk) * vm + bn * vk <= 32 @classmethod def tileable(cls, m, bm): return m % bm == 0 + class Cube: @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): @@ -97,7 +103,9 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bk = k if maxval == 0: - raise RuntimeError("Could not find an appropriate block size. We suggest padding the matrix dimensions") + raise RuntimeError( + "Could not find an appropriate block size. 
We suggest padding the matrix dimensions" + ) return (bm, bn, bk) @@ -105,13 +113,14 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division vkext = -(bk // -elem128) - isvkext = bn*vkext <= 16 if elem128 == 2 else bn*vkext <= 8 + isvkext = bn * vkext <= 16 if elem128 == 2 else bn * vkext <= 8 vm = -(bm // -v_size) vk = vkext if isvkext else bk - return (bn + bk) * vm + bn*vk <= 32 + return (bn + bk) * vm + bn * vk <= 32 @classmethod def tileable(cls, m, bm): return m % bm == 0 + Default = MaxK diff --git a/pypspamm/codegen/architectures/arm_sve/generator.py b/pypspamm/codegen/architectures/arm_sve/generator.py index 50b8c23..5e76da7 100644 --- a/pypspamm/codegen/architectures/arm_sve/generator.py +++ b/pypspamm/codegen/architectures/arm_sve/generator.py @@ -1,10 +1,9 @@ -from pypspamm.cursors import * - from pypspamm.codegen.architectures.arm_sve.operands import * from pypspamm.codegen.ast import * -from pypspamm.codegen.sugar import * from pypspamm.codegen.generator import * from pypspamm.codegen.precision import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * class Generator(AbstractGenerator): @@ -27,7 +26,7 @@ class Generator(AbstractGenerator): prefetch_count = 0 is_sparse = False - v_len = 4 # vector register length: v_len * 128 bit + v_len = 4 # vector register length: v_len * 128 bit predicates = {} def get_v_size(self): @@ -38,31 +37,33 @@ def get_precision(self): def get_template(self): return self.template - + def use_broadcast(self): return True def has_masks(self): return True - + def make_argument_load(self, starting_regs, prefetch): asm = block("Load arguments") - asm.add(ld(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - asm.add(ld(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(ld(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) - asm.add(ld(InputOperand(f'3', 'm', 'alpha'), starting_regs[3], False)) - 
asm.add(ld(InputOperand(f'4', 'm', 'beta'), starting_regs[4], False)) + asm.add(ld(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(ld(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(ld(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(ld(InputOperand(f"3", "m", "alpha"), starting_regs[3], False)) + asm.add(ld(InputOperand(f"4", "m", "beta"), starting_regs[4], False)) if prefetch: - asm.add(ld(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) + asm.add(ld(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) return asm - def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Register_ARM: + def pred_n_trues( + self, num_trues: int, v_size: int, suffix: str = None + ) -> Register_ARM: """pred takes num_trues=num of true elements and suffix=type of predicate (m or z) for merging or zeroing - we only use p7 as all-true predicate and p0 as overhead predicate - e.g. pred_n_trues(n=4, v_size=8, suffix="m") returns the predicate p0/m with the first 4 elements - set to true""" - assert (num_trues > 0) - assert (suffix == "m" or suffix == "z" or suffix is None) + we only use p7 as all-true predicate and p0 as overhead predicate + e.g. 
pred_n_trues(n=4, v_size=8, suffix="m") returns the predicate p0/m with the first 4 elements + set to true""" + assert num_trues > 0 + assert suffix == "m" or suffix == "z" or suffix is None # we only use p7 or p0 as predicates (1 == p0, 8 == p7) index = 7 if num_trues >= v_size else self.predicates[num_trues] @@ -77,8 +78,21 @@ def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Regis def set_sparse(self): self.is_sparse = True - def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: int, n: int, k: int, prefetch:str): - vm = self.ceil_div(bm, v_size) # vm can be 0 if bm < v_size -> makes ceil_div necessary + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): + vm = self.ceil_div( + bm, v_size + ) # vm can be 0 if bm < v_size -> makes ceil_div necessary # k-broadcasting only works in 128-bit lanes elem128 = 16 // self.get_precision().size() @@ -86,7 +100,7 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i # inline broadcasting is only allowed for the lower-numbered registers self.inline_broadcast = False - if bn*vkext <= 16 if self.get_precision().size() == 8 else bn*vkext <= 8: + if bn * vkext <= 16 if self.get_precision().size() == 8 else bn * vkext <= 8: self.inline_broadcast = True if bk == 1: self.inline_broadcast = False @@ -95,8 +109,8 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i vk = vkext else: vk = bk - - assert ((bn + bk) * vm + bn * vk <= 32) # Needs to fit in SVE z registers + + assert (bn + bk) * vm + bn * vk <= 32 # Needs to fit in SVE z registers prec = { Precision.DOUBLE: "d", @@ -107,17 +121,33 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i # make place for the two broadcasting registers a_offset = 1 if bn * vk == 1 else 0 - assert ((bn + bk) * vm + bn * vk + a_offset <= 32) - - A_regs = Matrix([[z(vm * 
c + r + bn * vk + a_offset, prec) for c in range(bk)] for r in range(vm)]) + assert (bn + bk) * vm + bn * vk + a_offset <= 32 + + A_regs = Matrix( + [ + [z(vm * c + r + bn * vk + a_offset, prec) for c in range(bk)] + for r in range(vm) + ] + ) B_regs = Matrix([[z(bn * r + c, prec) for c in range(bn)] for r in range(vk)]) - C_regs = Matrix([[z(32 - vm * bn + vm * c + r, prec) for c in range(bn)] for r in range(vm)]) + C_regs = Matrix( + [[z(32 - vm * bn + vm * c + r, prec) for c in range(bn)] for r in range(vm)] + ) b_reg = 0 alpha_reg = [z(b_reg, prec), z(b_reg, prec)] beta_reg = [z(b_reg + 1, prec), z(b_reg + 1, prec)] - starting_regs = [r(0), r(1), r(2), r(3), r(4), r(5), r(6), r(11)] # r6 is needed for predicate creation, r5 is added in init_prefetching() + starting_regs = [ + r(0), + r(1), + r(2), + r(3), + r(4), + r(5), + r(6), + r(11), + ] # r6 is needed for predicate creation, r5 is added in init_prefetching() additional_regs = [r(8), l("0.0"), r(10), r(6)] # r10 used for scaling offsets @@ -129,12 +159,20 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i prefetch_reg = prefetch is not None - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs, prefetch_reg - - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + mask_regs, + prefetch_reg, + ) + + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: asm = block("No register based scaling") return asm @@ -142,25 +180,14 @@ def make_scaling_offsets(self, def init_block(self, size): return block("") - def init_mask(self, - m: int, - bm: int, - v_size: int, - tempreg, - maskreg - ) -> Block: + def init_mask(self, m: int, bm: int, v_size: int, tempreg, maskreg) -> Block: asm = block("No register based scaling") return asm - def 
init_registers(self, - m: int, - bm: int, - k: int, - bk: int, - v_size: int, - nnz: int - ) -> None: + def init_registers( + self, m: int, bm: int, k: int, bk: int, v_size: int, nnz: int + ) -> None: bmmod = bm % v_size elem128 = 16 // self.get_precision().size() @@ -168,7 +195,7 @@ def init_registers(self, kmod = (k % bk) % elem128 if self.inline_broadcast else 0 mmod = (m % bm) % v_size - eol = "\\n\\t" # define the "end of line" sequence for easy assembly + eol = "\\n\\t" # define the "end of line" sequence for easy assembly # determine the predicate suffix p_suffix = { Precision.DOUBLE: "d", @@ -195,54 +222,91 @@ def init_registers(self, # https://developer.arm.com/documentation/ddi0596/2020-12/Shared-Pseudocode/AArch64-Functions?lang=en#impl-aarch64.DecodePredCount.2 # 'ptrue' doesnt work for initialising overhead predicate when using single precision -> see valid patterns from above # overhead = "\"ptrue p0.{suffix}, #{overhead}{eol}\"\n\t" if bm != 0 else "" # define overhead predicate - overhead_bm = "\"mov {gen_reg}{overhead_counter}, #{overhead_bm}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else "" - overhead_bk = "\"mov {gen_reg}{overhead_counter}, #{overhead_bk}{eol}\"\n\t\"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_bk_overhead else "" - overhead_k = "\"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}\"\n\t\"whilelo p2.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_k_overhead else "" - overhead_nnz = "\"mov {gen_reg}{overhead_counter}, #{overhead_nnz}{eol}\"\n\t\"whilelo p3.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_nnz_overhead else "" - overhead_m = "\"mov {gen_reg}{overhead_counter}, #{overhead_m}{eol}\"\n\t\"whilelo p4.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if mmod != 0 else "" - all_true = "\"ptrue p7.{suffix}, #31{eol}\"" # define all true predicate - 
init_registers = (comment + overhead_bm + overhead_bk + overhead_k + overhead_nnz + overhead_m + all_true).format(suffix=p_suffix, - gen_reg=gen_reg, - overhead_counter=overhead_counter, - v_size=v_size, - overhead_bm=bmmod, - overhead_bk=bkmod, - overhead_k=kmod, - overhead_m=mmod, - overhead_nnz=nnz % elem128, - eol=eol) + overhead_bm = ( + '"mov {gen_reg}{overhead_counter}, #{overhead_bm}{eol}"\n\t"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}"\n\t' + if bmmod != 0 + else "" + ) + overhead_bk = ( + '"mov {gen_reg}{overhead_counter}, #{overhead_bk}{eol}"\n\t"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}"\n\t' + if self.has_bk_overhead + else "" + ) + overhead_k = ( + '"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}"\n\t"whilelo p2.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}"\n\t' + if self.has_k_overhead + else "" + ) + overhead_nnz = ( + '"mov {gen_reg}{overhead_counter}, #{overhead_nnz}{eol}"\n\t"whilelo p3.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}"\n\t' + if self.has_nnz_overhead + else "" + ) + overhead_m = ( + '"mov {gen_reg}{overhead_counter}, #{overhead_m}{eol}"\n\t"whilelo p4.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}"\n\t' + if mmod != 0 + else "" + ) + all_true = '"ptrue p7.{suffix}, #31{eol}"' # define all true predicate + init_registers = ( + comment + + overhead_bm + + overhead_bk + + overhead_k + + overhead_nnz + + overhead_m + + all_true + ).format( + suffix=p_suffix, + gen_reg=gen_reg, + overhead_counter=overhead_counter, + v_size=v_size, + overhead_bm=bmmod, + overhead_bk=bkmod, + overhead_k=kmod, + overhead_m=mmod, + overhead_nnz=nnz % elem128, + eol=eol, + ) self.predicates[v_size] = 7 - if bmmod != 0: self.predicates[bmmod] = 0 - if bkmod != 0: self.predicates[bkmod] = 1 - if kmod != 0: self.predicates[kmod] = 2 - if mmod != 0: self.predicates[mmod] = 4 + if bmmod != 0: + self.predicates[bmmod] = 0 + if bkmod != 0: + self.predicates[bkmod] = 1 + if kmod != 0: 
+ self.predicates[kmod] = 2 + if mmod != 0: + self.predicates[mmod] = 4 # since .format() doesn't allow partial formatting, we need to re-include the # placeholders that are replaced at the end of generating a kernel - self.template = self.get_template().format(init_registers=init_registers, - funcName="{funcName}", - body_text="{body_text}", - clobbered="{clobbered}", - flop="{flop}", - real_type="{real_type}", - args="{args}") - - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None, - is_B: bool = False - ) -> Block: + self.template = self.get_template().format( + init_registers=init_registers, + funcName="{funcName}", + body_text="{body_text}", + clobbered="{clobbered}", + flop="{flop}", + real_type="{real_type}", + args="{args}", + ) + + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + is_B: bool = False, + ) -> Block: rows, cols = registers.shape action = "Store" if store else "Load" @@ -252,13 +316,19 @@ def move_register_block(self, b_row, b_col, i, _ = cursor.get_block(cursor_ptr, block_offset) cur11 = 0 - #TODO: figure out appropriate threshold (the 16 // self.v_len may still not be optimal; especially if 16 % self.v_len != 0, e.g. 
384 bit) - threshold = 1 if self.is_sparse else (16 // self.v_len) # uses whole 256 byte cache line, as one SVE-512 vector = 64 bytes + # TODO: figure out appropriate threshold (the 16 // self.v_len may still not be optimal; especially if 16 % self.v_len != 0, e.g. 384 bit) + threshold = ( + 1 if self.is_sparse else (16 // self.v_len) + ) # uses whole 256 byte cache line, as one SVE-512 vector = 64 bytes # DONE if another CPU implements SVE at VL != 64 bytes, rewrite mul_vl (maybe do this dynamically) - mul_vl = 16 * self.v_len # e.g. A64FX has VL of 64 bytes in memory (thus, use v_len==4) + mul_vl = ( + 16 * self.v_len + ) # e.g. A64FX has VL of 64 bytes in memory (thus, use v_len==4) max_mem_ins_mult = 7 # A64FX allows a maximum positive offset of 7 in memory instructions, e.g. ld1d z1.d, p0/z, [x0, 7, MUL VL] (TODO: tune, if ever different) - max_offset = mul_vl * max_mem_ins_mult # ld1d/st1d instruction encodes the immediate offset using 4 bits, multiplies it with MUL VL + max_offset = ( + mul_vl * max_mem_ins_mult + ) # ld1d/st1d instruction encodes the immediate offset using 4 bits, multiplies it with MUL VL prev_disp = 0 prev_overhead = True @@ -271,15 +341,31 @@ def move_register_block(self, if (mask is None) or (mask[ir, ic]): processed = ir * process_size size = min(process_size, b_row - processed) - all_coords = [Coords(down=ir*process_size+i,right=ic) for i in range(size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] + all_coords = [ + Coords(down=ir * process_size + i, right=ic) + for i in range(size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] if not any(has_nonzero): continue elif any(has_nonzero) and not all(has_nonzero) and not is_B: - raise NotImplementedError("Element-wise sparsity in A is not yet implemented.") - - p = self.pred_n_trues(size, v_size) if not is_B else self.pred_n_trues(process_size, v_size) - p_zeroing = 
self.pred_n_trues(size, v_size, "z") if not is_B else self.pred_n_trues(process_size, v_size, "z") + raise NotImplementedError( + "Element-wise sparsity in A is not yet implemented." + ) + + p = ( + self.pred_n_trues(size, v_size) + if not is_B + else self.pred_n_trues(process_size, v_size) + ) + p_zeroing = ( + self.pred_n_trues(size, v_size, "z") + if not is_B + else self.pred_n_trues(process_size, v_size, "z") + ) cell_offset = Coords(down=ir * process_size, right=ic) # addr = base "pointer" + relative offset in bytes @@ -289,13 +375,29 @@ def move_register_block(self, offset = addr.disp - prev_disp # count how many elements we have processed between last step and this step - cont_counter = (offset // mul_vl) + cont_counter = offset // mul_vl larger_max_offset = cont_counter > max_mem_ins_mult non_dividing_offset = offset % mul_vl != 0 - if larger_max_offset or (prev_overhead and addr.disp > 0) or non_dividing_offset: - offset_comment = f"disp > {max_offset}" if larger_max_offset else ("disp % VL != 0" if non_dividing_offset else "previous mem. instr. used p0") - asm.add(add(addr.disp, additional_regs[0], offset_comment, addr.base)) + if ( + larger_max_offset + or (prev_overhead and addr.disp > 0) + or non_dividing_offset + ): + offset_comment = ( + f"disp > {max_offset}" + if larger_max_offset + else ( + "disp % VL != 0" + if non_dividing_offset + else "previous mem. instr. 
used p0" + ) + ) + asm.add( + add( + addr.disp, additional_regs[0], offset_comment, addr.base + ) + ) prev_disp = addr.disp addr.base = additional_regs[0] prev_base = addr.base @@ -303,26 +405,72 @@ def move_register_block(self, # adjust addr.disp to a multiple of a SVE vector's length if prev_base is None: prev_base = addr.base - + addr.base = prev_base addr.disp = (addr.disp - prev_disp) // mul_vl if store: - asm.add(st(registers[ir, ic], addr, True, comment, pred=p, scalar_offs=False, - add_reg=additional_regs[2])) + asm.add( + st( + registers[ir, ic], + addr, + True, + comment, + pred=p, + scalar_offs=False, + add_reg=additional_regs[2], + ) + ) # perform prefetching after a store instruction, similar to KNL case if prefetching: - addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) + addr, comment = pf_cursor.look( + pf_cursor_ptr, block_offset, cell_offset + ) addr.disp += self.precision.size() * load_offset if prev_disp > 0: - asm.add(add(prev_disp, additional_regs[3], "increment the prefetch register", addr.base)) - asm.add(prefetch(mem(additional_regs[3] if prev_disp > 0 else addr.base, (addr.disp - prev_disp) // mul_vl), - "", p, prec, access_type="LD", closeness="L2", temporality="KEEP")) + asm.add( + add( + prev_disp, + additional_regs[3], + "increment the prefetch register", + addr.base, + ) + ) + asm.add( + prefetch( + mem( + ( + additional_regs[3] + if prev_disp > 0 + else addr.base + ), + (addr.disp - prev_disp) // mul_vl, + ), + "", + p, + prec, + access_type="LD", + closeness="L2", + temporality="KEEP", + ) + ) else: - asm.add(ld(addr, registers[ir, ic], True, comment, pred=p_zeroing, is_B=is_B, scalar_offs=False, - add_reg=additional_regs[2])) - - prev_overhead = p is None or int(p.ugly[1]) == 0 # determine if we previously used p0 (overhead predicate) + asm.add( + ld( + addr, + registers[ir, ic], + True, + comment, + pred=p_zeroing, + is_B=is_B, + scalar_offs=False, + add_reg=additional_regs[2], + ) + ) + + prev_overhead = ( 
+ p is None or int(p.ugly[1]) == 0 + ) # determine if we previously used p0 (overhead predicate) return asm @@ -337,26 +485,26 @@ def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block return asm - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size: int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. 
""" asm = block("Block GEMM microkernel") @@ -365,8 +513,14 @@ def make_microkernel(self, bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) # tell sparse_mask() that we use sve - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) + mask = sparse_mask( + A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True + ) + asm.add( + self.move_register_block( + A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False + ) + ) # x = 0; bs = [] @@ -378,7 +532,7 @@ def make_microkernel(self, # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 # in both cases: instruction encodes the immediate offset within 6 bits if not self.inline_broadcast: - max_offs = (2 ** 6 - 1) * multiple + max_offs = (2**6 - 1) * multiple divider = 1 elem128 = 1 vk = bk @@ -388,8 +542,10 @@ def make_microkernel(self, elem128 = 16 // self.get_precision().size() vk = -(bk // -elem128) - preg = self.pred_n_trues(elem128, elem128, 'z') - preg_last = preg if bk % elem128 == 0 else self.pred_n_trues(bk % elem128, elem128, 'z') + preg = self.pred_n_trues(elem128, elem128, "z") + preg_last = ( + preg if bk % elem128 == 0 else self.pred_n_trues(bk % elem128, elem128, "z") + ) firstloc = {} for Vmi in range(Vm): # set to all v_size predicates to true, we want to replicate a B element into a whole vector @@ -397,33 +553,68 @@ def make_microkernel(self, for bki in range(bk): # inside this k-block bki_reg = bki // elem128 to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) + to_acell = Coords(down=Vmi * v_size, right=bki) if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): if (bki_reg, bni) not in firstloc: B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) firstloc[(bki_reg, bni)] = (B_cell_addr, B_comment) - if A.has_nonzero_cell(A_ptr, to_A_block, to_acell) and 
B_regs[bki_reg, bni] not in bs: + if ( + A.has_nonzero_cell(A_ptr, to_A_block, to_acell) + and B_regs[bki_reg, bni] not in bs + ): p_zeroing = preg_last if bki_reg + 1 == vk else preg B_cell_addr = firstloc[(bki_reg, bni)][0] B_comment = firstloc[(bki_reg, bni)][1] # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value - if B_cell_addr.disp > max_offs or B_cell_addr.disp % divider != 0: + if ( + B_cell_addr.disp > max_offs + or B_cell_addr.disp % divider != 0 + ): moved = B_cell_addr.disp - cur11 - if moved > 0 and moved <= max_offs and moved % divider == 0: + if ( + moved > 0 + and moved <= max_offs + and moved % divider == 0 + ): B_cell_addr.disp = moved else: - asm.add(add(B_cell_addr.disp, additional_regs[0], "", B_cell_addr.base)) + asm.add( + add( + B_cell_addr.disp, + additional_regs[0], + "", + B_cell_addr.base, + ) + ) cur11 = B_cell_addr.disp B_cell_addr.disp = 0 B_cell_addr.base = additional_regs[0] - + if not self.inline_broadcast: - asm.add(ld(B_cell_addr, B_regs[bki_reg, bni], True, B_comment, pred=p_zeroing, is_B=True)) + asm.add( + ld( + B_cell_addr, + B_regs[bki_reg, bni], + True, + B_comment, + pred=p_zeroing, + is_B=True, + ) + ) else: - asm.add(ld(B_cell_addr, B_regs[bki_reg, bni], True, B_comment, pred=p_zeroing, sub128=True)) + asm.add( + ld( + B_cell_addr, + B_regs[bki_reg, bni], + True, + B_comment, + pred=p_zeroing, + sub128=True, + ) + ) bs.append(B_regs[bki_reg, bni]) # TODO: refactor cell_indices into the cursors/blocks @@ -431,23 +622,37 @@ def make_microkernel(self, for bki in range(bk): # inside this k-block for Vmi in range(Vm): p_merging = self.pred_n_trues(bm - Vmi * v_size, v_size, "m") - end_index = bm if Vmi + 1 == Vm else Vmi * v_size + v_size # end_index helps us print the right index ranges + end_index = ( + bm if Vmi + 1 == Vm else Vmi * v_size + v_size + ) # end_index helps us print the right index ranges for bni in range(bn): # inside this n-block to_bcell = Coords(down=bki, 
right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) + to_acell = Coords(down=Vmi * v_size, right=bki) bki_reg = bki // elem128 if (Vmi, bki_reg, bni) not in cell_indices: cell_indices[(Vmi, bki_reg, bni)] = 0 - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): _, B_comment = B.look(B_ptr, to_B_block, to_bcell) comment = f"C[{Vmi * v_size}:{end_index},{bni}] += A[{Vmi * v_size}:{end_index},{bki}]*{B_comment}" - + if not self.inline_broadcast: bcast = None else: bcast = cell_indices[(Vmi, bki_reg, bni)] - asm.add(fma(B_regs[bki_reg, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging, bcast=bcast, sub=sub)) - + asm.add( + fma( + B_regs[bki_reg, bni], + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + pred=p_merging, + bcast=bcast, + sub=sub, + ) + ) + if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): cell_indices[(Vmi, bki_reg, bni)] += 1 return asm diff --git a/pypspamm/codegen/architectures/arm_sve/inlineprinter.py b/pypspamm/codegen/architectures/arm_sve/inlineprinter.py index 33cb70a..ad1c9ca 100644 --- a/pypspamm/codegen/architectures/arm_sve/inlineprinter.py +++ b/pypspamm/codegen/architectures/arm_sve/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List + from pypspamm.codegen.ast import * -from pypspamm.codegen.visitor import Visitor from pypspamm.codegen.operands import * from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -19,14 +20,19 @@ def __init__(self, precision: Precision): self.output = [] self.stack = [] self.precision = precision - self.ugly_precision ={ + self.ugly_precision = { Precision.DOUBLE: "d", Precision.SINGLE: "w", Precision.HALF: "h", Precision.BFLOAT16: "h", }[self.precision] - assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, 
Precision.DOUBLE) + assert precision in ( + Precision.BFLOAT16, + Precision.HALF, + Precision.SINGLE, + Precision.DOUBLE, + ) def show(self): print("\n".join(self.output)) @@ -89,7 +95,9 @@ def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: # avoid 0 instructions return - if isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): + if isinstance(stmt.src, Constant) and ( + stmt.src.value > 4095 or stmt.src.value < -4095 + ): # This condition is probably related to immediate values being restricted to 12 bits for add instructions # https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/ADD--immediate- # https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ADD--immediate---Add--immediate-- @@ -97,19 +105,27 @@ def visitAdd(self, stmt: AddStmt): s = "mov x11, #-1" val1 = (stmt.src.value) & 0xFFFF s1 = f"movk x11, #{val1}" - val2 = ((stmt.src.value >> 16) & 0xFFFF) + val2 = (stmt.src.value >> 16) & 0xFFFF s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s, "") - self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") - self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") + self.addLine( + s1, "load lower 16 bit of immediate that requires more than 16 bit" + ) + self.addLine( + s2, "load upper 16 bit of immediate that requires more than 16 bit" + ) elif (stmt.src.value >> 16) != 0: val1 = (stmt.src.value) & 0xFFFF s1 = "mov x11, #{val1}" - val2 = ((stmt.src.value >> 16) & 0xFFFF) + val2 = (stmt.src.value >> 16) & 0xFFFF s2 = "movk x11, #{val2}, lsl #16" - self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") - self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") + self.addLine( + s1, "load lower 16 bit of immediate that requires more than 16 bit" + ) + self.addLine( + s2, "load upper 16 bit of immediate that requires more than 16 bit" + ) else: s = 
f"mov x11, {stmt.src.ugly}" self.addLine(s, "load lower 16 bit of immediate ") @@ -152,13 +168,18 @@ def visitMov(self, stmt: MovStmt): def visitLoad(self, stmt: LoadStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly - elif isinstance(stmt.dest, MemoryAddress) and (stmt.src.ugly_offset != "0" and stmt.scalar_offs): - self.addLine(f"mov {stmt.add_reg.ugly}, #{stmt.src.ugly_offset}", f"move immediate offset into {stmt.add_reg.ugly}") + elif isinstance(stmt.dest, MemoryAddress) and ( + stmt.src.ugly_offset != "0" and stmt.scalar_offs + ): + self.addLine( + f"mov {stmt.add_reg.ugly}, #{stmt.src.ugly_offset}", + f"move immediate offset into {stmt.add_reg.ugly}", + ) # TODO: adapt ugly_lsl_shift to account for possible single precision instead of double precision src_str = f"[{stmt.src.ugly_base}, {stmt.add_reg.ugly}, LSL #{stmt.dest.ugly_lsl_shift}]" elif stmt.typ == AsmType.f64x4 or stmt.typ == AsmType.f64x2: # (note: the 128-bit and 256-bit broadcasts need the following more rudimentary format here) - if stmt.src.ugly_offset == '0': + if stmt.src.ugly_offset == "0": src_str = f"[{stmt.src.ugly_base}]" else: src_str = f"[{stmt.src.ugly_base}, #{stmt.src.ugly_offset}]" @@ -186,9 +207,15 @@ def visitLoad(self, stmt: LoadStmt): def visitStore(self, stmt: StoreStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly - elif isinstance(stmt.dest, MemoryAddress) and stmt.dest.ugly_offset != "0" and stmt.scalar_offs: - self.addLine(f"mov {stmt.add_reg.ugly}, #{stmt.dest.ugly_offset}", - f"move immediate offset into {stmt.add_reg.ugly}") + elif ( + isinstance(stmt.dest, MemoryAddress) + and stmt.dest.ugly_offset != "0" + and stmt.scalar_offs + ): + self.addLine( + f"mov {stmt.add_reg.ugly}, #{stmt.dest.ugly_offset}", + f"move immediate offset into {stmt.add_reg.ugly}", + ) # TODO: adapt ugly_lsl_shift to account for possible single precision instead of double precision regsize = stmt.add_dest.size() // 16 dest_str = f"[{stmt.dest.ugly_base}, 
{stmt.add_reg.ugly}, LSL #{stmt.src.ugly_lsl_shift}]" diff --git a/pypspamm/codegen/architectures/arm_sve/operands.py b/pypspamm/codegen/architectures/arm_sve/operands.py index e64f523..c56cff0 100644 --- a/pypspamm/codegen/architectures/arm_sve/operands.py +++ b/pypspamm/codegen/architectures/arm_sve/operands.py @@ -52,18 +52,14 @@ def ugly_precision(self): @property def ugly_lsl_shift(self): - return { - "d": 3, - "s": 2, - "h": 1 - }[self.ugly_precision] + return {"d": 3, "s": 2, "h": 1}[self.ugly_precision] @property def clobbered(self): if self.value == "xzr": return None # removed [this comment should stay here for now---in case there's some compiler expecting it]: .replace("x", "r") - return (self.value.split(".")[0].split("/")[0]) + return self.value.split(".")[0].split("/")[0] @property def ugly_scalar(self): @@ -71,7 +67,7 @@ def ugly_scalar(self): @property def ugly_scalar_1d(self): - #turns "Vn.2d" into "Dn" + # turns "Vn.2d" into "Dn" return (self.value.split(".")[0]).replace("v", "d") @@ -80,6 +76,7 @@ def ugly_scalar_1d(self): z = lambda n, prec: Register_ARM(AsmType.f64x8, "z" + str(n) + "." 
+ prec) p = lambda n: Register_ARM(AsmType.i64, "p" + str(n)) + class MemoryAddress_ARM(MemoryAddress): @property def ugly(self): diff --git a/pypspamm/codegen/architectures/hsw/blocksize.py b/pypspamm/codegen/architectures/hsw/blocksize.py index 0a38028..9ad370b 100644 --- a/pypspamm/codegen/architectures/hsw/blocksize.py +++ b/pypspamm/codegen/architectures/hsw/blocksize.py @@ -4,16 +4,16 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bm = m bn = n - + if cls.HSW_condition(bm, bn, bk, v_size): - while cls.HSW_condition(bm, bn, bk+1, v_size): + while cls.HSW_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn) while not cls.HSW_condition(bm, bn, bk, v_size): bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) - while cls.HSW_condition(bm, bn, bk+1, v_size): + while cls.HSW_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn) @@ -37,6 +37,7 @@ def HSW_condition(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return (bn + bk) * vm + bn * bk <= 16 + class Max: @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): @@ -44,17 +45,19 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = 1 maxval = 0 - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): # can be replaced by cls.HSW_condition_extended here # (but that seemed to be slower in the end) if cls.HSW_condition(i, j, bk, v_size): - if i*j > maxval and (cls.HSW_condition(i, j, bk, v_size) or j > 1): - maxval = i*j + if i * j > maxval and ( + cls.HSW_condition(i, j, bk, v_size) or j > 1 + ): + maxval = i * j bm = i - bn = j + bn = j - while cls.HSW_condition(bm, bn, bk+1, v_size): + while cls.HSW_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -71,6 +74,7 @@ def HSW_condition_extended(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return bn * vm + bn * bk + 1 <= 16 + class Cube: @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): @@ -78,14 +82,16 @@ def getBlocksize(cls, m, n, bk, v_size, 
prec): bn = 1 maxval = 0 - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): for k in range(1, 200): # can be replaced by cls.HSW_condition_extended here # (but that seemed to be slower in the end) if cls.HSW_condition(i, j, bk, v_size): - if i*j*k >= maxval and (cls.HSW_condition(i, j, k, v_size) or j > 1): - maxval = i*j*k + if i * j * k >= maxval and ( + cls.HSW_condition(i, j, k, v_size) or j > 1 + ): + maxval = i * j * k bm = i bn = j bk = k @@ -104,4 +110,5 @@ def HSW_condition_extended(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return bn * vm + bn * bk + 1 <= 16 + Default = Max diff --git a/pypspamm/codegen/architectures/hsw/generator.py b/pypspamm/codegen/architectures/hsw/generator.py index 3bacf69..c468616 100644 --- a/pypspamm/codegen/architectures/hsw/generator.py +++ b/pypspamm/codegen/architectures/hsw/generator.py @@ -1,11 +1,11 @@ -from pypspamm.cursors import * - from pypspamm.codegen.architectures.hsw.operands import * from pypspamm.codegen.ast import * -from pypspamm.codegen.sugar import * from pypspamm.codegen.generator import * from pypspamm.codegen.precision import * from pypspamm.codegen.regcache import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * + class Generator(AbstractGenerator): template = """ @@ -40,7 +40,7 @@ def has_masks(self): def init_mask(self, m, bm, v_size, tempreg, maskregs): return block("") - + def scale_base(self): return 256 @@ -53,13 +53,13 @@ def pred_n_trues(self, count, v_size, mode): def make_argument_load(self, starting_regs, prefetch): asm = block("Load arguments") - asm.add(mov(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - asm.add(mov(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(mov(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) - asm.add(mov(InputOperand(f'3', 'm', 'alpha_p'), starting_regs[3], False)) - asm.add(mov(InputOperand(f'4', 'm', 'beta_p'), 
starting_regs[4], False)) + asm.add(mov(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(mov(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(mov(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(mov(InputOperand(f"3", "m", "alpha_p"), starting_regs[3], False)) + asm.add(mov(InputOperand(f"4", "m", "beta_p"), starting_regs[4], False)) if prefetch: - asm.add(mov(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) + asm.add(mov(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) return asm def make_expand_predicate(self, mask): @@ -67,14 +67,25 @@ def make_expand_predicate(self, mask): offset = 0 for i, value in enumerate(mask): if value: - combined |= offset << (8*i) + combined |= offset << (8 * i) offset += 1 else: - combined |= 255 << (8*i) + combined |= 255 << (8 * i) return combined - def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int, prefetch: str): - assert(bm % v_size == 0) + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): + assert bm % v_size == 0 vm = self.ceil_div(bm, v_size) # Needs to fit in AVX/AVX2 ymm registers @@ -82,32 +93,32 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: self.preloadA = True else: self.preloadA = False - assert(bn * vm + bn * bk + 1 <= 16) + assert bn * vm + bn * bk + 1 <= 16 - vmm = { - 1: xmm, - 2: ymm - }[self.v_len] + vmm = {1: xmm, 2: ymm}[self.v_len] if self.preloadA: - A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) - Aoffset = vm*bk + A_regs = Matrix([[vmm(vm * c + r) for c in range(bk)] for r in range(vm)]) + Aoffset = vm * bk else: A_regs = Matrix([[vmm(0) for c in range(bk)] for r in range(vm)]) Aoffset = 1 - - B_regs = Matrix([[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[vmm(16 - vm*bn + vm*c + r) for c in 
range(bn)] - for r in range(vm)]) + + B_regs = Matrix( + [[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)] + ) + C_regs = Matrix( + [[vmm(16 - vm * bn + vm * c + r) for c in range(bn)] for r in range(vm)] + ) starting_regs = [rdi, rsi, rdx, rbx, rcx] b_reg = Aoffset alpha_reg = [xmm(b_reg), vmm(b_reg)] beta_reg = [xmm(b_reg + 1), vmm(b_reg + 1)] - additional_regs = [r(9),r(10),r(11),r(15),rax] # ,r(13),r(14) + additional_regs = [r(9), r(10), r(11), r(15), rax] # ,r(13),r(14) - prefetch_reg = prefetch == 'BL2viaC' + prefetch_reg = prefetch == "BL2viaC" if prefetch_reg: starting_regs += [r(8)] else: @@ -115,24 +126,34 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: loop_regs = [r(12), r(13), r(14)] - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [], prefetch_reg - - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + [], + prefetch_reg, + ) + + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: asm = block("Optimize usage of offsets when accessing B Matrix") for i in range(1, len(additional_regs)): - asm.add(mov(c(self.scale_base() * (2*i - 1)), additional_regs[i], False)) - + asm.add(mov(c(self.scale_base() * (2 * i - 1)), additional_regs[i], False)) + return asm def init_block(self, size): return block("") - def reg_based_scaling(self, asm, addr: MemoryAddress, additional_regs: List[Register]): + def reg_based_scaling( + self, asm, addr: MemoryAddress, additional_regs: List[Register] + ): halfscale = self.scale_base() // 2 if addr.disp >= halfscale: base = (addr.disp + halfscale) // self.scale_base() @@ -147,21 +168,22 @@ def reg_based_scaling(self, asm, addr: MemoryAddress, additional_regs: List[Regi addr.scaling = scaling addr.disp = ((addr.disp + halfscale) % 
self.scale_base()) - halfscale - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None, - temp = None - ) -> Block: + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + temp=None, + ) -> Block: rows, cols = registers.shape action = "Store" if store else "Load" @@ -169,58 +191,78 @@ def move_register_block(self, for ic in range(cols): for ir in range(rows): - if (mask is None) or (mask[ir,ic]): - all_coords = [Coords(down=ir*v_size+i,right=ic) for i in range(v_size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] + if (mask is None) or (mask[ir, ic]): + all_coords = [ + Coords(down=ir * v_size + i, right=ic) for i in range(v_size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] if all(has_nonzero): cell_offset = all_coords[0] - addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) + addr, comment = cursor.look( + cursor_ptr, block_offset, cell_offset + ) addr.disp += self.precision.size() * load_offset self.reg_based_scaling(asm, addr, additional_regs) if store: - asm.add(mov(registers[ir,ic], addr, True, comment)) - if prefetching == 'BL2viaC' and pf_cursor is not None: - addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) + asm.add(mov(registers[ir, ic], addr, True, comment)) + if prefetching == "BL2viaC" and pf_cursor is not None: + addr, comment = 
pf_cursor.look( + pf_cursor_ptr, block_offset, cell_offset + ) addr.disp += self.precision.size() * load_offset self.reg_based_scaling(asm, addr, additional_regs) asm.add(prefetch(addr, closeness="L2")) else: - asm.add(mov(addr, registers[ir,ic], True, comment)) + asm.add(mov(addr, registers[ir, ic], True, comment)) elif any(has_nonzero): - raise NotImplementedError("Element-wise sparsity in A is not yet fully implemented.") + raise NotImplementedError( + "Element-wise sparsity in A is not yet fully implemented." + ) firsti = 0 for i in range(v_size): if has_nonzero[i]: firsti = i break - addr, comment = cursor.look(cursor_ptr, block_offset, all_coords[firsti]) + addr, comment = cursor.look( + cursor_ptr, block_offset, all_coords[firsti] + ) # assume contiguous memory here - asm.add(mov(self.make_expand_predicate(all_coords), additional_regs[0], False)) + asm.add( + mov( + self.make_expand_predicate(all_coords), + additional_regs[0], + False, + ) + ) return asm - def move_register_single(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - ir, - ic, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0 - ) -> Block: + def move_register_single( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + ir, + ic, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + ) -> Block: asm = block("") - if (mask is None) or (mask[ir,ic]): - cell_offset = Coords(down=ir*v_size, right=ic) + if (mask is None) or (mask[ir, ic]): + cell_offset = Coords(down=ir * v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) addr.disp += self.precision.size() * load_offset - asm.add(mov(addr, registers[ir,ic], True, comment)) + asm.add(mov(addr, registers[ir, ic], True, comment)) 
return asm def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: @@ -230,53 +272,79 @@ def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block for ic in range(cols): for ir in range(rows): - asm.add(mov(0, registers[ir,ic], True)) + asm.add(mov(0, registers[ir, ic], True)) return asm - - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size:int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. 
""" asm = block("Block GEMM microkernel") - bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) - bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) - assert(bm % v_size == 0) + bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) + bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) + assert bm % v_size == 0 mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) if self.preloadA: - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False, temp=B_regs[0,0])) + asm.add( + self.move_register_block( + A, + A_ptr, + to_A_block, + A_regs, + v_size, + additional_regs, + mask, + store=False, + temp=B_regs[0, 0], + ) + ) else: - asm.add(self.move_register_single(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, 0, 0, mask, store=False)) + asm.add( + self.move_register_single( + A, + A_ptr, + to_A_block, + A_regs, + v_size, + additional_regs, + 0, + 0, + mask, + store=False, + ) + ) Vm = self.ceil_div(bm, v_size) bs = [] bsv = [] for Vmi in range(Vm): - for bni in range(bn): # inside this n-block - for bki in range(bk): # inside this k-block + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): B_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) self.reg_based_scaling(asm, B_addr, additional_regs) if B_regs[bki, bni] not in bs: @@ -287,15 +355,39 @@ def make_microkernel(self, # just to make sure we do not use registers differently in a block assert bsv[bs.index(B_regs[bki, bni])].ugly == B_addr.ugly - for bki in range(bk): # inside this k-block + for bki in range(bk): # inside this k-block for Vmi 
in range(Vm): - if not self.preloadA and not (Vmi, bki) == (0,0): - asm.add(self.move_register_single(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, Vmi, bki, mask, store=False)) - for bni in range(bn): # inside this n-block + if not self.preloadA and not (Vmi, bki) == (0, 0): + asm.add( + self.move_register_single( + A, + A_ptr, + to_A_block, + A_regs, + v_size, + additional_regs, + Vmi, + bki, + mask, + store=False, + ) + ) + for bni in range(bn): # inside this n-block to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): _, B_comment = B.look(B_ptr, to_B_block, to_bcell) comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=None, sub=sub)) + asm.add( + fma( + B_regs[bki, bni], + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + bcast=None, + sub=sub, + ) + ) return asm diff --git a/pypspamm/codegen/architectures/hsw/inlineprinter.py b/pypspamm/codegen/architectures/hsw/inlineprinter.py index 3141e51..c5f9388 100644 --- a/pypspamm/codegen/architectures/hsw/inlineprinter.py +++ b/pypspamm/codegen/architectures/hsw/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List + from pypspamm.codegen.ast import * -from pypspamm.codegen.visitor import Visitor from pypspamm.codegen.operands import * from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -16,27 +17,20 @@ class InlinePrinter(Visitor): output = None stack = None - def __init__(self, precision: Precision): self.output = [] self.stack = [] assert precision in (Precision.SINGLE, 
Precision.DOUBLE) self.precision = precision - self.psuffix = { - Precision.DOUBLE: "d", - Precision.SINGLE: "s" - }[precision] - self.bpsuffix = { - Precision.DOUBLE: "q", - Precision.SINGLE: "d" - }[precision] + self.psuffix = {Precision.DOUBLE: "d", Precision.SINGLE: "s"}[precision] + self.bpsuffix = {Precision.DOUBLE: "q", Precision.SINGLE: "d"}[precision] def show(self): print("\n".join(self.output)) def addLine(self, stmt: str, comment: str): - line = " "*self.lmargin + self.indent*self.depth + line = " " * self.lmargin + self.indent * self.depth if stmt is not None and comment is not None and self.show_comments: stmt = '"' + stmt + '\\r\\n"' @@ -76,7 +70,11 @@ def visitBcst(self, stmt: BcstStmt): # reformat bcast_src to be a memory address b = f"0({b})" regsize = stmt.dest.size() - instruction = "vmovddup" if self.precision == Precision.DOUBLE and regsize == 16 else f"vbroadcasts{self.psuffix}" + instruction = ( + "vmovddup" + if self.precision == Precision.DOUBLE and regsize == 16 + else f"vbroadcasts{self.psuffix}" + ) s = f"{instruction} {b}, {a}" self.addLine(s, stmt.comment) @@ -88,7 +86,7 @@ def visitAdd(self, stmt: AddStmt): self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - self.addLine('.align 16', 'Align label') + self.addLine(".align 16", "Align label") s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) @@ -114,21 +112,40 @@ def visitMov(self, stmt: MovStmt): s = f"vxorps {stmt.dest.ugly_xmm}, {stmt.dest.ugly_xmm}, {stmt.dest.ugly_xmm}" self.addLine(s, stmt.comment) elif stmt.pred is not None: - self.addLine(f"vpxor {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.dest.ugly}", "") - self.addLine(f"vpblendd {src_str}, {stmt.dest.ugly}, {stmt.pred}, {stmt.dest.ugly}", "") + self.addLine( + f"vpxor {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.dest.ugly}", "" + ) + self.addLine( + f"vpblendd {src_str}, {stmt.dest.ugly}, {stmt.pred}, {stmt.dest.ugly}", + "", + ) elif stmt.expand: # TODO: unfinished - self.addLine(f"vpxor 
{stmt.temp.ugly}, {stmt.temp.ugly}, {stmt.temp.ugly}") + self.addLine( + f"vpxor {stmt.temp.ugly}, {stmt.temp.ugly}, {stmt.temp.ugly}" + ) regsize = stmt.dest.size() if self.precision == Precision.SINGLE and regsize == 32: self.addLine(f"vmovq {stmt.pred.ugly}, {stmt.dest.ugly_xmm}", "") - self.addLine(f"vpmovzxb{self.bpsuffix} {stmt.dest.ugly_xmm}, {stmt.dest.ugly}", "") - self.addLine(f"vpermd {src_str}, {stmt.dest.ugly}, {stmt.dest.ugly}", "") + self.addLine( + f"vpmovzxb{self.bpsuffix} {stmt.dest.ugly_xmm}, {stmt.dest.ugly}", + "", + ) + self.addLine( + f"vpermd {src_str}, {stmt.dest.ugly}, {stmt.dest.ugly}", "" + ) elif regsize == 16: - self.addLine(f"vpermilps {src_str}, MISSING_PREDICATE, {stmt.dest.ugly}", "") + self.addLine( + f"vpermilps {src_str}, MISSING_PREDICATE, {stmt.dest.ugly}", "" + ) elif self.precision == Precision.DOUBLE: - self.addLine(f"vpermpd {src_str}, MISSING_PREDICATE, {stmt.dest.ugly}", "") - self.addLine(f"vpblendd {stmt.temp.ugly}, {stmt.dest.ugly}, MISSING_PREDICATE, {stmt.dest.ugly}", "") + self.addLine( + f"vpermpd {src_str}, MISSING_PREDICATE, {stmt.dest.ugly}", "" + ) + self.addLine( + f"vpblendd {stmt.temp.ugly}, {stmt.dest.ugly}, MISSING_PREDICATE, {stmt.dest.ugly}", + "", + ) else: s = f"vmovup{self.psuffix} {src_str}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) @@ -152,7 +169,7 @@ def visitPrefetch(self, stmt: PrefetchStmt): def visitBlock(self, block: Block): self.stack.append(block) self.depth += 1 - if self.show_comments and block.comment != '': + if self.show_comments and block.comment != "": self.addLine(None, block.comment) for stmt in block.contents: stmt.accept(self) diff --git a/pypspamm/codegen/architectures/hsw/operands.py b/pypspamm/codegen/architectures/hsw/operands.py index 84a693f..72e33b5 100644 --- a/pypspamm/codegen/architectures/hsw/operands.py +++ b/pypspamm/codegen/architectures/hsw/operands.py @@ -20,14 +20,14 @@ def c(n): return Constant_HSW(value=int(n)) - class Label_HSW(Label): @property def 
ugly(self): - #return self.ordinal + # return self.ordinal return self.value.upper() + "_%=" + def l(label: str): return Label_HSW(label) @@ -37,7 +37,7 @@ class Register_HSW(Register): @property def ugly(self): return "%%" + self.value - + @property def ugly_xmm(self): return "%%x" + self.value[1:] @@ -50,20 +50,16 @@ def ugly_xmm(self): rdi = Register_HSW(AsmType.i64, "rdi") rsi = Register_HSW(AsmType.i64, "rsi") -r = lambda n: Register_HSW(AsmType.i64, "r"+str(n)) if n > 7 else gen_regs[n] -xmm = lambda n: Register_HSW(AsmType.f64x2, "xmm"+str(n)) -ymm = lambda n: Register_HSW(AsmType.f64x4, "ymm"+str(n)) - - +r = lambda n: Register_HSW(AsmType.i64, "r" + str(n)) if n > 7 else gen_regs[n] +xmm = lambda n: Register_HSW(AsmType.f64x2, "xmm" + str(n)) +ymm = lambda n: Register_HSW(AsmType.f64x4, "ymm" + str(n)) class MemoryAddress_HSW(MemoryAddress): - - def __init__(self, - base: Register, - disp: int, - index: Register = None, - scaling: int = None) -> None: + + def __init__( + self, base: Register, disp: int, index: Register = None, scaling: int = None + ) -> None: self.base = base self.disp = disp self.index = index @@ -74,15 +70,10 @@ def ugly(self): if self.index is None: return f"{self.disp}({self.base.ugly})" return f"{self.disp}({self.base.ugly},{self.index.ugly},{self.scaling})" - + def registers(self): return [self.base, self.index] + def mem(base, offset, index=None, scaling=None): return MemoryAddress_HSW(base, offset, index, scaling) - - - - - - diff --git a/pypspamm/codegen/architectures/knl/blocksize.py b/pypspamm/codegen/architectures/knl/blocksize.py index b9cd420..8c3afee 100644 --- a/pypspamm/codegen/architectures/knl/blocksize.py +++ b/pypspamm/codegen/architectures/knl/blocksize.py @@ -4,16 +4,16 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bm = m bn = n - + if cls.KNL_condition(bm, bn, bk, v_size): - while cls.KNL_condition(bm, bn, bk+1, v_size): + while cls.KNL_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn) while not 
cls.KNL_condition(bm, bn, bk, v_size): bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) - while cls.KNL_condition(bm, bn, bk+1, v_size): + while cls.KNL_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn) @@ -35,7 +35,8 @@ def lowerToNextDiv(cls, m, n, bm, bn, v_size): def KNL_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 + return (bn + bk) * vm <= 32 + class Max: @classmethod @@ -45,16 +46,16 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = 1 maxval = 0 - for i in range(1, m+1): + for i in range(1, m + 1): next_multiple = -(i // -v_size) - for j in range(1, n+1): + for j in range(1, n + 1): if cls.KNL_condition(next_multiple, j, bk, v_size): - if i*j >= maxval: - maxval = i*j + if i * j >= maxval: + maxval = i * j bm = i - bn = j - - while cls.KNL_condition(bm, bn, bk+1, v_size): + bn = j + + while cls.KNL_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -63,12 +64,13 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def KNL_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 + return (bn + bk) * vm <= 32 @classmethod def tileable(cls, m, bm): return m % bm == 0 + class MaxBn: @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): @@ -76,11 +78,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bm = v_size bn = 1 - for j in range(1, n+1): + for j in range(1, n + 1): if cls.KNL_condition(bm, j, bk, v_size): bn = j - while cls.KNL_condition(bm, bn, bk+1, v_size): + while cls.KNL_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -89,7 +91,8 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def KNL_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 + return (bn + bk) * vm <= 32 + class CubeBn: @classmethod @@ -100,11 +103,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): maxval = 0 - for j in range(1, n+1): + for j in range(1, n + 1): for k in 
range(1, 200): if cls.KNL_condition(bm, j, k, v_size): - if j*k >= maxval: - maxval = j*k + if j * k >= maxval: + maxval = j * k bn = j bk = k @@ -114,6 +117,7 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def KNL_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 + return (bn + bk) * vm <= 32 + Default = MaxBn diff --git a/pypspamm/codegen/architectures/knl/generator.py b/pypspamm/codegen/architectures/knl/generator.py index 4269365..8da717b 100644 --- a/pypspamm/codegen/architectures/knl/generator.py +++ b/pypspamm/codegen/architectures/knl/generator.py @@ -1,11 +1,11 @@ -from pypspamm.cursors import * - from pypspamm.codegen.architectures.knl.operands import * from pypspamm.codegen.ast import * -from pypspamm.codegen.sugar import * from pypspamm.codegen.generator import * from pypspamm.codegen.precision import * from pypspamm.codegen.regcache import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * + class Generator(AbstractGenerator): template = """ @@ -26,7 +26,7 @@ class Generator(AbstractGenerator): }} """ v_len = 4 - predicates = {0:kmask(0)} + predicates = {0: kmask(0)} def get_v_size(self): return (16 // self.precision.size()) * self.v_len @@ -39,7 +39,7 @@ def use_broadcast(self): def has_masks(self): return True - + def scale_base(self): # larger scaling range for B inline broadcasts return self.precision.size() * 256 @@ -47,44 +47,52 @@ def scale_base(self): def pred_n_trues(self, count, v_size, mode): # a bit hacky at the moment (won't work for all masks) if count < v_size: - return Predicate(self.predicates[count], mode=='z') + return Predicate(self.predicates[count], mode == "z") else: return None - + def make_argument_load(self, starting_regs, prefetch): asm = block("Load arguments") - asm.add(mov(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - asm.add(mov(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(mov(InputOperand(f'2', 'm', 'C'), 
starting_regs[2], False)) - asm.add(mov(InputOperand(f'3', 'm', 'alpha_p'), starting_regs[3], False)) - asm.add(mov(InputOperand(f'4', 'm', 'beta_p'), starting_regs[4], False)) + asm.add(mov(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(mov(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(mov(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(mov(InputOperand(f"3", "m", "alpha_p"), starting_regs[3], False)) + asm.add(mov(InputOperand(f"4", "m", "beta_p"), starting_regs[4], False)) if prefetch: - asm.add(mov(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) + asm.add(mov(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) return asm - def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int, prefetch: str): + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): vm = self.ceil_div(bm, v_size) - assert((bn+bk) * vm <= 32) # Needs to fit in AVX512 xmm/ymm/zmm registers + assert (bn + bk) * vm <= 32 # Needs to fit in AVX512 xmm/ymm/zmm registers - vmm = { - 1: xmm, - 2: ymm, - 4: zmm - }[self.v_len] + vmm = {1: xmm, 2: ymm, 4: zmm}[self.v_len] - A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) + A_regs = Matrix([[vmm(vm * c + r) for c in range(bk)] for r in range(vm)]) B_regs = Matrix([[]]) - C_regs = Matrix([[vmm(32 - vm*bn + vm*c + r) for c in range(bn)] - for r in range(vm)]) + C_regs = Matrix( + [[vmm(32 - vm * bn + vm * c + r) for c in range(bn)] for r in range(vm)] + ) starting_regs = [rdi, rsi, rdx, rbx, rcx] alpha_reg = [rbx, rbx] beta_reg = [rcx, rcx] - additional_regs = [r(9),r(10),r(11),r(15),rax] # ,r(13),r(14) + additional_regs = [r(9), r(10), r(11), r(15), rax] # ,r(13),r(14) - prefetch_reg = prefetch == 'BL2viaC' + prefetch_reg = prefetch == "BL2viaC" if prefetch_reg: starting_regs += [r(8)] else: @@ -101,7 +109,18 @@ def 
make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: self.predicates[rest2] = kmask(2) self.predicates[0] = kmask(0) - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs, prefetch_reg + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + mask_regs, + prefetch_reg, + ) def init_mask(self, m, bm, v_size, tempreg, maskregs): rest = bm % v_size @@ -117,23 +136,22 @@ def init_mask(self, m, bm, v_size, tempreg, maskregs): asm.add(mov(tempreg, maskregs[1], False)) return asm - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: asm = block("Optimize usage of offsets when accessing B Matrix") scale = self.scale_base() for i in range(1, len(additional_regs)): - asm.add(mov(c((2*i-1) * scale), additional_regs[i], False)) - + asm.add(mov(c((2 * i - 1) * scale), additional_regs[i], False)) + return asm def init_block(self, size): return block("") - def reg_based_scaling(self, asm, addr: MemoryAddress, additional_regs: List[Register]): + def reg_based_scaling( + self, asm, addr: MemoryAddress, additional_regs: List[Register] + ): halfscale = self.scale_base() // 2 if addr.disp >= halfscale: base = (addr.disp + halfscale) // self.scale_base() @@ -148,20 +166,21 @@ def reg_based_scaling(self, asm, addr: MemoryAddress, additional_regs: List[Regi addr.scaling = scaling addr.disp = ((addr.disp + halfscale) % self.scale_base()) - halfscale - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None - ) -> Block: + def move_register_block( + self, + 
cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + ) -> Block: rows, cols = registers.shape action = "Store" if store else "Load" @@ -175,15 +194,21 @@ def move_register_block(self, for ic in range(cols): for ir in range(rows): - if (mask is None) or (mask[ir,ic]): + if (mask is None) or (mask[ir, ic]): # no register-based scaling here (for now) processed = ir * process_size size = min(process_size, b_row - processed) - all_coords = [Coords(down=ir*process_size+i,right=ic) for i in range(size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] + all_coords = [ + Coords(down=ir * process_size + i, right=ic) + for i in range(size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] if any(has_nonzero): contiguous = True firsti = 0 @@ -203,7 +228,9 @@ def move_register_block(self, lasti = i if lasti is None: lasti = size - addr, comment = cursor.look(cursor_ptr, block_offset, all_coords[firsti]) + addr, comment = cursor.look( + cursor_ptr, block_offset, all_coords[firsti] + ) addr.disp += self.precision.size() * load_offset # assume contiguous memory here @@ -219,10 +246,14 @@ def move_register_block(self, maskFound = True else: # mostly implemented, but there are still bugs - raise NotImplementedError("Element-wise sparsity in A is not yet implemented") + raise NotImplementedError( + "Element-wise sparsity in A is not yet implemented" + ) else: - raise NotImplementedError("Element-wise sparsity in A is not yet implemented") - + raise NotImplementedError( + "Element-wise sparsity in A is not yet implemented" + ) + if not maskFound: maskreg, needsAssign = maskcache.get(bitmask) if needsAssign: @@ -231,13 
+262,33 @@ def move_register_block(self, pred = Predicate(maskreg, True) if store: - asm.add(mov(registers[ir,ic], addr, True, comment, pred=pred, expand=needsExpand)) - if prefetching == 'BL2viaC' and pf_cursor is not None: - addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, all_coords[firsti]) + asm.add( + mov( + registers[ir, ic], + addr, + True, + comment, + pred=pred, + expand=needsExpand, + ) + ) + if prefetching == "BL2viaC" and pf_cursor is not None: + addr, comment = pf_cursor.look( + pf_cursor_ptr, block_offset, all_coords[firsti] + ) addr.disp += self.precision.size() * load_offset asm.add(prefetch(addr, closeness="L2")) else: - asm.add(mov(addr, registers[ir,ic], True, comment, pred=pred, expand=needsExpand)) + asm.add( + mov( + addr, + registers[ir, ic], + True, + comment, + pred=pred, + expand=needsExpand, + ) + ) return asm def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: @@ -247,49 +298,65 @@ def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block for ic in range(cols): for ir in range(rows): - asm.add(mov(0, registers[ir,ic], True)) + asm.add(mov(0, registers[ir, ic], True)) return asm - - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size:int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. 
+ def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. """ asm = block("Block GEMM microkernel") - bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) - bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) - - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) + bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) + bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) + + mask = sparse_mask( + A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True + ) + asm.add( + self.move_register_block( + A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False + ) + ) Vm = max(self.ceil_div(bm, v_size), 1) - for bki in range(bk): # inside this k-block + for bki in range(bk): # inside this k-block for Vmi in range(Vm): - for bni in range(bn): # inside this n-block + for bni in range(bn): # inside this n-block to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): B_addr, B_comment = B.look(B_ptr, to_B_block, 
to_bcell) self.reg_based_scaling(asm, B_addr, additional_regs) comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" - asm.add(fma(B_addr, A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=0, sub=sub)) + asm.add( + fma( + B_addr, + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + bcast=0, + sub=sub, + ) + ) return asm diff --git a/pypspamm/codegen/architectures/knl/inlineprinter.py b/pypspamm/codegen/architectures/knl/inlineprinter.py index 22fc81a..185f088 100644 --- a/pypspamm/codegen/architectures/knl/inlineprinter.py +++ b/pypspamm/codegen/architectures/knl/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List + from pypspamm.codegen.ast import * -from pypspamm.codegen.visitor import Visitor from pypspamm.codegen.operands import * from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -16,23 +17,27 @@ class InlinePrinter(Visitor): output = None stack = None - def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) + assert precision in ( + Precision.BFLOAT16, + Precision.HALF, + Precision.SINGLE, + Precision.DOUBLE, + ) self.precision = precision self.psuffix = { - Precision.DOUBLE: 'd', - Precision.SINGLE: 's', - Precision.HALF: 'h', - Precision.BFLOAT16: 'h' + Precision.DOUBLE: "d", + Precision.SINGLE: "s", + Precision.HALF: "h", + Precision.BFLOAT16: "h", }[precision] self.alupsuffix = { - Precision.DOUBLE: 'pd', - Precision.SINGLE: 'ps', - Precision.HALF: 'ph', - Precision.BFLOAT16: 'nepbf16' + Precision.DOUBLE: "pd", + Precision.SINGLE: "ps", + Precision.HALF: "ph", + Precision.BFLOAT16: "nepbf16", }[precision] self.bpsuffix = { Precision.DOUBLE: "q", @@ -44,16 +49,15 @@ def __init__(self, precision: Precision): Precision.DOUBLE: 2, Precision.SINGLE: 4, Precision.HALF: 8, - Precision.BFLOAT16: 8 + 
Precision.BFLOAT16: 8, }[precision] def show(self): print("\n".join(self.output)) - def addLine(self, stmt: str, comment: str): - line = " "*self.lmargin + self.indent*self.depth + line = " " * self.lmargin + self.indent * self.depth if stmt is not None and comment is not None and self.show_comments: stmt = '"' + stmt + '\\r\\n"' @@ -67,13 +71,13 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) - def maskformat(self, pred, ignoreZero = False): + def maskformat(self, pred, ignoreZero=False): if pred is None: - return '' + return "" elif pred.zero and not ignoreZero: - return f'%{{{pred.register.ugly}%}}%{{z%}}' + return f"%{{{pred.register.ugly}%}}%{{z%}}" else: - return f'%{{{pred.register.ugly}%}}' + return f"%{{{pred.register.ugly}%}}" def visitFma(self, stmt: FmaStmt): mask = self.maskformat(stmt.pred) @@ -113,9 +117,9 @@ def visitBcst(self, stmt: BcstStmt): a = stmt.dest.ugly regsize = stmt.dest.size() if self.precision == Precision.HALF or self.precision == Precision.BFLOAT16: - instruction = 'vpbroadcastw' + instruction = "vpbroadcastw" elif self.precision == Precision.DOUBLE and regsize == 16: - instruction = 'vmovddup' + instruction = "vmovddup" else: instruction = f"vbroadcasts{self.psuffix}" s = f"{instruction} {b}, {a} {mask}" @@ -125,13 +129,13 @@ def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: # avoid 0 instructions return - + # only used for scalar addition right now s = f"addq {stmt.src.ugly}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - self.addLine('.align 16', 'Align label') + self.addLine(".align 16", "Align label") s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) @@ -153,15 +157,15 @@ def visitMov(self, stmt: MovStmt): src_str = stmt.src.ugly if stmt.typ == AsmType.i64: - assert(stmt.pred == None) + assert stmt.pred == None # FIXME: no hack - if stmt.dest.ugly[2] == 'k': + if stmt.dest.ugly[2] == "k": s = f"kmovq {src_str}, 
{stmt.dest.ugly}" else: s = f"movq {src_str}, {stmt.dest.ugly}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if isinstance(stmt.src, Constant) and stmt.src.value == 0: - suffix = 'd' if self.bpsuffix == 'w' else self.bpsuffix + suffix = "d" if self.bpsuffix == "w" else self.bpsuffix s = f"vpxor{suffix} {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.dest.ugly} {mask}" elif stmt.expand: if isinstance(stmt.src, MemoryAddress): @@ -169,7 +173,7 @@ def visitMov(self, stmt: MovStmt): else: s = f"vpcompress{self.bpsuffix} {src_str}, {stmt.dest.ugly} {mask}" else: - if self.bpsuffix == 'w' and stmt.pred is not None: + if self.bpsuffix == "w" and stmt.pred is not None: instr = "vmovsh" else: instr = f"vmovup{self.psuffix}" @@ -198,7 +202,7 @@ def visitPrefetch(self, stmt: PrefetchStmt): def visitBlock(self, block: Block): self.stack.append(block) self.depth += 1 - if self.show_comments and block.comment != '': + if self.show_comments and block.comment != "": self.addLine(None, block.comment) for stmt in block.contents: stmt.accept(self) diff --git a/pypspamm/codegen/architectures/knl/operands.py b/pypspamm/codegen/architectures/knl/operands.py index 03b6613..572f893 100644 --- a/pypspamm/codegen/architectures/knl/operands.py +++ b/pypspamm/codegen/architectures/knl/operands.py @@ -20,14 +20,14 @@ def c(n): return Constant_KNL(value=int(n)) - class Label_KNL(Label): @property def ugly(self): - #return self.ordinal + # return self.ordinal return self.value.upper() + "_%=" + def l(label: str): return Label_KNL(label) @@ -38,13 +38,12 @@ class Register_KNL(Register): def ugly(self): return "%%" + self.value + class MemoryAddress_KNL(MemoryAddress): - - def __init__(self, - base: Register, - disp: int, - index: Register = None, - scaling: int = None) -> None: + + def __init__( + self, base: Register, disp: int, index: Register = None, scaling: int = None + ) -> None: self.base = base self.disp = disp self.index = index @@ -59,10 +58,11 @@ def ugly(self): @property def 
clobbered(self): return self.base.clobbered - + def registers(self): return [self.base, self.index] + def mem(base, offset, index=None, scaling=None): return MemoryAddress_KNL(base, offset, index, scaling) @@ -74,25 +74,26 @@ def mem(base, offset, index=None, scaling=None): rdi = Register_KNL(AsmType.i64, "rdi") rsi = Register_KNL(AsmType.i64, "rsi") -r = lambda n: Register_KNL(AsmType.i64, "r"+str(n)) if n > 7 else gen_regs[n] -xmm = lambda n: Register_KNL(AsmType.f64x2, "xmm"+str(n)) -ymm = lambda n: Register_KNL(AsmType.f64x4, "ymm"+str(n)) -zmm = lambda n: Register_KNL(AsmType.f64x8, "zmm"+str(n)) -kmask= lambda n: Register_KNL(AsmType.i64, "k"+str(n)) +r = lambda n: Register_KNL(AsmType.i64, "r" + str(n)) if n > 7 else gen_regs[n] +xmm = lambda n: Register_KNL(AsmType.f64x2, "xmm" + str(n)) +ymm = lambda n: Register_KNL(AsmType.f64x4, "ymm" + str(n)) +zmm = lambda n: Register_KNL(AsmType.f64x8, "zmm" + str(n)) +kmask = lambda n: Register_KNL(AsmType.i64, "k" + str(n)) + class Predicate: def __init__(self, register: Register_KNL, zero: bool): self.register = register self.zero = zero - + @property def ugly(self): # TODO? 
return self.register.ugly - + @property def clobbered(self): return self.register.clobbered - + def registers(self): return [self.register] diff --git a/pypspamm/codegen/architectures/lsx/blocksize.py b/pypspamm/codegen/architectures/lsx/blocksize.py index db4fa3c..1df01e6 100644 --- a/pypspamm/codegen/architectures/lsx/blocksize.py +++ b/pypspamm/codegen/architectures/lsx/blocksize.py @@ -5,17 +5,19 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = 1 maxval = 0 - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): # can be replaced by cls.LSX_condition_extended here # (but that seemed to be slower in the end) if cls.LSX_condition(i, j, bk, v_size): - if i*j > maxval and (cls.LSX_condition(i, j, bk, v_size) or j > 1): - maxval = i*j + if i * j > maxval and ( + cls.LSX_condition(i, j, bk, v_size) or j > 1 + ): + maxval = i * j bm = i - bn = j + bn = j - while cls.LSX_condition(bm, bn, bk+1, v_size): + while cls.LSX_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -26,4 +28,5 @@ def LSX_condition(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return (bn + bk) * vm + bn * bk <= 32 + Default = Max diff --git a/pypspamm/codegen/architectures/lsx/generator.py b/pypspamm/codegen/architectures/lsx/generator.py index a3625b8..3a576b7 100644 --- a/pypspamm/codegen/architectures/lsx/generator.py +++ b/pypspamm/codegen/architectures/lsx/generator.py @@ -1,11 +1,11 @@ -from pypspamm.cursors import * - from pypspamm.codegen.architectures.lsx.operands import * from pypspamm.codegen.ast import * -from pypspamm.codegen.sugar import * from pypspamm.codegen.generator import * from pypspamm.codegen.precision import * from pypspamm.codegen.regcache import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * + class Generator(AbstractGenerator): template = """ @@ -41,32 +41,43 @@ def init_mask(self, m, bm, v_size, tempreg, maskregs): def make_argument_load(self, 
starting_regs, prefetch): asm = block("Load arguments") - asm.add(ld(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - asm.add(ld(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(ld(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) - asm.add(ld(InputOperand(f'3', 'm', 'alpha'), starting_regs[3], False)) - asm.add(ld(InputOperand(f'4', 'm', 'beta'), starting_regs[4], False)) + asm.add(ld(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(ld(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(ld(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(ld(InputOperand(f"3", "m", "alpha"), starting_regs[3], False)) + asm.add(ld(InputOperand(f"4", "m", "beta"), starting_regs[4], False)) if prefetch: - asm.add(ld(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) + asm.add(ld(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) return asm - def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int, prefetch: str): - assert(bm % v_size == 0) + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): + assert bm % v_size == 0 vm = self.ceil_div(bm, v_size) assert (bn + bk) * vm + bn * bk <= 32 - vmm = { - 1: vr, - 2: xr - }[self.v_len] + vmm = {1: vr, 2: xr}[self.v_len] - A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) - Aoffset = vm*bk - - B_regs = Matrix([[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[vmm(32 - vm*bn + vm*c + r) for c in range(bn)] - for r in range(vm)]) + A_regs = Matrix([[vmm(vm * c + r) for c in range(bk)] for r in range(vm)]) + Aoffset = vm * bk + + B_regs = Matrix( + [[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)] + ) + C_regs = Matrix( + [[vmm(32 - vm * bn + vm * c + r) for c in range(bn)] for r in range(vm)] + ) b_reg = Aoffset alpha_reg = [vmm(b_reg)] * 2 @@ 
-78,34 +89,43 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: loop_regs = [r(28), r(29), r(30)] - prefetch_reg = prefetch == 'BL2viaC' - - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [], prefetch_reg - - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: + prefetch_reg = prefetch == "BL2viaC" + + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + [], + prefetch_reg, + ) + + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: return block("") def init_block(self, size): return block("") - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None, - temp = None - ) -> Block: + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + temp=None, + ) -> Block: rows, cols = registers.shape action = "Store" if store else "Load" @@ -116,12 +136,19 @@ def move_register_block(self, for ic in range(cols): for ir in range(rows): - if (mask is None) or (mask[ir,ic]): - all_coords = [Coords(down=ir*v_size+i,right=ic) for i in range(v_size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] + if (mask is None) or (mask[ir, ic]): + all_coords = [ + Coords(down=ir * v_size + i, right=ic) for i in range(v_size) + ] + has_nonzero = [ + 
cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] if all(has_nonzero): cell_offset = all_coords[0] - addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) + addr, comment = cursor.look( + cursor_ptr, block_offset, cell_offset + ) addr.disp += self.precision.size() * load_offset needsmove = False if addr.disp > max_offs: @@ -129,30 +156,43 @@ def move_register_block(self, if moved > 0 and moved <= max_offs: addr.disp = moved else: - asm.add(add(addr.disp, additional_regs[0], "", addr.base)) + asm.add( + add(addr.disp, additional_regs[0], "", addr.base) + ) cur11 = addr.disp addr.disp = 0 needsmove = True addr.base = additional_regs[0] if store: - asm.add(st(registers[ir,ic], addr, True, comment)) - if prefetching == 'BL2viaC' and pf_cursor is not None: - addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) + asm.add(st(registers[ir, ic], addr, True, comment)) + if prefetching == "BL2viaC" and pf_cursor is not None: + addr, comment = pf_cursor.look( + pf_cursor_ptr, block_offset, cell_offset + ) addr.disp += self.precision.size() * load_offset if addr.disp > max_offs: moved = addr.disp - cur11 if needsmove: - asm.add(add(addr.disp, additional_regs[3], "", addr.base)) + asm.add( + add( + addr.disp, + additional_regs[3], + "", + addr.base, + ) + ) addr.disp = 0 else: addr.disp = moved addr.base = additional_regs[3] asm.add(prefetch(addr, closeness="L2")) else: - asm.add(ld(addr, registers[ir,ic], True, comment)) + asm.add(ld(addr, registers[ir, ic], True, comment)) elif any(has_nonzero): - raise NotImplementedError("Element-wise sparsity in A is not yet fully implemented.") + raise NotImplementedError( + "Element-wise sparsity in A is not yet fully implemented." 
+ ) return asm def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: @@ -162,39 +202,50 @@ def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block for ic in range(cols): for ir in range(rows): - asm.add(mov(0, registers[ir,ic], True)) + asm.add(mov(0, registers[ir, ic], True)) return asm - - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size:int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. 
""" asm = block("Block GEMM microkernel") - bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) - bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) - assert(bm % v_size == 0) + bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) + bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) + assert bm % v_size == 0 mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False, temp=B_regs[0,0])) + asm.add( + self.move_register_block( + A, + A_ptr, + to_A_block, + A_regs, + v_size, + additional_regs, + mask, + store=False, + temp=B_regs[0, 0], + ) + ) Vm = self.ceil_div(bm, v_size) cur11 = 0 @@ -202,11 +253,13 @@ def make_microkernel(self, bs = [] for Vmi in range(Vm): - for bni in range(bn): # inside this n-block - for bki in range(bk): # inside this k-block + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) if B_regs[bki, bni] not in bs: # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value @@ -215,22 +268,40 @@ def make_microkernel(self, if moved > 0 and moved <= max_offs: B_cell_addr.disp = moved else: - asm.add(add(B_cell_addr.disp, additional_regs[0], "", B_cell_addr.base)) + asm.add( + add( + B_cell_addr.disp, + additional_regs[0], + "", + B_cell_addr.base, + ) + ) cur11 = B_cell_addr.disp B_cell_addr.disp = 0 B_cell_addr.base = additional_regs[0] - + asm.add(bcst(B_cell_addr, B_regs[bki, bni], B_comment)) bs.append(B_regs[bki, bni]) - 
for bki in range(bk): # inside this k-block + for bki in range(bk): # inside this k-block for Vmi in range(Vm): - for bni in range(bn): # inside this n-block + for bni in range(bn): # inside this n-block to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): _, B_comment = B.look(B_ptr, to_B_block, to_bcell) comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=None, sub=sub)) + asm.add( + fma( + B_regs[bki, bni], + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + bcast=None, + sub=sub, + ) + ) return asm diff --git a/pypspamm/codegen/architectures/lsx/inlineprinter.py b/pypspamm/codegen/architectures/lsx/inlineprinter.py index 0926e79..199d9d3 100644 --- a/pypspamm/codegen/architectures/lsx/inlineprinter.py +++ b/pypspamm/codegen/architectures/lsx/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List + from pypspamm.codegen.ast import * -from pypspamm.codegen.visitor import Visitor from pypspamm.codegen.operands import * from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -16,27 +17,20 @@ class InlinePrinter(Visitor): output = None stack = None - def __init__(self, precision: Precision): self.output = [] self.stack = [] assert precision in (Precision.SINGLE, Precision.DOUBLE) self.precision = precision - self.psuffix = { - Precision.DOUBLE: "d", - Precision.SINGLE: "s" - }[precision] - self.bpsuffix = { - Precision.DOUBLE: "d", - Precision.SINGLE: "w" - }[precision] + self.psuffix = {Precision.DOUBLE: "d", Precision.SINGLE: "s"}[precision] + 
self.bpsuffix = {Precision.DOUBLE: "d", Precision.SINGLE: "w"}[precision] def show(self): print("\n".join(self.output)) def addLine(self, stmt: str, comment: str): - line = " "*self.lmargin + self.indent*self.depth + line = " " * self.lmargin + self.indent * self.depth if stmt is not None and comment is not None and self.show_comments: stmt = '"' + stmt + '\\r\\n"' @@ -51,16 +45,13 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) def prefix(self, register): - return { - 16: "v", - 32: "xv" - }[register.size()] - + return {16: "v", 32: "xv"}[register.size()] + def iname(self, root, refreg, bp): prefix = self.prefix(refreg) suffix = self.bpsuffix if bp else self.psuffix return f"{prefix}{root}.{suffix}" - + def to_addi(self, value): ADDILENGTH = 12 ADDIBLOCK = (1 << ADDILENGTH) - 1 @@ -98,9 +89,9 @@ def visitBcst(self, stmt: BcstStmt): a = stmt.dest.ugly # check if we broadcast a general register if isinstance(stmt.bcast_src, Register): - instruction = self.iname('replgr2vr', stmt.dest, True) + instruction = self.iname("replgr2vr", stmt.dest, True) else: - instruction = self.iname('ldrepl', stmt.dest, True) + instruction = self.iname("ldrepl", stmt.dest, True) s = f"{instruction} {a}, {b}" self.addLine(s, stmt.comment) @@ -108,7 +99,9 @@ def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: # avoid 0 instructions return - if isinstance(stmt.src, Constant) and (stmt.src.value > 2047 or stmt.src.value < -2048): + if isinstance(stmt.src, Constant) and ( + stmt.src.value > 2047 or stmt.src.value < -2048 + ): # we need an intermediate register here # TODO: do not hard-code x5 here, make well-defined @@ -118,18 +111,32 @@ def visitAdd(self, stmt: AddStmt): addival, luival = self.to_addi(-stmt.src.value) else: addival, luival = self.to_addi(stmt.src.value) - self.addLine(f"lu12i.w {itmp}, {luival}", f"Intermediate add: place upper 12 bits of {stmt.src.value}") + self.addLine( + f"lu12i.w {itmp}, {luival}", + 
f"Intermediate add: place upper 12 bits of {stmt.src.value}", + ) if addival != 0: - self.addLine(f"addi.d {itmp}, {itmp}, {addival}", f"Intermediate add: place lower 12 bits of {stmt.src.value}") + self.addLine( + f"addi.d {itmp}, {itmp}, {addival}", + f"Intermediate add: place lower 12 bits of {stmt.src.value}", + ) if stmt.src.value < 0: - self.addLine(f"sub.d {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment) + self.addLine( + f"sub.d {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment + ) else: - self.addLine(f"add.d {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment) + self.addLine( + f"add.d {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment + ) else: # if stmt.src is a Constant but outside of the above range of value < -2048 or value > 2047 # we can simply add the Constant to a register - accumulate = stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly - self.addLine(f"addi.d {stmt.dest.ugly}, {accumulate}, {stmt.src.ugly}", stmt.comment) + accumulate = ( + stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly + ) + self.addLine( + f"addi.d {stmt.dest.ugly}, {accumulate}, {stmt.src.ugly}", stmt.comment + ) def visitLabel(self, stmt: LabelStmt): s = f"{stmt.label.ugly}:" @@ -145,24 +152,37 @@ def visitJump(self, stmt: JumpStmt): def visitMov(self, stmt: MovStmt): if isinstance(stmt.src, Constant): if stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4]: - assert stmt.src.ugly == '0' - self.addLine(f"{self.prefix(stmt.dest)}ldi {stmt.dest.ugly}, {stmt.src.ugly}", stmt.comment) + assert stmt.src.ugly == "0" + self.addLine( + f"{self.prefix(stmt.dest)}ldi {stmt.dest.ugly}, {stmt.src.ugly}", + stmt.comment, + ) else: if stmt.src.value < 2**12: - self.addLine(f"addi.w {stmt.dest.ugly}, $r0, {stmt.src.value}", stmt.comment) + self.addLine( + f"addi.w {stmt.dest.ugly}, $r0, {stmt.src.value}", stmt.comment + ) elif stmt.src.value < 2**32: addival, luival = self.to_addi(stmt.src.value) - self.addLine(f"lu12i.w 
{stmt.dest.ugly}, {luival}", "Intermediate mov: place upper 12 bits") + self.addLine( + f"lu12i.w {stmt.dest.ugly}, {luival}", + "Intermediate mov: place upper 12 bits", + ) if addival != 0: - self.addLine(f"addi.w {stmt.dest.ugly}, {stmt.dest.ugly}, {addival}", stmt.comment) + self.addLine( + f"addi.w {stmt.dest.ugly}, {stmt.dest.ugly}, {addival}", + stmt.comment, + ) else: raise NotImplementedError() elif isinstance(stmt.src, Register): if stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4]: - iname = self.iname('replgr2vr', stmt.dest, True) + iname = self.iname("replgr2vr", stmt.dest, True) self.addLine(f"{iname} {stmt.dest.ugly}, {stmt.src.ugly}", stmt.comment) else: - self.addLine(f"addi.w {stmt.dest.ugly}, {stmt.src.ugly}, 0", stmt.comment) + self.addLine( + f"addi.w {stmt.dest.ugly}, {stmt.src.ugly}, 0", stmt.comment + ) else: raise NotImplementedError() @@ -176,14 +196,14 @@ def visitPrefetch(self, stmt: PrefetchStmt): # TODO: maybe preldx here? s = f"preld {hint}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) - + def visitLoad(self, stmt: LoadStmt): if stmt.dest.typeinfo == AsmType.f64: s = f"fl{self.ugly_precision} {stmt.dest.ugly}, {stmt.src.ugly}" elif stmt.dest.typeinfo == AsmType.i64: s = f"ld.d {stmt.dest.ugly}, {stmt.src.ugly}" elif stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4] and stmt.aligned: - instr = f'{self.prefix(stmt.dest)}ld' + instr = f"{self.prefix(stmt.dest)}ld" s = f"{instr} {stmt.dest.ugly}, {stmt.src.ugly}" else: raise NotImplementedError() @@ -195,7 +215,7 @@ def visitStore(self, stmt: StoreStmt): elif stmt.src.typeinfo == AsmType.i64: s = f"st.d {stmt.src.ugly}, {stmt.dest.ugly}" elif stmt.src.typeinfo in [AsmType.f64x2, AsmType.f64x4] and stmt.aligned: - instr = f'{self.prefix(stmt.src)}st' + instr = f"{self.prefix(stmt.src)}st" s = f"{instr} {stmt.src.ugly}, {stmt.dest.ugly}" else: raise NotImplementedError() @@ -204,7 +224,7 @@ def visitStore(self, stmt: StoreStmt): def visitBlock(self, block: Block): 
self.stack.append(block) self.depth += 1 - if self.show_comments and block.comment != '': + if self.show_comments and block.comment != "": self.addLine(None, block.comment) for stmt in block.contents: stmt.accept(self) diff --git a/pypspamm/codegen/architectures/lsx/operands.py b/pypspamm/codegen/architectures/lsx/operands.py index 4921c9c..5d9d5bd 100644 --- a/pypspamm/codegen/architectures/lsx/operands.py +++ b/pypspamm/codegen/architectures/lsx/operands.py @@ -20,14 +20,14 @@ def c(n): return Constant_LSX(value=int(n)) - class Label_LSX(Label): @property def ugly(self): - #return self.ordinal + # return self.ordinal return self.value.upper() + "_%=" + def l(label: str): return Label_LSX(label) @@ -38,20 +38,17 @@ class Register_LSX(Register): def ugly(self): return "$" + self.value -r = lambda n: Register_LSX(AsmType.i64, "r"+str(n)) -vr = lambda n: Register_LSX(AsmType.f64x2, "vr"+str(n)) -xr = lambda n: Register_LSX(AsmType.f64x4, "xr"+str(n)) - +r = lambda n: Register_LSX(AsmType.i64, "r" + str(n)) +vr = lambda n: Register_LSX(AsmType.f64x2, "vr" + str(n)) +xr = lambda n: Register_LSX(AsmType.f64x4, "xr" + str(n)) class MemoryAddress_LSX(MemoryAddress): - - def __init__(self, - base: Register, - disp: int, - index: Register = None, - scaling: int = None) -> None: + + def __init__( + self, base: Register, disp: int, index: Register = None, scaling: int = None + ) -> None: self.base = base self.disp = disp self.index = index @@ -59,19 +56,14 @@ def __init__(self, @property def ugly(self): - #if self.index is None: + # if self.index is None: # return f"{self.disp}({self.base.ugly})" - #return f"{self.disp}({self.base.ugly},{self.index.ugly},{self.scaling})" + # return f"{self.disp}({self.base.ugly},{self.index.ugly},{self.scaling})" return f"{self.base.ugly},{self.disp}" - + def registers(self): return [self.base, self.index] + def mem(base, offset, index=None, scaling=None): return MemoryAddress_LSX(base, offset, index, scaling) - - - - - - diff --git 
a/pypspamm/codegen/architectures/rvv/blocksize.py b/pypspamm/codegen/architectures/rvv/blocksize.py index be67492..9d06990 100644 --- a/pypspamm/codegen/architectures/rvv/blocksize.py +++ b/pypspamm/codegen/architectures/rvv/blocksize.py @@ -5,11 +5,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bm = v_size bn = 1 - for j in range(1, n+1): + for j in range(1, n + 1): if cls.RVV_condition(bm, j, bk, v_size): bn = j - while cls.RVV_condition(bm, bn, bk+1, v_size): + while cls.RVV_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -18,7 +18,8 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def RVV_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 and bn*bk + 2 <= 32 + return (bn + bk) * vm <= 32 and bn * bk + 2 <= 32 + class CubeBn: @classmethod @@ -29,11 +30,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): maxval = 0 - for j in range(1, n+1): + for j in range(1, n + 1): for k in range(1, 200): if cls.RVV_condition(bm, j, k, v_size): - if j*k >= maxval: - maxval = j*k + if j * k >= maxval: + maxval = j * k bn = j bk = k @@ -43,6 +44,7 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def RVV_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 and bn*bk + 2 <= 32 + return (bn + bk) * vm <= 32 and bn * bk + 2 <= 32 + Default = MaxBn diff --git a/pypspamm/codegen/architectures/rvv/generator.py b/pypspamm/codegen/architectures/rvv/generator.py index 51503a4..8bb3124 100644 --- a/pypspamm/codegen/architectures/rvv/generator.py +++ b/pypspamm/codegen/architectures/rvv/generator.py @@ -1,10 +1,9 @@ -from pypspamm.cursors import * - from pypspamm.codegen.architectures.rvv.operands import * from pypspamm.codegen.ast import * -from pypspamm.codegen.sugar import * from pypspamm.codegen.generator import * from pypspamm.codegen.precision import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * class 
Generator(AbstractGenerator): @@ -24,7 +23,7 @@ class Generator(AbstractGenerator): """ is_sparse = False - v_len = 1 # vector register length: v_len * 128 bit + v_len = 1 # vector register length: v_len * 128 bit predicates = {} def get_v_size(self): @@ -35,34 +34,49 @@ def get_precision(self): def get_template(self): return self.template - + def use_broadcast(self): return False def has_masks(self): - return False # not yet + return False # not yet - def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Register_RV: + def pred_n_trues( + self, num_trues: int, v_size: int, suffix: str = None + ) -> Register_RV: return None # is called at most one time in matmul.py def set_sparse(self): self.is_sparse = True - + def make_argument_load(self, starting_regs, prefetch): asm = block("Load arguments") - asm.add(ld(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - asm.add(ld(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(ld(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) - asm.add(ld(InputOperand(f'3', 'm', 'alpha'), starting_regs[3], False)) - asm.add(ld(InputOperand(f'4', 'm', 'beta'), starting_regs[4], False)) + asm.add(ld(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(ld(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(ld(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(ld(InputOperand(f"3", "m", "alpha"), starting_regs[3], False)) + asm.add(ld(InputOperand(f"4", "m", "beta"), starting_regs[4], False)) if prefetch: - asm.add(ld(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) + asm.add(ld(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) return asm - def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: int, n: int, k: int, prefetch: str): - vm = self.ceil_div(bm, v_size) # vm can be 0 if bm < v_size -> makes ceil_div necessary - + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + 
nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): + vm = self.ceil_div( + bm, v_size + ) # vm can be 0 if bm < v_size -> makes ceil_div necessary + assert bn * bk + 2 <= 32 assert (bn + bk) * vm <= 32 @@ -75,7 +89,9 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i A_regs = Matrix([[v(vm * c + r) for c in range(bk)] for r in range(vm)]) B_regs = Matrix([[f(bn * r + c + 2) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[v(32 - vm * bn + vm * c + r) for c in range(bn)] for r in range(vm)]) + C_regs = Matrix( + [[v(32 - vm * bn + vm * c + r) for c in range(bn)] for r in range(vm)] + ) b_reg = 0 alpha_reg = [f(0), f(0)] @@ -92,27 +108,29 @@ def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: i prefetch_reg = prefetch is not None - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs, prefetch_reg - - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + mask_regs, + prefetch_reg, + ) + + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: asm = block("No register based scaling") return asm - def init_mask(self, - m: int, - bm: int, - v_size: int, - tempreg, - maskreg - ) -> Block: + def init_mask(self, m: int, bm: int, v_size: int, tempreg, maskreg) -> Block: asm = block("No register based scaling") return asm - + def init_block(self, size): if size < 32: return rvsetvl(x(0), size) @@ -122,21 +140,22 @@ def init_block(self, size): asm.add(rvsetvl(x(0), x(5))) return asm - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: 
Cursor = None, - pf_cursor_ptr: CursorLocation = None, - is_B: bool = False - ) -> Block: + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + is_B: bool = False, + ) -> Block: rows, cols = registers.shape action = "Store" if store else "Load" @@ -146,11 +165,15 @@ def move_register_block(self, b_row, b_col, i, _ = cursor.get_block(cursor_ptr, block_offset) cur11 = 0 - #TODO: figure out appropriate threshold (the 16 // self.v_len may still not be optimal; especially if 16 % self.v_len != 0, e.g. 384 bit) - threshold = 1 if self.is_sparse else (16 // self.v_len) # uses whole 256 byte cache line, as one SVE-512 vector = 64 bytes + # TODO: figure out appropriate threshold (the 16 // self.v_len may still not be optimal; especially if 16 % self.v_len != 0, e.g. 384 bit) + threshold = ( + 1 if self.is_sparse else (16 // self.v_len) + ) # uses whole 256 byte cache line, as one SVE-512 vector = 64 bytes # DONE if another CPU implements SVE at VL != 64 bytes, rewrite mul_vl (maybe do this dynamically) - mul_vl = 16 * self.v_len # e.g. A64FX has VL of 64 bytes in memory (thus, use v_len==4) + mul_vl = ( + 16 * self.v_len + ) # e.g. 
A64FX has VL of 64 bytes in memory (thus, use v_len==4) max_mem_ins_mult = 0 max_offset = 0 # ld1d/st1d instruction encodes the immediate offset using 4 bits, multiplies it with MUL VL @@ -162,18 +185,36 @@ def move_register_block(self, for ic in range(cols): for ir in range(rows): if (mask is None) or (mask[ir, ic]): - all_coords = [Coords(down=ir*v_size+i,right=ic) for i in range(process_size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] + all_coords = [ + Coords(down=ir * v_size + i, right=ic) + for i in range(process_size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] if not any(has_nonzero): continue elif any(has_nonzero) and not all(has_nonzero) and not is_B: - raise NotImplementedError("Element-wise sparsity in A is not yet implemented.") + raise NotImplementedError( + "Element-wise sparsity in A is not yet implemented." + ) processed = ir * process_size if processed >= b_row: continue - p = self.pred_n_trues(min(b_row - processed, process_size), v_size) if not is_B else self.pred_n_trues(process_size, v_size) - p_zeroing = self.pred_n_trues(min(b_row - processed, process_size), v_size, "z") if not is_B else self.pred_n_trues(process_size, v_size, "z") + p = ( + self.pred_n_trues(min(b_row - processed, process_size), v_size) + if not is_B + else self.pred_n_trues(process_size, v_size) + ) + p_zeroing = ( + self.pred_n_trues( + min(b_row - processed, process_size), v_size, "z" + ) + if not is_B + else self.pred_n_trues(process_size, v_size, "z") + ) cell_offset = Coords(down=ir * v_size, right=ic) # addr = base "pointer" + relative offset in bytes @@ -183,7 +224,7 @@ def move_register_block(self, offset = addr.disp - prev_disp # count how many elements we have processed between last step and this step - cont_counter = (offset // mul_vl) + cont_counter = offset // mul_vl larger_max_offset = cont_counter > max_mem_ins_mult 
non_dividing_offset = offset % mul_vl != 0 @@ -193,29 +234,84 @@ def move_register_block(self, if larger_max_offset or addr.disp > 0 or non_dividing_offset: offset_comment = f"move to new vector" - if offset < 2048 and offset >= -2048 and prev_base == additional_regs[0]: + if ( + offset < 2048 + and offset >= -2048 + and prev_base == additional_regs[0] + ): asm.add(add(offset, additional_regs[0], offset_comment)) else: - asm.add(add(addr.disp, additional_regs[0], offset_comment, addr.base)) + asm.add( + add( + addr.disp, + additional_regs[0], + offset_comment, + addr.base, + ) + ) prev_disp = addr.disp addr.base = additional_regs[0] addr.disp = 0 prev_base = additional_regs[0] if store: - asm.add(st(registers[ir, ic], addr, True, comment, pred=p, scalar_offs=False, - add_reg=additional_regs[2])) + asm.add( + st( + registers[ir, ic], + addr, + True, + comment, + pred=p, + scalar_offs=False, + add_reg=additional_regs[2], + ) + ) # perform prefetching after a store instruction, similar to KNL case if prefetching: - addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) + addr, comment = pf_cursor.look( + pf_cursor_ptr, block_offset, cell_offset + ) addr.disp += self.precision.size() * load_offset if prev_disp > 0: - asm.add(add(prev_disp, additional_regs[3], "increment the prefetch register", addr.base)) - asm.add(prefetch(mem(additional_regs[3] if prev_disp > 0 else addr.base, addr.disp - prev_disp), - "", p, prec, access_type="r", closeness="L2", temporality="KEEP")) + asm.add( + add( + prev_disp, + additional_regs[3], + "increment the prefetch register", + addr.base, + ) + ) + asm.add( + prefetch( + mem( + ( + additional_regs[3] + if prev_disp > 0 + else addr.base + ), + addr.disp - prev_disp, + ), + "", + p, + prec, + access_type="r", + closeness="L2", + temporality="KEEP", + ) + ) else: - asm.add(ld(addr, registers[ir, ic], True, comment, pred=p_zeroing, is_B=is_B, scalar_offs=False, - add_reg=additional_regs[2])) + asm.add( + ld( + addr, + 
registers[ir, ic], + True, + comment, + pred=p_zeroing, + is_B=is_B, + scalar_offs=False, + add_reg=additional_regs[2], + ) + ) return asm @@ -230,26 +326,26 @@ def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block return asm - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size: int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. 
""" asm = block("Block GEMM microkernel") @@ -258,8 +354,14 @@ def make_microkernel(self, bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) # tell sparse_mask() that we use sve - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) + mask = sparse_mask( + A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True + ) + asm.add( + self.move_register_block( + A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False + ) + ) bs = [] cur11 = -10000 @@ -276,8 +378,10 @@ def make_microkernel(self, for bni in range(bn): # inside this n-block for bki in range(bk): # inside this k-block to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) if B_regs[bki, bni] not in bs: @@ -287,25 +391,55 @@ def make_microkernel(self, if moved > 0 and moved <= max_offs: B_cell_addr.disp = moved else: - asm.add(add(B_cell_addr.disp, additional_regs[0], "", B_cell_addr.base)) + asm.add( + add( + B_cell_addr.disp, + additional_regs[0], + "", + B_cell_addr.base, + ) + ) cur11 = B_cell_addr.disp B_cell_addr.disp = 0 B_cell_addr.base = additional_regs[0] - - asm.add(ld(B_cell_addr, B_regs[bki, bni], False, B_comment, pred=None, is_B=True)) + + asm.add( + ld( + B_cell_addr, + B_regs[bki, bni], + False, + B_comment, + pred=None, + is_B=True, + ) + ) bs.append(B_regs[bki, bni]) for bki in range(bk): # inside this k-block for Vmi in range(Vm): p_merging = self.pred_n_trues(bm - Vmi * v_size, v_size, "m") - end_index = bm if Vmi + 1 == Vm else Vmi * v_size + v_size # end_index 
helps us print the right index ranges + end_index = ( + bm if Vmi + 1 == Vm else Vmi * v_size + v_size + ) # end_index helps us print the right index ranges for bni in range(bn): # inside this n-block to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): _, B_comment = B.look(B_ptr, to_B_block, to_bcell) comment = f"C[{Vmi * v_size}:{end_index},{bni}] += A[{Vmi * v_size}:{end_index},{bki}]*{B_comment}" - - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging, bcast=True, sub=sub)) + + asm.add( + fma( + B_regs[bki, bni], + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + pred=p_merging, + bcast=True, + sub=sub, + ) + ) return asm diff --git a/pypspamm/codegen/architectures/rvv/inlineprinter.py b/pypspamm/codegen/architectures/rvv/inlineprinter.py index 8c0351f..234a68c 100644 --- a/pypspamm/codegen/architectures/rvv/inlineprinter.py +++ b/pypspamm/codegen/architectures/rvv/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List + from pypspamm.codegen.ast import * -from pypspamm.codegen.visitor import Visitor from pypspamm.codegen.operands import * from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -26,7 +27,12 @@ def __init__(self, precision: Precision): Precision.BFLOAT16: "h", }[self.precision] - assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) + assert precision in ( + Precision.BFLOAT16, + Precision.HALF, + Precision.SINGLE, + Precision.DOUBLE, + ) def to_addi(self, value): ADDILENGTH = 12 @@ -103,7 +109,9 @@ def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 
0: # avoid 0 instructions return - if isinstance(stmt.src, Constant) and (stmt.src.value > 2047 or stmt.src.value < -2048): + if isinstance(stmt.src, Constant) and ( + stmt.src.value > 2047 or stmt.src.value < -2048 + ): # we need an intermediate register here # TODO: do not hard-code x5 here, make well-defined @@ -113,18 +121,32 @@ def visitAdd(self, stmt: AddStmt): addival, luival = self.to_addi(-stmt.src.value) else: addival, luival = self.to_addi(stmt.src.value) - self.addLine(f"lui {itmp}, {luival}", f"Intermediate add: place upper 12 bits of {stmt.src.value}") + self.addLine( + f"lui {itmp}, {luival}", + f"Intermediate add: place upper 12 bits of {stmt.src.value}", + ) if addival != 0: - self.addLine(f"addi {itmp}, {itmp}, {addival}", f"Intermediate add: place lower 12 bits of {stmt.src.value}") + self.addLine( + f"addi {itmp}, {itmp}, {addival}", + f"Intermediate add: place lower 12 bits of {stmt.src.value}", + ) if stmt.src.value < 0: - self.addLine(f"sub {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment) + self.addLine( + f"sub {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment + ) else: - self.addLine(f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment) + self.addLine( + f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment + ) else: # if stmt.src is a Constant but outside of the above range of value < -2048 or value > 2047 # we can simply add the Constant to a register - accumulate = stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly - self.addLine(f"addi {stmt.dest.ugly}, {accumulate}, {stmt.src.ugly}", stmt.comment) + accumulate = ( + stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly + ) + self.addLine( + f"addi {stmt.dest.ugly}, {accumulate}, {stmt.src.ugly}", stmt.comment + ) def visitLabel(self, stmt: LabelStmt): s = f"{stmt.label.ugly}:" @@ -143,12 +165,20 @@ def visitMov(self, stmt: MovStmt): self.addLine(f"vmv.v.i {stmt.dest.ugly}, {stmt.src.ugly}", stmt.comment) else: if 
stmt.src.value < 2**12: - self.addLine(f"addi {stmt.dest.ugly}, x0, {stmt.src.value}", stmt.comment) + self.addLine( + f"addi {stmt.dest.ugly}, x0, {stmt.src.value}", stmt.comment + ) elif stmt.src.value < 2**32: addival, luival = self.to_addi(stmt.src.value) - self.addLine(f"lui {stmt.dest.ugly}, {luival}", "Intermediate mov: place upper 12 bits") + self.addLine( + f"lui {stmt.dest.ugly}, {luival}", + "Intermediate mov: place upper 12 bits", + ) if addival != 0: - self.addLine(f"addi {stmt.dest.ugly}, {stmt.dest.ugly}, {addival}", stmt.comment) + self.addLine( + f"addi {stmt.dest.ugly}, {stmt.dest.ugly}, {addival}", + stmt.comment, + ) else: raise NotImplementedError() elif isinstance(stmt.src, Register): @@ -188,11 +218,11 @@ def visitStore(self, stmt: StoreStmt): self.addLine(s, stmt.comment) def visitPrefetch(self, stmt: PrefetchStmt): - s = f'prefetch.r {stmt.dest.ugly}' + s = f"prefetch.r {stmt.dest.ugly}" self.addLine(s, stmt.comment) - + def visitRVSetVLStmt(self, stmt: RVSetVLStmt): - opcode = 'setivli' if isinstance(stmt.requested, Constant) else 'setvli' + opcode = "setivli" if isinstance(stmt.requested, Constant) else "setvli" s = f"v{opcode} {stmt.actual.ugly}, {stmt.requested.ugly}, e{self.precision.size() * 8}" self.addLine(s, stmt.comment) @@ -209,7 +239,7 @@ def visitBlock(self, block: Block): def p_string(self, predicate: Register): # returns "pk{/z or /m}, " or an empty string "" with contents in {} being optional # at this point the contents are already generated, we simply turn them into a string - return f', {predicate}' if predicate is not None else "" + return f", {predicate}" if predicate is not None else "" def render(s: AsmStmt): diff --git a/pypspamm/codegen/architectures/rvv/operands.py b/pypspamm/codegen/architectures/rvv/operands.py index 603b227..9322130 100644 --- a/pypspamm/codegen/architectures/rvv/operands.py +++ b/pypspamm/codegen/architectures/rvv/operands.py @@ -12,6 +12,7 @@ class Constant_RV(Constant): def ugly(self): return 
str(self.value) + def c(n): """Sugar for conveniently defining integer constants""" return Constant_RV(value=int(n)) @@ -38,16 +39,12 @@ def ugly_precision(self): @property def ugly_lsl_shift(self): - return { - "d": 3, - "s": 2, - "h": 1 - }[self.ugly_precision] + return {"d": 3, "s": 2, "h": 1}[self.ugly_precision] @property def clobbered(self): # removed [this comment should stay here for now---in case there's some compiler expecting it]: .replace("x", "r") - return (self.value.split(".")[0]) + return self.value.split(".")[0] @property def ugly_scalar(self): @@ -55,7 +52,7 @@ def ugly_scalar(self): @property def ugly_scalar_1d(self): - #turns "Vn.2d" into "Dn" + # turns "Vn.2d" into "Dn" return (self.value.split(".")[0]).replace("v", "d") @@ -63,13 +60,14 @@ def ugly_scalar_1d(self): f = lambda n: Register_RV(AsmType.f64, "f" + str(n)) v = lambda n: Register_RV(AsmType.f64x8, "v" + str(n)) + class MemoryAddress_RV(MemoryAddress): @property def ugly(self): if self.disp == 0: - return f'({self.base.ugly})' + return f"({self.base.ugly})" else: - return f'{self.disp}({self.base.ugly})' + return f"{self.disp}({self.base.ugly})" @property def clobbered(self): diff --git a/pypspamm/codegen/ast.py b/pypspamm/codegen/ast.py index 8044a3b..c7f672d 100644 --- a/pypspamm/codegen/ast.py +++ b/pypspamm/codegen/ast.py @@ -1,5 +1,5 @@ +from typing import TYPE_CHECKING, List -from typing import List, TYPE_CHECKING from pypspamm.codegen.operands import * if TYPE_CHECKING: @@ -11,47 +11,68 @@ class AsmStmt: def accept(self, visitor: "Visitor"): raise Exception("AsmStmt is supposed to be abstract") - + def reg_in_candidate(self): return () - + def reg_out_candidate(self): return () - + def regs_in(self): - return set(reg for regc in self.reg_in_candidate() if regc is not None for reg in regc.registers() if isinstance(reg, Register)) + return set( + reg + for regc in self.reg_in_candidate() + if regc is not None + for reg in regc.registers() + if isinstance(reg, Register) + ) def 
regs_out(self): - return set(reg for regc in self.reg_out_candidate() if regc is not None for reg in regc.registers() if isinstance(reg, Register)) - + return set( + reg + for regc in self.reg_out_candidate() + if regc is not None + for reg in regc.registers() + if isinstance(reg, Register) + ) + def regs(self): return self.regs_in() | self.regs_out() - + def args_in(self): - return set(reg for reg in self.reg_in_candidate() if reg is not None and isinstance(reg, InputOperand)) + return set( + reg + for reg in self.reg_in_candidate() + if reg is not None and isinstance(reg, InputOperand) + ) def args_out(self): - return set(reg for reg in self.reg_out_candidate() if reg is not None and isinstance(reg, InputOperand)) - + return set( + reg + for reg in self.reg_out_candidate() + if reg is not None and isinstance(reg, InputOperand) + ) + def barrier(self): return False - + def args(self): return self.args_in() | self.args_out() - + def normalize(self): yield self - + def flatten(self): yield self - + def stmtname(self): - return '???' - + return "???" 
+ def __str__(self): - inregs = ', '.join(reg.ugly for reg in self.regs_in()) - outregs = ', '.join(reg.ugly for reg in self.regs_out()) - return f'{self.stmtname()} {inregs} -> {outregs}' + inregs = ", ".join(reg.ugly for reg in self.regs_in()) + outregs = ", ".join(reg.ugly for reg in self.regs_out()) + return f"{self.stmtname()} {inregs} -> {outregs}" + class GenericStmt(AsmStmt): operation = None @@ -71,15 +92,16 @@ class MovStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitMov(self) - + def reg_in_candidate(self): - return (self.src,self.temp,self.pred) - + return (self.src, self.temp, self.pred) + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'mov' + return "mov" + class LeaStmt(AsmStmt): src = None @@ -91,15 +113,16 @@ class LeaStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitLea(self) - + def reg_in_candidate(self): - return (self.src,self.pred) - + return (self.src, self.pred) + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'lea' + return "lea" + class LoadStmt(AsmStmt): src = None @@ -117,15 +140,16 @@ class LoadStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitLoad(self) - + def reg_in_candidate(self): - return (self.src,self.pred,self.add_reg) - + return (self.src, self.pred, self.add_reg) + def reg_out_candidate(self): return (self.dest, self.dest2, self.dest3, self.dest4) - + def stmtname(self): - return 'load' + return "load" + class StoreStmt(AsmStmt): src = None @@ -142,15 +166,16 @@ class StoreStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitStore(self) - + def reg_in_candidate(self): return (self.src, self.src2, self.src3, self.src4, self.pred, self.add_reg) - + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'store' + return "store" + class PrefetchStmt(AsmStmt): dest = None @@ -161,9 +186,9 @@ class PrefetchStmt(AsmStmt): def accept(self, visitor: "Visitor"): 
visitor.visitPrefetch(self) - + def stmtname(self): - return 'prefetch' + return "prefetch" class FmaStmt(AsmStmt): @@ -176,15 +201,16 @@ class FmaStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitFma(self) - + def reg_in_candidate(self): return (self.add_dest, self.bcast_src, self.mult_src, self.pred) - + def reg_out_candidate(self): return (self.add_dest,) - + def stmtname(self): - return 'fma' + return "fma" + class MulStmt(AsmStmt): src = None @@ -194,15 +220,16 @@ class MulStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitMul(self) - + def reg_in_candidate(self): - return (self.mult_src,self.src,self.pred) - + return (self.mult_src, self.src, self.pred) + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'mul' + return "mul" + class BcstStmt(AsmStmt): bcast_src = None @@ -211,15 +238,19 @@ class BcstStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitBcst(self) - + def reg_in_candidate(self): - return (self.bcast_src,self.pred,) - + return ( + self.bcast_src, + self.pred, + ) + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'broadcast' + return "broadcast" + class AddStmt(AsmStmt): src = None @@ -233,15 +264,16 @@ def accept(self, visitor: "Visitor"): def reg_in_candidate(self): if self.additional is not None: - return (self.src,self.dest,self.additional,self.pred) + return (self.src, self.dest, self.additional, self.pred) else: - return (self.src,self.dest,self.pred) - + return (self.src, self.dest, self.pred) + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'add' + return "add" + class CmpStmt(AsmStmt): lhs = None @@ -250,21 +282,22 @@ class CmpStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitCmp(self) - + def reg_in_candidate(self): - return (self.lhs,self.rhs,self.pred) - + return (self.lhs, self.rhs, self.pred) + def stmtname(self): - return 'cmp' + return "cmp" + class LabelStmt(AsmStmt): 
label = None def accept(self, visitor: "Visitor"): visitor.visitLabel(self) - + def __str__(self): - return f'Label: {self.label.ugly}' + return f"Label: {self.label.ugly}" class JumpStmt(AsmStmt): @@ -273,12 +306,13 @@ class JumpStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitJump(self) - + def reg_in_candidate(self): return (self.cmpreg,) def stmtname(self): - return 'branch' + return "branch" + class DataStmt(AsmStmt): value = None @@ -287,48 +321,59 @@ class DataStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitData(self) + class RVSetVLStmt(AsmStmt): actual = None requested = None def accept(self, visitor: "Visitor"): visitor.visitRVSetVLStmt(self) - + def reg_in_candidate(self): return (self.requested,) - + def reg_out_candidate(self): return (self.actual,) - + def barrier(self): return True + class Block(AsmStmt): contents = [] def accept(self, visitor: "Visitor"): visitor.visitBlock(self) - + def normalize(self): - return (subcontent for content in self.contents for subcontent in content.normalize()) - + return ( + subcontent + for content in self.contents + for subcontent in content.normalize() + ) + def flatten(self): - return (subcontent for content in self.contents for subcontent in content.flatten()) - + return ( + subcontent for content in self.contents for subcontent in content.flatten() + ) + def regs_in(self): regs = set() for instr in self.contents: regs |= instr.regs_in() return regs - + def regs_out(self): regs = set() for instr in self.contents: regs |= instr.regs_out() return regs - + def __str__(self): - return 'block {\n' + '\n'.join(str(content) for content in self.contents) + '\n}' + return ( + "block {\n" + "\n".join(str(content) for content in self.contents) + "\n}" + ) + class Command(AsmStmt): name = None diff --git a/pypspamm/codegen/ccode.py b/pypspamm/codegen/ccode.py index ee14125..a1defaf 100644 --- a/pypspamm/codegen/ccode.py +++ b/pypspamm/codegen/ccode.py @@ -1,12 +1,22 @@ -from 
pypspamm.codegen.ast import * +import pypspamm.architecture from pypspamm.codegen.analysis import * +from pypspamm.codegen.ast import * from pypspamm.codegen.precision import * -import pypspamm.architecture - -def make_cfunc(funcName:str, template:str, body:Block, flop:int, starting_regs:List[Register], precision: Precision) -> str: - Printer_class = pypspamm.architecture.get_class("pypspamm.codegen.architectures." + pypspamm.architecture.arch + ".inlineprinter").InlinePrinter +def make_cfunc( + funcName: str, + template: str, + body: Block, + flop: int, + starting_regs: List[Register], + precision: Precision, +) -> str: + Printer_class = pypspamm.architecture.get_class( + "pypspamm.codegen.architectures." + + pypspamm.architecture.arch + + ".inlineprinter" + ).InlinePrinter printer = Printer_class(precision) printer.lmargin = 4 @@ -15,16 +25,21 @@ def make_cfunc(funcName:str, template:str, body:Block, flop:int, starting_regs:L analyzer = Analyzer(starting_regs) analyzer.collect(body) - regs = set(f'"{reg.clobbered}"' for reg in analyzer.clobbered_registers if reg.clobbered is not None) + regs = set( + f'"{reg.clobbered}"' + for reg in analyzer.clobbered_registers + if reg.clobbered is not None + ) regs.add('"memory"') regs.add('"cc"') # TODO: maybe regs.add('"redzone"') ? 
clobbered = ", ".join(sorted(regs)) arglist = ", ".join(sorted(arg.arg for arg in analyzer.input_operands)) - return template.format(funcName = funcName, - body_text = body_text, - args = arglist, - clobbered = clobbered, - flop = flop, - real_type = Precision.getCType(precision)) - + return template.format( + funcName=funcName, + body_text=body_text, + args=arglist, + clobbered=clobbered, + flop=flop, + real_type=Precision.getCType(precision), + ) diff --git a/pypspamm/codegen/forms.py b/pypspamm/codegen/forms.py index dd9e50a..e5b7ff6 100644 --- a/pypspamm/codegen/forms.py +++ b/pypspamm/codegen/forms.py @@ -1,19 +1,22 @@ - from typing import List + from pypspamm.codegen.sugar import * + # TODO: We might eventually want to make this part of our syntax tree # in order to do unrolls and other fancy stuff with it class Loop(Block): _labels = [] - def __init__(self, - iteration_var: Register, - final_val: int, - body_contents: Block = None, - unroll: int = 1, - overlap: bool = False - ) -> None: + + def __init__( + self, + iteration_var: Register, + final_val: int, + body_contents: Block = None, + unroll: int = 1, + overlap: bool = False, + ) -> None: self.iteration_var = iteration_var self.final_val = final_val @@ -21,13 +24,13 @@ def __init__(self, self.unroll = unroll self.may_overlap = overlap - self.comment = f'loop {self.iteration_var.ugly} in range({self.final_val}), unroll {self.unroll}' + self.comment = f"loop {self.iteration_var.ugly} in range({self.final_val}), unroll {self.unroll}" @property def contents(self): self.label = "loop_top_" + str(len(Loop._labels)) Loop._labels.append(self.label) - + onestep = [*(self.body_contents.contents)] body = [] rest = [] @@ -36,16 +39,24 @@ def contents(self): for _ in range(self.final_val % self.unroll): rest += onestep - + true_final_val = (self.final_val // self.unroll) * self.unroll allcode = [] if true_final_val == self.unroll: allcode += body elif true_final_val > self.unroll: - allcode += [mov(-true_final_val, 
self.iteration_var, vector=False), - label(self.label)] + body + [add(self.unroll, self.iteration_var), - jump(self.label, self.iteration_var, backwards=True)] + allcode += ( + [ + mov(-true_final_val, self.iteration_var, vector=False), + label(self.label), + ] + + body + + [ + add(self.unroll, self.iteration_var), + jump(self.label, self.iteration_var, backwards=True), + ] + ) allcode += rest return allcode @@ -53,43 +64,73 @@ def contents(self): def body(self, *args): self.body_contents = block("Loop body", *args) return self - + def normalize(self): - yield loop(self.iteration_var, self.final_val, self.unroll, self.may_overlap).body(*[substmt for stmt in self.body_contents.contents for substmt in stmt.normalize()]) - + yield loop( + self.iteration_var, self.final_val, self.unroll, self.may_overlap + ).body( + *[ + substmt + for stmt in self.body_contents.contents + for substmt in stmt.normalize() + ] + ) + def __str__(self): - return f'loop {self.iteration_var.ugly} in range({self.final_val}), unroll {self.unroll}' + '{\n' + '\n'.join(str(content) for content in self.body_contents.contents) + '\n}' + return ( + f"loop {self.iteration_var.ugly} in range({self.final_val}), unroll {self.unroll}" + + "{\n" + + "\n".join(str(content) for content in self.body_contents.contents) + + "\n}" + ) + def loop(iter_var, final_val, unroll=1, overlap=False): return Loop(iter_var, final_val, unroll=unroll, overlap=overlap) + class Skip(Block): _labels = [] - def __init__(self, - skipreg: Register - ) -> None: + + def __init__(self, skipreg: Register) -> None: self.skipreg = skipreg - self.comment = f'if {self.checkreg} != 0' + self.comment = f"if {self.checkreg} != 0" @property def contents(self): self.label = "skip_" + str(len(Loop._labels)) Loop._labels.append(self.label) - return [jump(self.label, self.skipreg, backwards=True)] + body + [label(self.label)] + return ( + [jump(self.label, self.skipreg, backwards=True)] + + body + + [label(self.label)] + ) def body(self, *args): 
self.body_contents = block("Skip body", *args) return self - + def normalize(self): - yield skip(self.checkreg).body(*[substmt for stmt in self.body_contents.contents for substmt in stmt.normalize()]) - + yield skip(self.checkreg).body( + *[ + substmt + for stmt in self.body_contents.contents + for substmt in stmt.normalize() + ] + ) + def __str__(self): - return f'if {self.checkreg} != 0' + '{\n' + '\n'.join(str(content) for content in self.body_contents.contents) + '\n}' + return ( + f"if {self.checkreg} != 0" + + "{\n" + + "\n".join(str(content) for content in self.body_contents.contents) + + "\n}" + ) + def skip(checkreg): return Skip(checkreg) diff --git a/pypspamm/codegen/generator.py b/pypspamm/codegen/generator.py index 70e782b..051a2c0 100644 --- a/pypspamm/codegen/generator.py +++ b/pypspamm/codegen/generator.py @@ -1,15 +1,17 @@ -from pypspamm.cursors import * +from abc import ABC, abstractmethod + from pypspamm.codegen.ast import * from pypspamm.codegen.precision import * -from abc import ABC, abstractmethod +from pypspamm.cursors import * + class AbstractGenerator(ABC): def __init__(self, precision: Precision): - self.precision = precision + self.precision = precision def get_precision(self): - return self.precision - + return self.precision + def set_sparse(self): pass @@ -38,20 +40,23 @@ def get_template(self): pass @abstractmethod - def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): + def make_reg_blocks( + self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: int, n: int, k: int + ): pass @abstractmethod - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False - ) -> Block: + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, 
+ mask: Matrix[bool] = None, + store: bool = False, + ) -> Block: pass @abstractmethod @@ -59,18 +64,19 @@ def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block pass @abstractmethod - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size:int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: pass diff --git a/pypspamm/codegen/operands.py b/pypspamm/codegen/operands.py index 52468f7..c470c17 100644 --- a/pypspamm/codegen/operands.py +++ b/pypspamm/codegen/operands.py @@ -1,13 +1,29 @@ from enum import Enum -from typing import List, Dict +from typing import Dict, List + +AsmType = Enum( + "AsmType", + [ + "unknown", + "i8", + "i16", + "i32", + "i64", + "f32", + "f64", + "f32x4", + "f32x8", + "f32x16", + "f64x2", + "f64x4", + "f64x8", + "p64x8", + ], +) + +RegisterType = Enum("RegisterType", ["undefined", "scalar", "vector", "predicate"]) -AsmType = Enum('AsmType', ['unknown','i8','i16','i32','i64','f32','f64', - 'f32x4','f32x8','f32x16','f64x2','f64x4','f64x8', - 'p64x8']) - -RegisterType = Enum('RegisterType', ['undefined', 'scalar', 'vector', 'predicate']) - class Operand: @property def ugly(self): @@ -16,20 +32,23 @@ def ugly(self): def registers(self): return [] + # TODO: Rename this 'Immediate' class Constant(Operand): - def __init__(self, value:int) -> None: + def __init__(self, value: int) -> None: self.value = value @property def ugly(self): raise NotImplementedError() + class Label(Operand): _interns = {} _last = -1 + def __init__(self, 
value) -> None: - assert(isinstance(value, str)) + assert isinstance(value, str) self.value = value if value in Label._interns: self.ordinal = Label._interns[value] @@ -42,12 +61,13 @@ def __init__(self, value) -> None: def ugly(self): raise NotImplementedError() + class Register(Operand): def __init__(self, typeinfo, value) -> None: self.typeinfo = typeinfo self.value = str(value) - + def size(self): if self.typeinfo == AsmType.i8: return 1 @@ -81,41 +101,41 @@ def ugly(self): @property def clobbered(self): return self.value - + def registers(self): return [self] - + def __eq__(self, other): return self.ugly == other.ugly - + def __hash__(self): return hash(self.ugly) + class MemoryAddress(Operand): - def __init__(self, - base, - disp) -> None: + def __init__(self, base, disp) -> None: self.base = base self.disp = disp @property def ugly(self): raise NotImplementedError() - + def registers(self): return [self.base] + class InputOperand(Operand): def __init__(self, name, optype, source): self.name = str(name) self.optype = optype self.source = source - + @property def ugly(self): - return f'%{self.name}' - + return f"%{self.name}" + @property def arg(self): return f'"{self.optype}"({self.source})' diff --git a/pypspamm/codegen/precision.py b/pypspamm/codegen/precision.py index 417c9a6..2c5d125 100644 --- a/pypspamm/codegen/precision.py +++ b/pypspamm/codegen/precision.py @@ -1,31 +1,31 @@ from enum import Enum + class Precision(Enum): - DOUBLE = 8 - SINGLE = 4 - HALF = 2 - BFLOAT16 = 2.1 + DOUBLE = 8 + SINGLE = 4 + HALF = 2 + BFLOAT16 = 2.1 + + @classmethod + def getCType(cls, precision): + ctype = { + cls.DOUBLE: "double", + cls.SINGLE: "float", + cls.HALF: "uint16_t", + cls.BFLOAT16: "uint16_t", + } + return ctype[precision] - @classmethod - def getCType(cls, precision): - ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float', cls.HALF: 'uint16_t', cls.BFLOAT16: 'uint16_t'} - return ctype[precision] - - def ctype(self): - return self.getCType(self) + def 
ctype(self): + return self.getCType(self) - def size(self): - return { - self.DOUBLE: 8, - self.SINGLE: 4, - self.HALF: 2, - self.BFLOAT16: 2 - }[self] - raise NotImplementedError() - - def __repr__(self): - return self.getCType(self) + def size(self): + return {self.DOUBLE: 8, self.SINGLE: 4, self.HALF: 2, self.BFLOAT16: 2}[self] + raise NotImplementedError() - def __str__(self): - return self.getCType(self) + def __repr__(self): + return self.getCType(self) + def __str__(self): + return self.getCType(self) diff --git a/pypspamm/codegen/prune.py b/pypspamm/codegen/prune.py index fb8aa0e..cd1f2c3 100644 --- a/pypspamm/codegen/prune.py +++ b/pypspamm/codegen/prune.py @@ -1,13 +1,18 @@ from .ast import * -from .operands import * from .forms import * +from .operands import * + def prune(block, toplevel=True): pruned = [] cached = [] for instr in block: - if isinstance(instr, AddStmt) and isinstance(instr.src, Constant) and instr.additional is None: + if ( + isinstance(instr, AddStmt) + and isinstance(instr.src, Constant) + and instr.additional is None + ): combinedValue = instr.src.value for i, cinstr in enumerate(cached): if cinstr.dest == instr.dest: @@ -19,9 +24,11 @@ def prune(block, toplevel=True): pruned += cached cached = [] if isinstance(instr, Loop): - instr.body_contents.contents = prune(instr.body_contents.contents, False) + instr.body_contents.contents = prune( + instr.body_contents.contents, False + ) pruned += [instr] - + if not toplevel: pruned += cached return pruned diff --git a/pypspamm/codegen/regcache.py b/pypspamm/codegen/regcache.py index 44bb0ae..f8e38ce 100644 --- a/pypspamm/codegen/regcache.py +++ b/pypspamm/codegen/regcache.py @@ -1,11 +1,10 @@ - class RegisterCache: def __init__(self, registers): self.access = 0 self.lru = [-1] * len(registers) self.registers = registers self.storage = {} - + def get(self, value): self.access += 1 diff --git a/pypspamm/codegen/schedule.py b/pypspamm/codegen/schedule.py index 1222832..3c4ae19 100644 --- 
a/pypspamm/codegen/schedule.py +++ b/pypspamm/codegen/schedule.py @@ -1,15 +1,29 @@ from .ast import * -from .operands import * from .forms import * +from .operands import * + def isStore(instr): - return isinstance(instr, StoreStmt) or (isinstance(instr, MovStmt) and isinstance(instr.dest, MemoryAddress)) + return isinstance(instr, StoreStmt) or ( + isinstance(instr, MovStmt) and isinstance(instr.dest, MemoryAddress) + ) + def isScalar(instr): - return isinstance(instr, AddStmt) or (isinstance(instr, MovStmt) and isinstance(instr.dest, Register) and instr.typ == AsmType.i64) + return isinstance(instr, AddStmt) or ( + isinstance(instr, MovStmt) + and isinstance(instr.dest, Register) + and instr.typ == AsmType.i64 + ) + def isLoad(instr): - return isinstance(instr, LoadStmt) or (isinstance(instr, MovStmt) and isinstance(instr.src, MemoryAddress)) or isinstance(instr, BcstStmt) + return ( + isinstance(instr, LoadStmt) + or (isinstance(instr, MovStmt) and isinstance(instr.src, MemoryAddress)) + or isinstance(instr, BcstStmt) + ) + def hasDependency(instr1, instr2, rrt=False): ww = instr1.regs_out() & instr2.regs_out() @@ -18,6 +32,7 @@ def hasDependency(instr1, instr2, rrt=False): rr = instr1.regs_in() & instr2.regs_in() return len(ww) > 0 or len(wr) > 0 or len(rw) > 0 or (rrt and len(rr) > 0) + def moveLoads(block, isLoop=False): preprocessed = [] for instr in block: @@ -28,15 +43,28 @@ def moveLoads(block, isLoop=False): if instr.final_val == 1: preprocessed += prelude + postlude elif instr.final_val > 1: - preprocessed += prelude + [loop(instr.iteration_var, instr.final_val - 1, instr.unroll).body(*inner)] + postlude + preprocessed += ( + prelude + + [ + loop( + instr.iteration_var, instr.final_val - 1, instr.unroll + ).body(*inner) + ] + + postlude + ) else: inner = moveLoads(instr.body_contents.contents, False) - preprocessed += [loop(instr.iteration_var, instr.final_val, instr.unroll).body(*inner)] + preprocessed += [ + loop(instr.iteration_var, instr.final_val, 
instr.unroll).body( + *inner + ) + ] else: preprocessed += [instr] - + return moveLoadsBlock(preprocessed, isLoop) + def moveLoadsBlock(block, isLoop): reordered = [] currentLoads = [] @@ -60,14 +88,18 @@ def addReorderedLoad(j): delta += newdelta reordered.append(loadInstr) return 1 + delta + def addDependentLoads(instr, i): j = 0 while j < len(currentLoads): loadInstr = currentLoads[j] # for now, include read-read dependencies here - if hasDependency(instr, loadInstr): # or too far away insertCounter[j] < i + 4 + if hasDependency( + instr, loadInstr + ): # or too far away insertCounter[j] < i + 4 j -= addReorderedLoad(j) j += 1 + def preponeLoad(instr, i): maxI = [] for loadInstr in reversed(currentLoads): @@ -85,7 +117,7 @@ def preponeLoad(instr, i): else: addDependentLoads(instr, len(reordered)) reordered.append(instr) - + if isLoop: # pass again, but ignore loads postlude = list(reversed(reordered)) @@ -99,13 +131,13 @@ def preponeLoad(instr, i): else: addDependentLoads(instr, len(reordered) + len(postlude)) reordered.append(instr) - + # add loads/scalar instructions that would be materialized over 2 iterations only (?) 
- for i,instr in enumerate(currentLoads): + for i, instr in enumerate(currentLoads): if i + len(prelude) >= len(currentLoads): break reordered.append(instr) - + return prelude, list(reversed(reordered)), postlude else: for loadInstr in currentLoads: diff --git a/pypspamm/codegen/sugar.py b/pypspamm/codegen/sugar.py index 5f90ccd..dc33495 100644 --- a/pypspamm/codegen/sugar.py +++ b/pypspamm/codegen/sugar.py @@ -1,25 +1,42 @@ from typing import Union +import pypspamm.architecture from pypspamm.codegen.ast import * from pypspamm.codegen.operands import * -import pypspamm.architecture # Convenient statement constructors -def add(src: Union[Operand, int], dest: Register, comment: str = None, additional: Register = None): +def add( + src: Union[Operand, int], + dest: Register, + comment: str = None, + additional: Register = None, +): stmt = AddStmt() - stmt.src = src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + stmt.src = ( + src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + ) stmt.dest = dest stmt.comment = comment stmt.additional = additional return stmt + def label(name: str): stmt = LabelStmt() stmt.label = pypspamm.architecture.operands.l(name) return stmt -def fma(bcast_src: Register, mult_src: Register, add_dest: Register, comment: str = None, bcast: Union[int, None] = None, pred: Register = None, sub=False): + +def fma( + bcast_src: Register, + mult_src: Register, + add_dest: Register, + comment: str = None, + bcast: Union[int, None] = None, + pred: Register = None, + sub=False, +): stmt = FmaStmt() stmt.bcast_src = bcast_src stmt.mult_src = mult_src @@ -31,7 +48,14 @@ def fma(bcast_src: Register, mult_src: Register, add_dest: Register, comment: st stmt.sub = sub return stmt -def mul(src: Register, mult_src: Register, dest: Register, comment: str = None, pred: Register = None): + +def mul( + src: Register, + mult_src: Register, + dest: Register, + comment: str = None, + pred: Register = None, +): stmt = 
MulStmt() stmt.src = src stmt.mult_src = mult_src @@ -41,6 +65,7 @@ def mul(src: Register, mult_src: Register, dest: Register, comment: str = None, stmt.pred = pred return stmt + def bcst(bcast_src: Register, dest: Register, comment: str = None): stmt = BcstStmt() stmt.bcast_src = bcast_src @@ -48,21 +73,38 @@ def bcst(bcast_src: Register, dest: Register, comment: str = None): stmt.comment = comment return stmt + def cmp(lhs: Union[Operand, int], rhs: Union[Operand, int]): stmt = CmpStmt() - stmt.lhs = lhs if isinstance(lhs, Operand) else pypspamm.architecture.operands.c(lhs) - stmt.rhs = rhs if isinstance(rhs, Operand) else pypspamm.architecture.operands.c(rhs) + stmt.lhs = ( + lhs if isinstance(lhs, Operand) else pypspamm.architecture.operands.c(lhs) + ) + stmt.rhs = ( + rhs if isinstance(rhs, Operand) else pypspamm.architecture.operands.c(rhs) + ) return stmt -def jump(label: str, cmpreg = None, backwards=True): + +def jump(label: str, cmpreg=None, backwards=True): stmt = JumpStmt() stmt.destination = pypspamm.architecture.operands.l(label) stmt.cmpreg = cmpreg return stmt -def mov(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, pred = None, expand=None, temp=None): + +def mov( + src: Union[Operand, int], + dest: Operand, + vector: bool, + comment: str = None, + pred=None, + expand=None, + temp=None, +): stmt = MovStmt() - stmt.src = src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + stmt.src = ( + src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + ) stmt.dest = dest stmt.comment = comment stmt.pred = pred @@ -76,7 +118,8 @@ def mov(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = Non stmt.typ = AsmType.i64 return stmt -def lea(src: Register, dest: Operand, offset: int, comment:str = None): + +def lea(src: Register, dest: Operand, offset: int, comment: str = None): stmt = LeaStmt() stmt.src = src stmt.dest = dest @@ -84,9 +127,26 @@ def lea(src: Register, dest: 
Operand, offset: int, comment:str = None): stmt.comment = comment return stmt -def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, dest2: Operand = None, pred: Register = None, is_B: bool = False, scalar_offs: bool = False, add_reg: AsmType.i64 = None, sub128: bool = False, expand=None, dest3: Operand = None, dest4: Operand = None): + +def ld( + src: Union[Operand, int], + dest: Operand, + vector: bool, + comment: str = None, + dest2: Operand = None, + pred: Register = None, + is_B: bool = False, + scalar_offs: bool = False, + add_reg: AsmType.i64 = None, + sub128: bool = False, + expand=None, + dest3: Operand = None, + dest4: Operand = None, +): stmt = LoadStmt() - stmt.src = src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + stmt.src = ( + src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + ) stmt.dest = dest stmt.dest2 = dest2 stmt.dest3 = dest3 @@ -110,9 +170,24 @@ def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None stmt.typ = AsmType.i64 return stmt -def st(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, src2: Operand = None, pred: Register = None, scalar_offs: bool = False, add_reg: AsmType.i64 = None, expand=None, src3: Operand=None, src4: Operand=None): + +def st( + src: Union[Operand, int], + dest: Operand, + vector: bool, + comment: str = None, + src2: Operand = None, + pred: Register = None, + scalar_offs: bool = False, + add_reg: AsmType.i64 = None, + expand=None, + src3: Operand = None, + src4: Operand = None, +): stmt = StoreStmt() - stmt.src = src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + stmt.src = ( + src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + ) stmt.src2 = src2 stmt.src3 = src3 stmt.src4 = src4 @@ -132,7 +207,16 @@ def st(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None stmt.typ = AsmType.i64 return stmt -def prefetch(dest: 
Operand, comment:str = None, pred: Register = None, precision: str = None, access_type: str = None, closeness: str = None, temporality: str = None): + +def prefetch( + dest: Operand, + comment: str = None, + pred: Register = None, + precision: str = None, + access_type: str = None, + closeness: str = None, + temporality: str = None, +): stmt = PrefetchStmt() stmt.dest = dest stmt.comment = comment @@ -144,18 +228,27 @@ def prefetch(dest: Operand, comment:str = None, pred: Register = None, precision stmt.temporality = temporality return stmt + def data(value: Union[Operand, int], asmType=AsmType.i64): stmt = DataStmt() - stmt.value = value if isinstance(value, Operand) else pypspamm.architecture.operands.c(value) + stmt.value = ( + value if isinstance(value, Operand) else pypspamm.architecture.operands.c(value) + ) stmt.asmType = asmType return stmt + def rvsetvl(actual: Register, requested: Union[Register, int]): stmt = RVSetVLStmt() stmt.actual = actual - stmt.requested = requested if isinstance(requested, Operand) else pypspamm.architecture.operands.c(requested) + stmt.requested = ( + requested + if isinstance(requested, Operand) + else pypspamm.architecture.operands.c(requested) + ) return stmt + # Fluent interface class BlockBuilder(Block): diff --git a/pypspamm/codegen/virtual.py b/pypspamm/codegen/virtual.py index a44de74..1507ada 100644 --- a/pypspamm/codegen/virtual.py +++ b/pypspamm/codegen/virtual.py @@ -1,8 +1,9 @@ from .operands import Register + class VirtualRegister(Register): def __init__(self, typeinfo, pool): - super().__init__(typeinfo, '') + super().__init__(typeinfo, "") self.register = None self.pool = pool @@ -16,30 +17,43 @@ def setRegister(register: Register): @property def ugly(self): - return self.register.ugly if self.register is not None else f'vreg{id(self)}' - + return self.register.ugly if self.register is not None else f"vreg{id(self)}" + @property def ugly_scalar_1d(self): - return self.register.ugly_scalar_1d if self.register is 
not None else f'vreg{id(self)}' - + return ( + self.register.ugly_scalar_1d + if self.register is not None + else f"vreg{id(self)}" + ) + @property def ugly_scalar(self): - return self.register.ugly_scalar if self.register is not None else f'vreg{id(self)}' - + return ( + self.register.ugly_scalar + if self.register is not None + else f"vreg{id(self)}" + ) + @property def ugly_xmm(self): - return self.register.ugly_xmm if self.register is not None else f'vreg{id(self)}' - + return ( + self.register.ugly_xmm if self.register is not None else f"vreg{id(self)}" + ) + @property def clobbered(self): - return self.register.clobbered if self.register is not None else f'vreg{id(self)}' - + return ( + self.register.clobbered if self.register is not None else f"vreg{id(self)}" + ) + def firstUsage(self): return None if len(self.usage) == 0 else self.usage[0] - + def lastUsage(self): return None if len(self.usage) == 0 else self.usage[-1] + class RegisterPool: def __init__(self, registers): self.registers = registers @@ -56,12 +70,14 @@ def assign(self, asm): if vreg.lastUsage() is instr: unlive.append(vreg.register) + def usagePass(asm): for instruction in asm.flatten(): for reg in instruction.regs(): if isinstance(reg, VirtualRegister): reg.usage += [instruction] + def assignVirtualRegisters(asm, pools): usagePass(asm) for pool in pools: diff --git a/pypspamm/codegen/visitor.py b/pypspamm/codegen/visitor.py index 8ae6a72..29d8b6c 100644 --- a/pypspamm/codegen/visitor.py +++ b/pypspamm/codegen/visitor.py @@ -1,5 +1,6 @@ from pypspamm.codegen.ast import * + class Visitor: def visitStmt(self, stmt: GenericStmt) -> None: diff --git a/pypspamm/cursors/__init__.py b/pypspamm/cursors/__init__.py index a21a3a4..734be8c 100644 --- a/pypspamm/cursors/__init__.py +++ b/pypspamm/cursors/__init__.py @@ -1,6 +1,5 @@ -from pypspamm.cursors.matrix import Matrix -from pypspamm.cursors.coords import Coords -from pypspamm.cursors.abstractcursor import BlockInfo, CursorLocation, Cursor +from 
pypspamm.cursors.abstractcursor import BlockInfo, Cursor, CursorLocation from pypspamm.cursors.blockcursor import BlockCursor, sparse_mask +from pypspamm.cursors.coords import Coords from pypspamm.cursors.densecursor import DenseCursor - +from pypspamm.cursors.matrix import Matrix diff --git a/pypspamm/cursors/abstractcursor.py b/pypspamm/cursors/abstractcursor.py index 8f43283..e1dfca9 100644 --- a/pypspamm/cursors/abstractcursor.py +++ b/pypspamm/cursors/abstractcursor.py @@ -1,24 +1,22 @@ -from pypspamm.cursors.matrix import Matrix -from pypspamm.cursors.coords import Coords - -from pypspamm.codegen.operands import * -from pypspamm.codegen.ast import AsmStmt, Command - +from collections import namedtuple from typing import List, Tuple -from collections import namedtuple +from pypspamm.codegen.ast import AsmStmt, Command +from pypspamm.codegen.operands import * +from pypspamm.cursors.coords import Coords +from pypspamm.cursors.matrix import Matrix BlockInfo = namedtuple("Blockinfo", ("br bc pattern_index pattern")) + class CursorLocation: current_block = None # Absolute coords of current block - current_cell = None # Relative? + current_cell = None # Relative? 
- def __init__(self, - current_block = Coords(absolute=True), - current_cell = Coords(absolute=False) - ) -> None: - assert(current_cell.absolute == False) + def __init__( + self, current_block=Coords(absolute=True), current_cell=Coords(absolute=False) + ) -> None: + assert current_cell.absolute == False self.current_block = current_block self.current_cell = current_cell @@ -48,35 +46,32 @@ def brf(self) -> int: def bcf(self) -> int: return self.c % self.bc - def move(self, - src: CursorLocation, - dest_block: Coords - ) -> Tuple[AsmStmt, CursorLocation]: + def move( + self, src: CursorLocation, dest_block: Coords + ) -> Tuple[AsmStmt, CursorLocation]: raise NotImplementedError() - def look(self, - src: CursorLocation, - dest_block: Coords, - dest_cell: Coords - ) -> Tuple[MemoryAddress, str]: + def look( + self, src: CursorLocation, dest_block: Coords, dest_cell: Coords + ) -> Tuple[MemoryAddress, str]: raise NotImplementedError() - def start_location(self, dest_block: Coords = Coords(absolute=True)) -> CursorLocation: + def start_location( + self, dest_block: Coords = Coords(absolute=True) + ) -> CursorLocation: raise NotImplementedError() - def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockInfo: + def get_block( + self, src: CursorLocation = None, dest_block: Coords = None + ) -> BlockInfo: raise NotImplementedError() class CursorMovement(Command): matrix = None + class CursorLookup(MemoryAddress): matrix = None src = None dest = None - - - - - diff --git a/pypspamm/cursors/blockcursor.py b/pypspamm/cursors/blockcursor.py index 8126117..26a969c 100644 --- a/pypspamm/cursors/blockcursor.py +++ b/pypspamm/cursors/blockcursor.py @@ -1,9 +1,10 @@ +from typing import cast + +from pypspamm.codegen.sugar import * from pypspamm.cursors.abstractcursor import * -from pypspamm.cursors.matrix import Matrix from pypspamm.cursors.coords import Coords +from pypspamm.cursors.matrix import Matrix -from pypspamm.codegen.sugar import * -from typing 
import cast class BlockCursor(Cursor): @@ -11,18 +12,20 @@ class BlockCursor(Cursor): patterns = None offsets = None - def __init__(self, - name: str, - base_ptr: Register, - rows: int, - cols: int, - ld: int, - block_rows: int, - block_cols: int, - scalar_bytes:int, - blocks: Matrix[int], - patterns: List[Matrix[bool]], - mtx_overhead) -> None: + def __init__( + self, + name: str, + base_ptr: Register, + rows: int, + cols: int, + ld: int, + block_rows: int, + block_cols: int, + scalar_bytes: int, + blocks: Matrix[int], + patterns: List[Matrix[bool]], + mtx_overhead, + ) -> None: self.name = name self.base_ptr = base_ptr @@ -42,18 +45,15 @@ def __init__(self, Bci = i // self.bc Bri = j // self.br index = cast(int, blocks[Bri, Bci]) - pattern = patterns[index] - if pattern[j % self.br,i % self.bc]: + pattern = patterns[index] + if pattern[j % self.br, i % self.bc]: self.offsets[j, i] = x x += 1 if ld != 0: x += self.ld - self.r x += mtx_overhead[i] - def offset(self, - src_loc: CursorLocation, - dest_loc: CursorLocation - ) -> int: + def offset(self, src_loc: CursorLocation, dest_loc: CursorLocation) -> int: src_block = src_loc.current_block src_cell = src_loc.current_cell @@ -63,26 +63,24 @@ def offset(self, if not dest_block.absolute: dest_block += src_block - assert(src_block.absolute) - assert(not src_cell.absolute) - assert(not dest_cell.absolute) + assert src_block.absolute + assert not src_cell.absolute + assert not dest_cell.absolute - src_cell += Coords(src_block.down*self.br, src_block.right*self.bc, True) - dest_cell += Coords(dest_block.down*self.br, dest_block.right*self.bc, True) + src_cell += Coords(src_block.down * self.br, src_block.right * self.bc, True) + dest_cell += Coords(dest_block.down * self.br, dest_block.right * self.bc, True) src_offset = self.offsets[src_cell.down, src_cell.right] dest_offset = self.offsets[dest_cell.down, dest_cell.right] - if (src_offset == -1 or dest_offset == -1): + if src_offset == -1 or dest_offset == -1: raise 
Exception("Cursor location does not exist in memory!") return dest_offset - - def move(self, - src_loc: CursorLocation, - dest_block: Coords - ) -> Tuple[AsmStmt, CursorLocation]: + def move( + self, src_loc: CursorLocation, dest_block: Coords + ) -> Tuple[AsmStmt, CursorLocation]: comment = f"Move {self.name} to {str(dest_block)}" @@ -92,45 +90,42 @@ def move(self, dest_loc = self.start_location(dest_block + src_loc.current_block) offset_bytes = self.offset(src_loc, dest_loc) * self.scalar_bytes - - return add(offset_bytes, self.base_ptr, comment), dest_loc + return add(offset_bytes, self.base_ptr, comment), dest_loc - def look(self, - src_loc: CursorLocation, - dest_block: Coords, - dest_cell: Coords - ) -> Tuple[MemoryAddress, str]: + def look( + self, src_loc: CursorLocation, dest_block: Coords, dest_cell: Coords + ) -> Tuple[MemoryAddress, str]: dest_loc = CursorLocation(dest_block, dest_cell) offset_bytes = self.offset(src_loc, dest_loc) * self.scalar_bytes comment = f"{self.name}[{dest_block.down},{dest_block.right}][{dest_cell.down},{dest_cell.right}]" addr = pypspamm.architecture.operands.mem(self.base_ptr, offset_bytes) - - return (addr, comment) + return (addr, comment) - def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockInfo: + def get_block( + self, src: CursorLocation = None, dest_block: Coords = None + ) -> BlockInfo: - if src is None: # Have dest_block but no src - assert(dest_block is not None) - assert(dest_block.absolute == True) + if src is None: # Have dest_block but no src + assert dest_block is not None + assert dest_block.absolute == True block_abs = dest_block - elif dest_block is None: # Have src but no dest_block - assert(src.current_block.absolute == True) + elif dest_block is None: # Have src but no dest_block + assert src.current_block.absolute == True block_abs = src.current_block - elif dest_block.absolute: # Have src and absolute dest_block + elif dest_block.absolute: # Have src and absolute dest_block 
block_abs = dest_block - else: # Have both src and relative dest_block - assert(src.current_block.absolute == True) + else: # Have both src and relative dest_block + assert src.current_block.absolute == True block_abs = dest_block + src.current_block - - br = self.br if block_abs.down < self.Br else self.brf #TODO: Verify these + br = self.br if block_abs.down < self.Br else self.brf # TODO: Verify these bc = self.bc if block_abs.right < self.Bc else self.bcf index = self.blocks[block_abs.down, block_abs.right] index = cast(int, index) # TODO: Overload functions correctly @@ -138,42 +133,46 @@ def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockI pattern = cast(Matrix[bool], pattern) return BlockInfo(br, bc, index, pattern) + def has_nonzero_cell( + self, src_loc: CursorLocation, dest_block: Coords, dest_cell: Coords + ) -> bool: - def has_nonzero_cell(self, - src_loc: CursorLocation, - dest_block: Coords, - dest_cell: Coords - ) -> bool: - - assert(not dest_cell.absolute) + assert not dest_cell.absolute if not dest_block.absolute: dest_block += src_loc.current_block - dest_cell += Coords(dest_block.down*self.br, dest_block.right*self.bc, True) - return self.offsets.shape[0] > dest_cell.down and self.offsets.shape[1] > dest_cell.right and self.offsets[dest_cell.down, dest_cell.right] != -1 - + dest_cell += Coords(dest_block.down * self.br, dest_block.right * self.bc, True) + return ( + self.offsets.shape[0] > dest_cell.down + and self.offsets.shape[1] > dest_cell.right + and self.offsets[dest_cell.down, dest_cell.right] != -1 + ) def has_nonzero_block(self, src: CursorLocation, dest_block: Coords) -> bool: nonzero = False - br,bc,idx,pat = self.get_block(src, dest_block) + br, bc, idx, pat = self.get_block(src, dest_block) for bci in range(bc): for bri in range(br): - if pat[bri,bci]: + if pat[bri, bci]: nonzero = True return nonzero + def start_location( + self, dest_block: Coords = Coords(absolute=True) + ) -> CursorLocation: - def 
start_location(self, dest_block: Coords = Coords(absolute=True)) -> CursorLocation: - - assert(dest_block.absolute == True) - br,bc,idx,pat = self.get_block(dest_block=dest_block) + assert dest_block.absolute == True + br, bc, idx, pat = self.get_block(dest_block=dest_block) for bci in range(bc): for bri in range(br): - if pat[bri,bci]: - return CursorLocation(dest_block, Coords(down=bri, right=bci, absolute=False)) - - raise Exception(f"Block {dest_block} has no starting location because it is empty!") + if pat[bri, bci]: + return CursorLocation( + dest_block, Coords(down=bri, right=bci, absolute=False) + ) + raise Exception( + f"Block {dest_block} has no starting location because it is empty!" + ) def start(self) -> CursorLocation: @@ -186,16 +185,17 @@ def start(self) -> CursorLocation: raise Exception("Matrix is completely empty!") -def sparse_mask(A_regs: Matrix[Register], - A: Cursor, - A_ptr: CursorLocation, - A_block_offset: Coords, - B: Cursor, - B_ptr: CursorLocation, - B_block_offset: Coords, - v_size: int, - has_mask: bool = False - ) -> Matrix[bool]: +def sparse_mask( + A_regs: Matrix[Register], + A: Cursor, + A_ptr: CursorLocation, + A_block_offset: Coords, + B: Cursor, + B_ptr: CursorLocation, + B_block_offset: Coords, + v_size: int, + has_mask: bool = False, +) -> Matrix[bool]: Vr, Vc = A_regs.shape mask = Matrix.full(Vr, Vc, False) @@ -203,13 +203,15 @@ def sparse_mask(A_regs: Matrix[Register], B_br, B_bc, B_idx, B_pat = B.get_block(B_ptr, B_block_offset) if not has_mask: - assert (A_br % v_size == 0) # bm must tile m exactly for now in non-mask-supporting ISAs - assert(Vc >= A_bc) # Matrix block must fit in register block - assert(A_bc == B_br) # Matrix blocks are compatible + assert ( + A_br % v_size == 0 + ) # bm must tile m exactly for now in non-mask-supporting ISAs + assert Vc >= A_bc # Matrix block must fit in register block + assert A_bc == B_br # Matrix blocks are compatible # Mask out registers not used in current block, including 
zero-rows of B and A for Vci in range(A_bc): - if B_pat[Vci,:].any(axis=1): - mask[:,Vci] = A_pat[:,Vci] + if B_pat[Vci, :].any(axis=1): + mask[:, Vci] = A_pat[:, Vci] return mask diff --git a/pypspamm/cursors/coords.py b/pypspamm/cursors/coords.py index fbdd817..176bed0 100644 --- a/pypspamm/cursors/coords.py +++ b/pypspamm/cursors/coords.py @@ -6,33 +6,38 @@ # a logical block start, or a physical block start depending on context. # We are including a {relative|absolute} flag in order to reduce the number of methods. -C = namedtuple('C', 'down right absolute') +C = namedtuple("C", "down right absolute") C.__new__.__defaults__ = (0, 0, False) + class Coords(C): def copy(self): return Coords(self.down, self.right, self.absolute) - + def __add__(self, other): absolute = self.absolute | other.absolute - return Coords(self.down+other.down, self.right+other.right, absolute) + return Coords(self.down + other.down, self.right + other.right, absolute) def __sub__(self, other): - absolute = self.absolute != other.absolute # TODO: What is the math behind this? - return Coords(self.down-other.down, self.right-other.right, absolute) + absolute = ( + self.absolute != other.absolute + ) # TODO: What is the math behind this? 
+ return Coords(self.down - other.down, self.right - other.right, absolute) def __neg__(self, other): return Coords(-self.down, -self.right, self.absolute) def __eq__(self, other): - return self.down == other.down and \ - self.right == other.right and \ - self.absolute == other.absolute + return ( + self.down == other.down + and self.right == other.right + and self.absolute == other.absolute + ) def __repr__(self): if self.absolute: absolute = ", absolute" else: absolute = "" - return f"(d={self.down},r={self.right}{absolute})" \ No newline at end of file + return f"(d={self.down},r={self.right}{absolute})" diff --git a/pypspamm/cursors/densecursor.py b/pypspamm/cursors/densecursor.py index 92f0384..eefa3ba 100644 --- a/pypspamm/cursors/densecursor.py +++ b/pypspamm/cursors/densecursor.py @@ -4,22 +4,23 @@ from pypspamm.cursors import * - class DenseCursor(Cursor): - def __init__(self, - name: str, - base_ptr: Register, - rows:int, - cols:int, - ld: int, - block_rows: int, - block_cols: int, - scalar_bytes:int) -> None: + def __init__( + self, + name: str, + base_ptr: Register, + rows: int, + cols: int, + ld: int, + block_rows: int, + block_cols: int, + scalar_bytes: int, + ) -> None: self.name = name self.r, self.c = rows, cols - self.br, self.bc = block_rows, block_cols + self.br, self.bc = block_rows, block_cols self.pattern = Matrix.full(block_rows, block_cols, True) self.base_ptr = base_ptr @@ -30,31 +31,26 @@ def __init__(self, x = 0 for bci in range(self.bc): for bri in range(self.br): - self.offsets[bri,bci] = x + self.offsets[bri, bci] = x x += 1 - def offset(self, - src_block: Coords, - dest_block: Coords, - dest_cell: Coords - ) -> int: + def offset(self, src_block: Coords, dest_block: Coords, dest_cell: Coords) -> int: # TODO: Why not make offset compute the 1D distance # from current pointer to desired logical cell instead? 
- assert(src_block.absolute == True) - assert(dest_cell.absolute == False) + assert src_block.absolute == True + assert dest_cell.absolute == False if not dest_block.absolute: dest_block += src_block Bri, Bci = dest_block.down, dest_block.right bri, bci = dest_cell.down, dest_cell.right - return (Bci*self.bc + bci) * self.ld + Bri*self.br + bri + return (Bci * self.bc + bci) * self.ld + Bri * self.br + bri - def move(self, - src: CursorLocation, - dest_block: Coords - ) -> Tuple[AsmStmt, CursorLocation]: + def move( + self, src: CursorLocation, dest_block: Coords + ) -> Tuple[AsmStmt, CursorLocation]: if dest_block.absolute: dest_block_abs = dest_block @@ -68,13 +64,11 @@ def move(self, dest = CursorLocation(dest_block_abs, src.current_cell) return (add(rel_offset, self.base_ptr, comment), dest) - def look(self, - src: CursorLocation, - dest_block: Coords, - dest_cell: Coords - ) -> Tuple[MemoryAddress, str]: + def look( + self, src: CursorLocation, dest_block: Coords, dest_cell: Coords + ) -> Tuple[MemoryAddress, str]: - assert(dest_cell.absolute == False) + assert dest_cell.absolute == False comment = f"{self.name} [{dest_block.down},{dest_block.right}] [{dest_cell.down},{dest_cell.right}]" @@ -86,33 +80,40 @@ def look(self, return (addr, comment) + def start_location( + self, dest_block: Coords = Coords(absolute=True) + ) -> CursorLocation: - def start_location(self, dest_block: Coords = Coords(absolute=True)) -> CursorLocation: - - assert(dest_block.absolute == True) - #TODO: Handle fringe case? + assert dest_block.absolute == True + # TODO: Handle fringe case? 
for bci in range(self.bc): for bri in range(self.br): if self.offsets[bri, bci] != -1: - return CursorLocation(dest_block, Coords(down=bri, right=bci, absolute=False)) + return CursorLocation( + dest_block, Coords(down=bri, right=bci, absolute=False) + ) - raise Exception(f"Block {dest_block} has no starting location because it is empty!") + raise Exception( + f"Block {dest_block} has no starting location because it is empty!" + ) - def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockInfo: + def get_block( + self, src: CursorLocation = None, dest_block: Coords = None + ) -> BlockInfo: if src is None: - assert(dest_block is not None) - assert(dest_block.absolute == True) + assert dest_block is not None + assert dest_block.absolute == True block_abs = dest_block elif dest_block is None: - assert(src.current_block.absolute == True) + assert src.current_block.absolute == True block_abs = src.current_block else: - assert(src.current_block.absolute == True) - assert(dest_block.absolute == False) + assert src.current_block.absolute == True + assert dest_block.absolute == False block_abs = dest_block + src.current_block - br = self.br if block_abs.down < self.Br else self.brf #TODO: Verify these + br = self.br if block_abs.down < self.Br else self.brf # TODO: Verify these bc = self.bc if block_abs.right < self.Bc else self.bcf index = 0 pattern = self.pattern[0:br, 0:bc] @@ -121,12 +122,14 @@ def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockI def has_nonzero_block(self, src: CursorLocation, dest_block: Coords) -> bool: return True - - def has_nonzero_cell(self, - src_loc: CursorLocation, - dest_block: Coords, - dest_cell: Coords) -> bool: - return self.offsets.shape[0] > dest_cell.down and self.offsets.shape[1] > dest_cell.right + + def has_nonzero_cell( + self, src_loc: CursorLocation, dest_block: Coords, dest_cell: Coords + ) -> bool: + return ( + self.offsets.shape[0] > dest_cell.down + and self.offsets.shape[1] > 
dest_cell.right + ) def start(self) -> CursorLocation: return CursorLocation() diff --git a/pypspamm/cursors/matrix.py b/pypspamm/cursors/matrix.py index c939cb5..cc39709 100644 --- a/pypspamm/cursors/matrix.py +++ b/pypspamm/cursors/matrix.py @@ -1,16 +1,17 @@ - - # Need a native Python matrix type. # Lists of lists are too cumbersome, and scipy does not understand typing. # Also don't want to introduce a hard dependence on scipy if not necessary. -from typing import TypeVar, Generic, Union, Tuple, List, overload, Any -from scipy.sparse import csc_matrix -from scipy.io import mmread, mmwrite -import numpy as np import random +from typing import Any, Generic, List, Tuple, TypeVar, Union, overload + +import numpy as np +from scipy.io import mmread, mmwrite +from scipy.sparse import csc_matrix + +T = TypeVar("T") + -T = TypeVar('T') class Matrix(Generic[T]): def __init__(self, data): @@ -23,16 +24,16 @@ def __init__(self, data): self.cols = self.shape[1] @classmethod - def full(cls, rows:int, cols:int, initial_value:T): + def full(cls, rows: int, cols: int, initial_value: T): """Create a brand new matrix of given size""" - return cls(np.full((rows,cols), initial_value)) + return cls(np.full((rows, cols), initial_value)) def __repr__(self): col_str = [] for ri in range(self.rows): row_str = [] for ci in range(self.cols): - row_str.append(str(self._underlying[ri,ci]).rjust(8)) + row_str.append(str(self._underlying[ri, ci]).rjust(8)) col_str.append("".join(row_str)) return "\n".join(col_str) @@ -40,11 +41,11 @@ def __eq__(self, other): return (self._underlying == other._underlying).all() @overload - def __getitem__(self, t: Tuple[slice,slice]) -> "Matrix[T]": + def __getitem__(self, t: Tuple[slice, slice]) -> "Matrix[T]": pass @overload - def __getitem__(self, t: Tuple[int,int]) -> T: + def __getitem__(self, t: Tuple[int, int]) -> T: pass def __getitem__(self, t) -> Union[T, "Matrix[T]"]: @@ -54,7 +55,7 @@ def __getitem__(self, t) -> Union[T, "Matrix[T]"]: else: return 
result - def __setitem__(self, cell:Tuple[int,int], value:T): + def __setitem__(self, cell: Tuple[int, int], value: T): self._underlying[cell] = value def __or__(self, other): @@ -68,12 +69,17 @@ def any(self, axis=None, out=None): def nnz(self, axis=None) -> Union[int, List[int]]: if axis is None: - return sum(self[r,c] != 0 for r in range(self.rows) - for c in range(self.cols)) + return sum( + self[r, c] != 0 for r in range(self.rows) for c in range(self.cols) + ) if axis == 1: - return [sum(self[r,c] != 0 for r in range(self.rows)) for c in range(self.cols)] + return [ + sum(self[r, c] != 0 for r in range(self.rows)) for c in range(self.cols) + ] if axis == 0: - return [sum(self[r,c] != 0 for c in range(self.cols)) for r in range(self.rows)] + return [ + sum(self[r, c] != 0 for c in range(self.cols)) for r in range(self.rows) + ] @classmethod def load_pattern(cls, filename) -> "Matrix[bool]": diff --git a/pypspamm/matmul.py b/pypspamm/matmul.py index 995ebef..6bddab3 100644 --- a/pypspamm/matmul.py +++ b/pypspamm/matmul.py @@ -1,21 +1,21 @@ from typing import Tuple +import numpy + +import pypspamm.architecture from pypspamm.codegen.ast import * -from pypspamm.codegen.sugar import * from pypspamm.codegen.forms import * from pypspamm.codegen.precision import * - -from pypspamm.cursors import * - -from pypspamm.codegen.virtual import * from pypspamm.codegen.prune import * - -import pypspamm.architecture -import numpy +from pypspamm.codegen.sugar import * +from pypspamm.codegen.virtual import * +from pypspamm.cursors import * -def decompose_pattern(k, n, pattern:Matrix[bool], bk:int, bn:int) -> Tuple[Matrix[int], List[Matrix[bool]]]: - Bk,Bn = k//bk, n//bn +def decompose_pattern( + k, n, pattern: Matrix[bool], bk: int, bn: int +) -> Tuple[Matrix[int], List[Matrix[bool]]]: + Bk, Bn = k // bk, n // bn patterns = [] x = 0 @@ -27,20 +27,31 @@ def decompose_pattern(k, n, pattern:Matrix[bool], bk:int, bn:int) -> Tuple[Matri if k_overhead > 0: Bk += 1 - blocks = 
Matrix.full(Bk,Bn,-1) + blocks = Matrix.full(Bk, Bn, -1) for Bni in range(Bn): for Bki in range(Bk): if Bni + 1 == Bn and n_overhead > 0 and Bki + 1 == Bk and k_overhead > 0: - block = pattern[(Bki*bk):((Bki+1)*bk+k_overhead), (Bni*bn):((Bni)*bn+n_overhead)] + block = pattern[ + (Bki * bk) : ((Bki + 1) * bk + k_overhead), + (Bni * bn) : ((Bni) * bn + n_overhead), + ] elif Bni + 1 == Bn and n_overhead > 0: - block = pattern[(Bki*bk):((Bki+1)*bk), (Bni*bn):((Bni)*bn+n_overhead)] + block = pattern[ + (Bki * bk) : ((Bki + 1) * bk), + (Bni * bn) : ((Bni) * bn + n_overhead), + ] elif Bki + 1 == Bk and k_overhead > 0: - block = pattern[(Bki*bk):((Bki+1)*bk+k_overhead), (Bni*bn):((Bni+1)*bn)] + block = pattern[ + (Bki * bk) : ((Bki + 1) * bk + k_overhead), + (Bni * bn) : ((Bni + 1) * bn), + ] else: - block = pattern[(Bki*bk):((Bki+1)*bk), (Bni*bn):((Bni+1)*bn)] - - blocks[Bki,Bni] = x + block = pattern[ + (Bki * bk) : ((Bki + 1) * bk), (Bni * bn) : ((Bni + 1) * bn) + ] + + blocks[Bki, Bni] = x x += 1 patterns.append(block) @@ -53,31 +64,33 @@ def decompose_pattern(k, n, pattern:Matrix[bool], bk:int, bn:int) -> Tuple[Matri return blocks, patterns, mtx_overhead + class MatMul: - def __init__(self, - m: int, - n: int, - k: int, - lda: int, - ldb: int, - ldc: int, - alpha: str, - beta: str, - mtx_filename: str, - amtx_filename: str, - bmtx_filename: str, - mtx_format: str = 'any', - output_funcname: str = None, - output_filename: str = None, - output_overwrite: bool = False, - bm: int = None, - bn: int = None, - bk: int = None, - arch: str = 'knl', - precision: str = 'd', - prefetching: str = None, - **kwargs # Accept and ignore args which don't belong - ) -> None: + def __init__( + self, + m: int, + n: int, + k: int, + lda: int, + ldb: int, + ldc: int, + alpha: str, + beta: str, + mtx_filename: str, + amtx_filename: str, + bmtx_filename: str, + mtx_format: str = "any", + output_funcname: str = None, + output_filename: str = None, + output_overwrite: bool = False, + bm: int = 
None, + bn: int = None, + bk: int = None, + arch: str = "knl", + precision: str = "d", + prefetching: str = None, + **kwargs, # Accept and ignore args which don't belong + ) -> None: self.m = m self.n = n @@ -88,91 +101,97 @@ def __init__(self, self.ldc = ldc try: - self.alpha = float(alpha) + self.alpha = float(alpha) except: - self.alpha = 'generic' + self.alpha = "generic" try: - self.beta = float(beta) + self.beta = float(beta) except: - self.beta = 'generic' + self.beta = "generic" - if arch.startswith('skx'): - arch = 'knl' + arch[3:] + if arch.startswith("skx"): + arch = "knl" + arch[3:] # hacky implementation of multi-register length - if arch.startswith('arm_sve'): - if len(arch) == 7: - v_len_regs = 4 # compatibility: arm_sve == arm_sve512 - else: - v_len_bits = int(arch[7:]) - assert v_len_bits % 128 == 0 and v_len_bits <= 2048 - v_len_regs = v_len_bits // 128 - arch = 'arm_sve' - - if arch.startswith('knl'): - if len(arch) == 3: - v_len_regs = 4 - else: - v_len_bits = int(arch[3:]) - assert v_len_bits in (128, 256, 512) - v_len_regs = v_len_bits // 128 - arch = 'knl' - - if arch.startswith('hsw'): - if len(arch) == 3: - v_len_regs = 2 - else: - v_len_bits = int(arch[3:]) - assert v_len_bits in (128, 256) - v_len_regs = v_len_bits // 128 - arch = 'hsw' - - if arch.startswith('rvv'): - if len(arch) == 3: - v_len_regs = 1 - else: - v_len_bits = int(arch[3:]) - assert v_len_bits in (128, 256, 512, 1024, 2048, 4096, 8192) - v_len_regs = v_len_bits // 128 - arch = 'rvv' - - if arch.startswith('arm') and not arch.startswith('arm_sve'): - # only 128 supported - v_len_regs = 1 - arch = 'arm' - - if arch.startswith('lsx'): - if len(arch) == 3: + if arch.startswith("arm_sve"): + if len(arch) == 7: + v_len_regs = 4 # compatibility: arm_sve == arm_sve512 + else: + v_len_bits = int(arch[7:]) + assert v_len_bits % 128 == 0 and v_len_bits <= 2048 + v_len_regs = v_len_bits // 128 + arch = "arm_sve" + + if arch.startswith("knl"): + if len(arch) == 3: + v_len_regs = 4 + 
else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256, 512) + v_len_regs = v_len_bits // 128 + arch = "knl" + + if arch.startswith("hsw"): + if len(arch) == 3: + v_len_regs = 2 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256) + v_len_regs = v_len_bits // 128 + arch = "hsw" + + if arch.startswith("rvv"): + if len(arch) == 3: + v_len_regs = 1 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256, 512, 1024, 2048, 4096, 8192) + v_len_regs = v_len_bits // 128 + arch = "rvv" + + if arch.startswith("arm") and not arch.startswith("arm_sve"): + # only 128 supported v_len_regs = 1 - else: - v_len_bits = int(arch[3:]) - assert v_len_bits in (128, 256) - v_len_regs = v_len_bits // 128 - arch = 'lsx' - - if arch.startswith('lasx'): - if len(arch) == 4: - v_len_regs = 2 - else: - v_len_bits = int(arch[4:]) - assert v_len_bits in (128, 256) - v_len_regs = v_len_bits // 128 - arch = 'lsx' + arch = "arm" + + if arch.startswith("lsx"): + if len(arch) == 3: + v_len_regs = 1 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256) + v_len_regs = v_len_bits // 128 + arch = "lsx" + + if arch.startswith("lasx"): + if len(arch) == 4: + v_len_regs = 2 + else: + v_len_bits = int(arch[4:]) + assert v_len_bits in (128, 256) + v_len_regs = v_len_bits // 128 + arch = "lsx" self.arch = arch - assert precision.lower() in ['bf16', 'h', 's', 'd'] + assert precision.lower() in ["bf16", "h", "s", "d"] self.precision = { - 'h' : Precision.HALF, - 's' : Precision.SINGLE, - 'd' : Precision.DOUBLE, - 'bf16' : Precision.BFLOAT16 + "h": Precision.HALF, + "s": Precision.SINGLE, + "d": Precision.DOUBLE, + "bf16": Precision.BFLOAT16, }[precision.lower()] pypspamm.architecture.init() pypspamm.architecture.arch = arch - pypspamm.architecture.Generator = pypspamm.architecture.get_class("pypspamm.codegen.architectures." 
+ arch + ".generator").Generator - pypspamm.architecture.operands = pypspamm.architecture.get_class("pypspamm.codegen.architectures." + arch + ".operands") - pypspamm.architecture.blocksize = pypspamm.architecture.get_class("pypspamm.codegen.architectures." + arch + ".blocksize").Default + pypspamm.architecture.Generator = pypspamm.architecture.get_class( + "pypspamm.codegen.architectures." + arch + ".generator" + ).Generator + pypspamm.architecture.operands = pypspamm.architecture.get_class( + "pypspamm.codegen.architectures." + arch + ".operands" + ) + pypspamm.architecture.blocksize = pypspamm.architecture.get_class( + "pypspamm.codegen.architectures." + arch + ".blocksize" + ).Default self.generator = pypspamm.architecture.Generator(self.precision) @@ -186,11 +205,13 @@ def __init__(self, self.v_size = self.generator.get_v_size() if bk == None: - bk = 2 if arch == 'knl' else 1 + bk = 2 if arch == "knl" else 1 if bm == None or bn == None: - (self.bm, self.bn, self.bk) = pypspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) - else: + (self.bm, self.bn, self.bk) = pypspamm.architecture.blocksize.getBlocksize( + m, n, bk, self.v_size, self.precision + ) + else: self.bm = bm self.bn = bn self.bk = bk @@ -202,25 +223,25 @@ def __init__(self, self.output_overwrite = output_overwrite if ldb == 0: - if bmtx_filename is None or bmtx_filename == '': + if bmtx_filename is None or bmtx_filename == "": bmtx_filename = mtx_filename bpattern = Matrix.load(bmtx_filename) self.generator.set_sparse() else: bpattern = Matrix.full(k, n, True) assert self.k <= ldb - + if lda == 0: apattern = Matrix.load(amtx_filename) self.generator.set_sparse() else: apattern = Matrix.full(m, k, True) assert self.m <= lda - + self.bmtx_filename = bmtx_filename self.amtx_filename = amtx_filename self.mtx_format = mtx_format - + assert self.m <= ldc self.bnnz = bpattern.nnz() @@ -229,34 +250,144 @@ def __init__(self, # compute flops by splitting into outer products over 
k kannz = apattern.nnz(1) kbnnz = bpattern.nnz(0) - self.flop = 2 * sum(ka * kb for ka,kb in zip(kannz, kbnnz)) + self.flop = 2 * sum(ka * kb for ka, kb in zip(kannz, kbnnz)) # if matrices are always padded to multiple of v_size, we can remove the if-part and execute the assert for SVE too if not self.masks: - assert(self.m % self.v_size == 0) - - self.A_regs, self.B_regs, self.C_regs, self.starting_regs, self.alpha_reg, self.beta_reg, self.loop_regs, self.additional_regs, self.mask_regs, self.prefetch_reg = self.generator.make_reg_blocks(self.bm, self.bn, self.bk, self.v_size, self.bnnz, self.m, self.n, self.k, self.prefetching) - - self.A_pool = RegisterPool([self.A_regs[i,j] for i in range(self.A_regs.shape[0]) for j in range(self.A_regs.shape[1])]) - self.B_pool = RegisterPool([self.B_regs[i,j] for i in range(self.B_regs.shape[0]) for j in range(self.B_regs.shape[1])]) - self.C_pool = RegisterPool([self.C_regs[i,j] for i in range(self.C_regs.shape[0]) for j in range(self.C_regs.shape[1])]) - - self.alpha_bcst_reg, self.beta_bcst_reg = self.starting_regs[3], self.starting_regs[4] + assert self.m % self.v_size == 0 + + ( + self.A_regs, + self.B_regs, + self.C_regs, + self.starting_regs, + self.alpha_reg, + self.beta_reg, + self.loop_regs, + self.additional_regs, + self.mask_regs, + self.prefetch_reg, + ) = self.generator.make_reg_blocks( + self.bm, + self.bn, + self.bk, + self.v_size, + self.bnnz, + self.m, + self.n, + self.k, + self.prefetching, + ) + + self.A_pool = RegisterPool( + [ + self.A_regs[i, j] + for i in range(self.A_regs.shape[0]) + for j in range(self.A_regs.shape[1]) + ] + ) + self.B_pool = RegisterPool( + [ + self.B_regs[i, j] + for i in range(self.B_regs.shape[0]) + for j in range(self.B_regs.shape[1]) + ] + ) + self.C_pool = RegisterPool( + [ + self.C_regs[i, j] + for i in range(self.C_regs.shape[0]) + for j in range(self.C_regs.shape[1]) + ] + ) + + self.alpha_bcst_reg, self.beta_bcst_reg = ( + self.starting_regs[3], + self.starting_regs[4], + 
) if lda == 0: - blocks, patterns, mtx_overhead = decompose_pattern(self.m, self.k, apattern, self.bm, self.bk) - self.A = BlockCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size(), blocks, patterns, mtx_overhead) + blocks, patterns, mtx_overhead = decompose_pattern( + self.m, self.k, apattern, self.bm, self.bk + ) + self.A = BlockCursor( + "A", + self.starting_regs[0], + self.m, + self.k, + self.lda, + self.bm, + self.bk, + self.precision.size(), + blocks, + patterns, + mtx_overhead, + ) self.annz += sum(mtx_overhead) else: - self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size()) + self.A = DenseCursor( + "A", + self.starting_regs[0], + self.m, + self.k, + self.lda, + self.bm, + self.bk, + self.precision.size(), + ) if ldb == 0: - blocks, patterns, mtx_overhead = decompose_pattern(self.k, self.n, bpattern, self.bk, self.bn) - self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size(), blocks, patterns, mtx_overhead) + blocks, patterns, mtx_overhead = decompose_pattern( + self.k, self.n, bpattern, self.bk, self.bn + ) + self.B = BlockCursor( + "B", + self.starting_regs[1], + self.k, + self.n, + self.ldb, + self.bk, + self.bn, + self.precision.size(), + blocks, + patterns, + mtx_overhead, + ) self.bnnz += sum(mtx_overhead) else: - self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size()) - self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) - self.C_pf = DenseCursor("C_pf", self.starting_regs[5], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) if self.prefetch_reg else None + self.B = DenseCursor( + "B", + self.starting_regs[1], + self.k, + self.n, + self.ldb, + self.bk, + self.bn, + self.precision.size(), + ) + self.C = DenseCursor( + "C", + self.starting_regs[2], 
+ self.m, + self.n, + self.ldc, + self.bm, + self.bn, + self.precision.size(), + ) + self.C_pf = ( + DenseCursor( + "C_pf", + self.starting_regs[5], + self.m, + self.n, + self.ldc, + self.bm, + self.bn, + self.precision.size(), + ) + if self.prefetch_reg + else None + ) self.unroll_n = ldb == 0 self.unroll_m = lda == 0 @@ -288,7 +419,15 @@ def microkernel(self, asm, Bmi, Bni, unroll, A_ptr, B_ptr, C_ptr, C_pf_ptr): if m_overhead > 0: Bm += 1 - regs = Matrix([[VirtualRegister(self.C_regs[0,0].typeinfo, self.C_pool) for _ in range(self.C_regs.shape[1])] for _ in range(self.C_regs.shape[0])]) + regs = Matrix( + [ + [ + VirtualRegister(self.C_regs[0, 0].typeinfo, self.C_pool) + for _ in range(self.C_regs.shape[1]) + ] + for _ in range(self.C_regs.shape[0]) + ] + ) BnEnd = Bni + 1 == Bn BmEnd = Bmi + 1 == Bm @@ -297,20 +436,47 @@ def microkernel(self, asm, Bmi, Bni, unroll, A_ptr, B_ptr, C_ptr, C_pf_ptr): regs = regs[:, :n_overhead] if BmEnd and m_overhead > 0: regs = regs[:vm_overhead, :] - + C_ptr_in = CursorLocation(Coords(right=Bni, down=Bmi, absolute=True)) to_C = Coords() C_ptr_pf_in = C_ptr_in if self.alpha in [-1.0, 1.0] and self.beta != 0.0: - asm.add(self.generator.move_register_block(self.C, C_ptr_in, to_C, regs, self.v_size, self.additional_regs, None, False)) + asm.add( + self.generator.move_register_block( + self.C, + C_ptr_in, + to_C, + regs, + self.v_size, + self.additional_regs, + None, + False, + ) + ) if self.beta != 1.0: if self.use_bcst: - asm.add(bcst(self.beta_bcst_reg, self.beta_reg[1], "Broadcast beta")) + asm.add( + bcst(self.beta_bcst_reg, self.beta_reg[1], "Broadcast beta") + ) for ic in range(regs.shape[1]): for ir in range(regs.shape[0]): - pred_m = None if not self.masks else self.generator.pred_n_trues(self.bm - ir * self.v_size, self.v_size, "m") - asm.add(mul(regs[ir,ic], self.beta_reg[1], regs[ir,ic], "C = beta * C", pred=pred_m)) + pred_m = ( + None + if not self.masks + else self.generator.pred_n_trues( + self.bm - ir * self.v_size, 
self.v_size, "m" + ) + ) + asm.add( + mul( + regs[ir, ic], + self.beta_reg[1], + regs[ir, ic], + "C = beta * C", + pred=pred_m, + ) + ) else: asm.add(self.generator.make_zero_block(regs, self.additional_regs)) @@ -322,14 +488,24 @@ def kernelK(asm, Bki): A_ptr_in = CursorLocation(Coords(right=0, down=Bmi, absolute=True)) else: A_ptr_in = A_ptr - to_A = Coords(right=Bki, down=Bmi, absolute=True) if self.unroll_m else Coords(right=Bki) + to_A = ( + Coords(right=Bki, down=Bmi, absolute=True) + if self.unroll_m + else Coords(right=Bki) + ) if BnEnd and n_overhead > 0 and not self.unroll_n: B_ptr_in = CursorLocation(Coords(down=0, right=Bni, absolute=True)) else: B_ptr_in = B_ptr - to_B = Coords(right=Bni, down=Bki, absolute=True) if self.unroll_n else Coords(down=Bki) - keep = (not self.unroll_n or self.B.has_nonzero_block(B_ptr_in, to_B)) and (not self.unroll_m or self.A.has_nonzero_block(A_ptr_in, to_A)) + to_B = ( + Coords(right=Bni, down=Bki, absolute=True) + if self.unroll_n + else Coords(down=Bki) + ) + keep = ( + not self.unroll_n or self.B.has_nonzero_block(B_ptr_in, to_B) + ) and (not self.unroll_m or self.A.has_nonzero_block(A_ptr_in, to_A)) else: # setting A_ptr, B_ptr here may be a bit too hacky... 
A_ptr_in = CursorLocation(Coords(right=Bki, down=Bmi, absolute=True)) @@ -337,44 +513,175 @@ def kernelK(asm, Bki): to_A = Coords() to_B = Coords() keep = True - + sub = self.alpha == -1.0 if keep: - A_regs = Matrix([[VirtualRegister(self.A_regs[0,0].typeinfo, self.A_pool) for _ in range(self.A_regs.shape[1])] for _ in range(self.A_regs.shape[0])]) - B_regs = Matrix([[VirtualRegister(self.B_regs[0,0].typeinfo, self.B_pool) for _ in range(self.B_regs.shape[1])] for _ in range(self.B_regs.shape[0])]) - asm.add(self.generator.make_microkernel(self.A, self.B, A_ptr_in, B_ptr_in, A_regs, B_regs, regs, self.v_size, self.additional_regs, to_A, to_B, sub)) - - self.loopwrap(asm, kernelK, Bk, k_overhead > 0, unroll, self.loop_regs[2], [self.A, self.B], [A_ptr, B_ptr], ['right', 'down'], loopunroll=1, overlap=True) + A_regs = Matrix( + [ + [ + VirtualRegister(self.A_regs[0, 0].typeinfo, self.A_pool) + for _ in range(self.A_regs.shape[1]) + ] + for _ in range(self.A_regs.shape[0]) + ] + ) + B_regs = Matrix( + [ + [ + VirtualRegister(self.B_regs[0, 0].typeinfo, self.B_pool) + for _ in range(self.B_regs.shape[1]) + ] + for _ in range(self.B_regs.shape[0]) + ] + ) + asm.add( + self.generator.make_microkernel( + self.A, + self.B, + A_ptr_in, + B_ptr_in, + A_regs, + B_regs, + regs, + self.v_size, + self.additional_regs, + to_A, + to_B, + sub, + ) + ) + + self.loopwrap( + asm, + kernelK, + Bk, + k_overhead > 0, + unroll, + self.loop_regs[2], + [self.A, self.B], + [A_ptr, B_ptr], + ["right", "down"], + loopunroll=1, + overlap=True, + ) if self.alpha not in [-1.0, 1.0]: store_block = block("") if self.use_bcst: - store_block.add(bcst(self.alpha_bcst_reg, self.alpha_reg[1], "Broadcast alpha")) + store_block.add( + bcst(self.alpha_bcst_reg, self.alpha_reg[1], "Broadcast alpha") + ) if self.beta != 0.0 and self.beta != 1.0: - store_block.add(bcst(self.beta_bcst_reg, self.beta_reg[1], "Broadcast beta")) + store_block.add( + bcst(self.beta_bcst_reg, self.beta_reg[1], "Broadcast beta") + ) 
for x in range(0, regs.shape[1], self.A_regs.shape[1]): - A_regs = Matrix([[VirtualRegister(self.A_regs[0,0].typeinfo, self.A_pool) for _ in range(self.A_regs.shape[1])] for _ in range(self.A_regs.shape[0])]) - A_regs_cut = A_regs[0:min(self.A_regs.shape[0], regs.shape[0]), 0:regs.shape[1]-x] + A_regs = Matrix( + [ + [ + VirtualRegister(self.A_regs[0, 0].typeinfo, self.A_pool) + for _ in range(self.A_regs.shape[1]) + ] + for _ in range(self.A_regs.shape[0]) + ] + ) + A_regs_cut = A_regs[ + 0 : min(self.A_regs.shape[0], regs.shape[0]), 0 : regs.shape[1] - x + ] if self.beta != 0.0: - store_block.add(self.generator.move_register_block(self.C, C_ptr_in, to_C, A_regs_cut, self.v_size, self.additional_regs, None, False, None, self.ldc * x)) + store_block.add( + self.generator.move_register_block( + self.C, + C_ptr_in, + to_C, + A_regs_cut, + self.v_size, + self.additional_regs, + None, + False, + None, + self.ldc * x, + ) + ) for ir in range(A_regs_cut.shape[0]): for ic in range(A_regs_cut.shape[1]): - pred_m = None if not self.masks else self.generator.pred_n_trues(self.bm - ir*self.v_size, self.v_size, "m") + pred_m = ( + None + if not self.masks + else self.generator.pred_n_trues( + self.bm - ir * self.v_size, self.v_size, "m" + ) + ) if self.beta != 0.0 and self.beta != 1.0: - store_block.add(mul(A_regs_cut[ir,ic], self.beta_reg[1], A_regs_cut[ir,ic], "C = beta * C + alpha * AB", pred=pred_m)) - + store_block.add( + mul( + A_regs_cut[ir, ic], + self.beta_reg[1], + A_regs_cut[ir, ic], + "C = beta * C + alpha * AB", + pred=pred_m, + ) + ) + if self.beta == 0.0: - store_block.add(mul(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = alpha * AB", pred=pred_m)) + store_block.add( + mul( + regs[ir, x + ic], + self.alpha_reg[1], + A_regs_cut[ir, ic], + "C = alpha * AB", + pred=pred_m, + ) + ) else: - store_block.add(fma(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", None, pred=pred_m)) - 
store_block.add(self.generator.move_register_block(self.C, C_ptr_in, to_C, A_regs_cut, self.v_size, self.additional_regs, None, True, self.prefetching, self.ldc * x, self.C_pf, C_pf_ptr)) + store_block.add( + fma( + regs[ir, x + ic], + self.alpha_reg[1], + A_regs_cut[ir, ic], + "C = C + alpha * AB", + None, + pred=pred_m, + ) + ) + store_block.add( + self.generator.move_register_block( + self.C, + C_ptr_in, + to_C, + A_regs_cut, + self.v_size, + self.additional_regs, + None, + True, + self.prefetching, + self.ldc * x, + self.C_pf, + C_pf_ptr, + ) + ) asm.add(store_block) else: - asm.add(self.generator.move_register_block(self.C, C_ptr_in, to_C, regs, self.v_size, self.additional_regs, None, True, self.prefetching, 0, self.C_pf, C_pf_ptr)) + asm.add( + self.generator.move_register_block( + self.C, + C_ptr_in, + to_C, + regs, + self.v_size, + self.additional_regs, + None, + True, + self.prefetching, + 0, + self.C_pf, + C_pf_ptr, + ) + ) def blockloop(self, asm, A_ptr, B_ptr, C_ptr, C_pf_ptr): Bn = self.n // self.bn @@ -387,7 +694,7 @@ def blockloop(self, asm, A_ptr, B_ptr, C_ptr, C_pf_ptr): k_overhead = self.k % self.bk m_overhead = self.m % self.bm vm_overhead = -(m_overhead // -self.v_size) - + if n_overhead > 0: Bn += 1 if k_overhead > 0: @@ -395,19 +702,35 @@ def blockloop(self, asm, A_ptr, B_ptr, C_ptr, C_pf_ptr): if m_overhead > 0: Bm += 1 - argsA = [Bm, m_overhead > 0, self.unroll_m, self.loop_regs[0], [self.A], [A_ptr], ['down']] - argsB = [Bn, n_overhead > 0, self.unroll_n, self.loop_regs[1], [self.B], [B_ptr], ['right']] + argsA = [ + Bm, + m_overhead > 0, + self.unroll_m, + self.loop_regs[0], + [self.A], + [A_ptr], + ["down"], + ] + argsB = [ + Bn, + n_overhead > 0, + self.unroll_n, + self.loop_regs[1], + [self.B], + [B_ptr], + ["right"], + ] if self.unroll_n and not self.unroll_m: # swap loops outerArgs, innerArgs = (argsB, argsA) - dirC, dirC2 = ('down', 'right') - args = lambda i,j: (j,i) + dirC, dirC2 = ("down", "right") + args = lambda i, j: (j, i) 
else: outerArgs, innerArgs = (argsA, argsB) - dirC, dirC2 = ('right', 'down') - args = lambda i,j: (i,j) - + dirC, dirC2 = ("right", "down") + args = lambda i, j: (i, j) + unroll_k = self.unroll_m | self.unroll_n def outerLoop(asm, i): @@ -415,35 +738,56 @@ def innerLoop(asm, j): Bmi, Bni = args(i, j) self.microkernel(asm, Bmi, Bni, unroll_k, A_ptr, B_ptr, C_ptr, C_pf_ptr) if j < innerArgs[0] - 1: - move_C, _ = self.C.move(C_ptr, Coords(**{dirC:1})) + move_C, _ = self.C.move(C_ptr, Coords(**{dirC: 1})) asm.add(move_C) if self.C_pf: - move_C_pf, _ = self.C_pf.move(C_pf_ptr, Coords(**{dirC:1})) + move_C_pf, _ = self.C_pf.move(C_pf_ptr, Coords(**{dirC: 1})) asm.add(move_C_pf) + overhead = self.loopwrap(asm, innerLoop, *innerArgs) - moveLength = 1-innerArgs[0] if overhead else -innerArgs[0] - asm.add(self.C.move(C_ptr, Coords(**{dirC2:1, dirC:moveLength}))[0]) + moveLength = 1 - innerArgs[0] if overhead else -innerArgs[0] + asm.add(self.C.move(C_ptr, Coords(**{dirC2: 1, dirC: moveLength}))[0]) if self.C_pf: - asm.add(self.C_pf.move(C_pf_ptr, Coords(**{dirC2:1, dirC:moveLength}))[0]) + asm.add( + self.C_pf.move(C_pf_ptr, Coords(**{dirC2: 1, dirC: moveLength}))[0] + ) self.loopwrap(asm, outerLoop, *outerArgs) - def loopwrap(self, asm, inner, length, overhead, unroll, loopreg, matrices, ptrs, directions, loopunroll=1, overlap=False): + def loopwrap( + self, + asm, + inner, + length, + overhead, + unroll, + loopreg, + matrices, + ptrs, + directions, + loopunroll=1, + overlap=False, + ): if unroll: for i in range(length): inner(asm, i) return True else: + def makeMove(dist): asm = block(f"move by {dist}") for matrix, ptr, direction in zip(matrices, ptrs, directions): - asm.add(matrix.move(ptr, Coords(**{direction:dist}))[0]) + asm.add(matrix.move(ptr, Coords(**{direction: dist}))[0]) return asm + def makeLoop(until): loopblock = block("kernel") inner(loopblock, 0) loopblock.add(makeMove(1)) - return loop(loopreg, until, unroll=loopunroll, overlap=overlap).body(loopblock) + 
return loop(loopreg, until, unroll=loopunroll, overlap=overlap).body( + loopblock + ) + if length == 1: inner(asm, 0) return True @@ -451,7 +795,7 @@ def makeLoop(until): if length > 1: asm.add(makeLoop(length - 1)) inner(asm, length - 1) - asm.add(makeMove(1-length)) + asm.add(makeMove(1 - length)) return True else: asm.add(makeLoop(length)) @@ -466,12 +810,19 @@ def make(self): asm = block("kernel") - asm.add(self.generator.make_argument_load(self.starting_regs, self.C_pf is not None)) - - asm.add(block("header", - self.generator.make_scaling_offsets(self.additional_regs, self.bnnz), - self.generator.init_mask(self.m, self.bm, self.v_size, self.loop_regs[0], self.mask_regs) - )) + asm.add( + self.generator.make_argument_load(self.starting_regs, self.C_pf is not None) + ) + + asm.add( + block( + "header", + self.generator.make_scaling_offsets(self.additional_regs, self.bnnz), + self.generator.init_mask( + self.m, self.bm, self.v_size, self.loop_regs[0], self.mask_regs + ), + ) + ) asm.add(self.generator.init_block(self.v_size)) diff --git a/pypspamm/metagen/arm.py b/pypspamm/metagen/arm.py index 6c7436a..9576871 100644 --- a/pypspamm/metagen/arm.py +++ b/pypspamm/metagen/arm.py @@ -1,10 +1,9 @@ - def arm_basic(): generator = MetaGenerator() - generator.add_condition('', 'arm128') - generator.add_condition('svcntb() == 16', 'arm_sve128') - generator.add_condition('svcntb() == 32', 'arm_sve256') - generator.add_condition('svcntb() == 64', 'arm_sve512') - generator.add_condition('svcntb() == 128', 'arm_sve1024') - generator.add_condition('svcntb() == 256', 'arm_sve2048') + generator.add_condition("", "arm128") + generator.add_condition("svcntb() == 16", "arm_sve128") + generator.add_condition("svcntb() == 32", "arm_sve256") + generator.add_condition("svcntb() == 64", "arm_sve512") + generator.add_condition("svcntb() == 128", "arm_sve1024") + generator.add_condition("svcntb() == 256", "arm_sve2048") diff --git a/pypspamm/metagen/metagen.py 
b/pypspamm/metagen/metagen.py index e4416f6..cd39423 100644 --- a/pypspamm/metagen/metagen.py +++ b/pypspamm/metagen/metagen.py @@ -1,17 +1,20 @@ -from pypspamm.matmul import MatMul from pypspamm.codegen.ccode import * +from pypspamm.matmul import MatMul + class MetaGenerator: def __init__(self): self.conditions = [] self.archs = [] - + def add_condition(self, condition, arch): self.conditions += [condition] self.archs += arch - + def generate_meta(self, funcname, params): - condition_template = " if ({condition}) {{ func = {funcname}_{arch}; }}\n" + condition_template = ( + " if ({condition}) {{ func = {funcname}_{arch}; }}\n" + ) template = """ void {funcname}({params}) {{ @@ -27,23 +30,39 @@ def generate_meta(self, funcname, params): """ conditions = "" - for (condition, arch) in zip(self.conditions, self.archs): - conditions += condition_template.format(funcname=funcname, arch=arch, condition=condition) - + for condition, arch in zip(self.conditions, self.archs): + conditions += condition_template.format( + funcname=funcname, arch=arch, condition=condition + ) + return template.format(funcname=funcname, params=params, conditions=conditions) def generate(self, alg: MatMul): block = alg.make() - return make_cfunc(alg.output_funcname, alg.generator.get_template(), block, alg.flop, alg.starting_regs, alg.generator.get_precision()) + return make_cfunc( + alg.output_funcname, + alg.generator.get_template(), + block, + alg.flop, + alg.starting_regs, + alg.generator.get_precision(), + ) if len(self.archs) == 0: return "" - + if len(self.archs) == 1: block = alg.make() - return make_cfunc(alg.output_funcname, alg.generator.get_template(), block, alg.flop, alg.starting_regs, alg.generator.get_precision()) + return make_cfunc( + alg.output_funcname, + alg.generator.get_template(), + block, + alg.flop, + alg.starting_regs, + alg.generator.get_precision(), + ) # only generate the kernel; nothing else else: text = "" @@ -51,13 +70,20 @@ def generate(self, alg: MatMul): for 
arch in self.archs: block = alg.make() - funcname = f'{alg.output_funcname}_{arch}' + funcname = f"{alg.output_funcname}_{arch}" - func = make_cfunc(funcname, alg.generator.get_template(), block, alg.flop, alg.starting_regs, alg.generator.get_precision()) + func = make_cfunc( + funcname, + alg.generator.get_template(), + block, + alg.flop, + alg.starting_regs, + alg.generator.get_precision(), + ) text += f"static {func}\n\n" - params = f"const {alg.precision.ctype()}* A, const {alg.precision.ctype()}* B, {alg.precision.ctype()}* C, {alg.precision.ctype()} alpha, {alg.precision.ctype()} beta, const {alg.precision.ctype()}* prefetch" + params = f"const {alg.precision.ctype()}* A, const {alg.precision.ctype()}* B, {alg.precision.ctype()}* C, {alg.precision.ctype()} alpha, {alg.precision.ctype()} beta, const {alg.precision.ctype()}* prefetch" text += self.generate_meta(alg.output_funcname, params) text += "\n\n" diff --git a/setup.py b/setup.py index 71ce9a5..39846fa 100644 --- a/setup.py +++ b/setup.py @@ -1,36 +1,36 @@ import setuptools with open("pypspamm/VERSION", "r") as fh: - current_version = fh.read().strip() + current_version = fh.read().strip() with open("README.md", "r") as fh: - long_description = fh.read() + long_description = fh.read() with open("requirements.txt", "r") as fh: - install_requires = [s.strip() for s in fh.readlines() if s.strip() != ''] + install_requires = [s.strip() for s in fh.readlines() if s.strip() != ""] setuptools.setup( - name="PspaMM", - version=current_version, - license="BSD-3-Clause", - author="Peter Wauligmann, Nathan Brei, Alex Puscas, David Schneller", - author_email="david.schneller@tum.de", - description="An inline assembly generator for sparse matrix multiplications", - long_description=long_description, - long_description_content_type="text/markdown", - packages=setuptools.find_packages(), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: BSD License", - "Operating System :: OS 
Independent", - ], - url="https://github.com/pspamm/pspamm", - python_requires='>=3.7', - install_requires=install_requires, - include_package_data=True, - entry_points={ - "console_scripts": [ - "pspamm-generator = pypspamm.cli:main", - ] - } + name="PspaMM", + version=current_version, + license="BSD-3-Clause", + author="Peter Wauligmann, Nathan Brei, Alex Puscas, David Schneller", + author_email="david.schneller@tum.de", + description="An inline assembly generator for sparse matrix multiplications", + long_description=long_description, + long_description_content_type="text/markdown", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + ], + url="https://github.com/pspamm/pspamm", + python_requires=">=3.7", + install_requires=install_requires, + include_package_data=True, + entry_points={ + "console_scripts": [ + "pspamm-generator = pypspamm.cli:main", + ] + }, ) diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index 3ba455b..b631f17 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -1,14 +1,19 @@ -from collections import namedtuple -import subprocess -import numpy as np +import os.path import random +import subprocess import sys -import os.path +from collections import namedtuple + +import numpy as np + from pypspamm.codegen.precision import * -BASEDIR = 'build' +BASEDIR = "build" -TestKernel = namedtuple('TestKernel', 'name precision m n k lda ldb ldc alpha beta block_sizes amtx bmtx delta') +TestKernel = namedtuple( + "TestKernel", + "name precision m n k lda ldb ldc alpha beta block_sizes amtx bmtx delta", +) head_of_testsuite = """#include #include @@ -272,12 +277,12 @@ def generateMTX(k, n, nnz, bk=1, bn=1): - random.seed(k*n + nnz) + random.seed(k * n + nnz) if k < bk: - bk = k + bk = k if n < bn: - bn = n + bn = n assert k % bk == 0 assert n % bn == 0 @@ -286,36 +291,43 @@ def 
generateMTX(k, n, nnz, bk=1, bn=1): true_nzz = nnz * bk * bn - os.makedirs(os.path.join(BASEDIR, 'mtx'), exist_ok=True) + os.makedirs(os.path.join(BASEDIR, "mtx"), exist_ok=True) - filename = os.path.join(BASEDIR, 'mtx', f'{k}-{bk}-{n}-{bn}-{nnz}.mtx') + filename = os.path.join(BASEDIR, "mtx", f"{k}-{bk}-{n}-{bn}-{nnz}.mtx") if os.path.isfile(filename): return filename - with open(filename, 'w') as f: + with open(filename, "w") as f: - f.write(f'%%MatrixMarket matrix coordinate real general\n%\n{k} {n} {true_nzz}') + f.write(f"%%MatrixMarket matrix coordinate real general\n%\n{k} {n} {true_nzz}") - zeros = set() + zeros = set() - for i in range(1, k + 1, bk): - for j in range(1, n + 1, bn): - zeros.add((i, j)) + for i in range(1, k + 1, bk): + for j in range(1, n + 1, bn): + zeros.add((i, j)) - nonzeros = random.sample(sorted(zeros), nnz) + nonzeros = random.sample(sorted(zeros), nnz) - for entry in nonzeros: - for ii in range(bk): - for jj in range(bn): - f.write('\n' + str(entry[0] + ii) + ' ' + str(entry[1] + jj) + ' ' + str(random.uniform(0.00001, 1000))) + for entry in nonzeros: + for ii in range(bk): + for jj in range(bn): + f.write( + "\n" + + str(entry[0] + ii) + + " " + + str(entry[1] + jj) + + " " + + str(random.uniform(0.00001, 1000)) + ) return filename def make(kernels, arch): os.makedirs(os.path.join(BASEDIR, arch), exist_ok=True) - f = open(os.path.join(BASEDIR, f'{arch}_testsuite.cpp'), 'w') + f = open(os.path.join(BASEDIR, f"{arch}_testsuite.cpp"), "w") f.write(head_of_testsuite) @@ -323,18 +335,29 @@ def make(kernels, arch): for kern in kernels: - arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), str(kern.ldb), - str(kern.ldc), str(kern.alpha), str(kern.beta)] + arguments = [ + "pspamm-generator", + str(kern.m), + str(kern.n), + str(kern.k), + str(kern.lda), + str(kern.ldb), + str(kern.ldc), + str(kern.alpha), + str(kern.beta), + ] if kern.amtx is not None: - arguments += ['--amtx_filename', kern.amtx] + 
arguments += ["--amtx_filename", kern.amtx] if kern.bmtx is not None: - arguments += ['--bmtx_filename', kern.bmtx] + arguments += ["--bmtx_filename", kern.bmtx] - prec = 's' if kern.precision == Precision.SINGLE else 'd' - arguments += ['--precision', prec] + prec = "s" if kern.precision == Precision.SINGLE else "d" + arguments += ["--precision", prec] - block_sizes = list(set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes)) + block_sizes = list( + set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes) + ) for bs in block_sizes: bm = bs[0] @@ -342,9 +365,9 @@ def make(kernels, arch): bk = bs[2] if arch.startswith("arm_sve"): - veclen = int(arch[7:]) if arch[7:] != '' else 128 + veclen = int(arch[7:]) if arch[7:] != "" else 128 else: - veclen = int(arch[3:]) if arch[3:] != '' else 128 + veclen = int(arch[3:]) if arch[3:] != "" else 128 assert veclen % 128 == 0 reglen = veclen // 128 v_len = (16 // kern.precision.size()) * reglen @@ -352,67 +375,113 @@ def make(kernels, arch): # ceiling division vm = -(bm // -v_len) v_size = v_len - elem128 = (16 // kern.precision.size()) + elem128 = 16 // kern.precision.size() if arch.startswith("knl"): - if not ((bn+bk) * vm <= 32): - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue + if not ((bn + bk) * vm <= 32): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue elif arch.startswith("hsw"): - if not ((bn+bk) * vm + bn * bk <= 16) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue + if ( + not ((bn + bk) * vm + bn * bk <= 16) + or not (kern.m % v_size) == 0 + or not (bm % v_size) == 0 + ): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue elif arch.startswith("arm_sve"): - vkext = -(bk // -elem128) - isvkext = bn*vkext <= 16 if elem128 == 2 else bn*vkext <= 8 - vk = vkext if isvkext else bk - if not ((bn+bk) * vm + bn * vk <= 
32): - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue + vkext = -(bk // -elem128) + isvkext = bn * vkext <= 16 if elem128 == 2 else bn * vkext <= 8 + vk = vkext if isvkext else bk + if not ((bn + bk) * vm + bn * vk <= 32): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue elif arch.startswith("arm"): - vk = -(bk // -elem128) - if not ((bn+bk) * vm + bn * vk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue + vk = -(bk // -elem128) + if ( + not ((bn + bk) * vm + bn * vk <= 32) + or not (kern.m % v_size) == 0 + or not (bm % v_size) == 0 + ): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue elif arch.startswith("rvv"): - if not ((bn+bk) * vm <= 32) or not (bn*bk <= 30) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue + if ( + not ((bn + bk) * vm <= 32) + or not (bn * bk <= 30) + or not (kern.m % v_size) == 0 + or not (bm % v_size) == 0 + ): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue elif arch.startswith("lsx") or arch.startswith("lasx"): - if not ((bn+bk) * vm + bn * bk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue - - name = f'{kern.name}_{kern.precision}_{bm}_{bn}_{bk}' - - additional_args = ['--output_funcname', name, '--output_filename', os.path.join(BASEDIR, arch, name + '.h'), - '--output_overwrite'] - additional_args += ['--bm', str(bm), '--bn', str(bn), '--bk', str(bk), '--arch', arch] - additional_args += ['--prefetching', 'BL2viaC'] + if ( + not ((bn + bk) * vm + bn * bk <= 32) + or not (kern.m % v_size) == 0 + or not (bm % v_size) == 0 + ): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue + + name = 
f"{kern.name}_{kern.precision}_{bm}_{bn}_{bk}" + + additional_args = [ + "--output_funcname", + name, + "--output_filename", + os.path.join(BASEDIR, arch, name + ".h"), + "--output_overwrite", + ] + additional_args += [ + "--bm", + str(bm), + "--bn", + str(bn), + "--bk", + str(bk), + "--arch", + arch, + ] + additional_args += ["--prefetching", "BL2viaC"] try: - print(' '.join(arguments + additional_args)) - subprocess.check_output(arguments + additional_args, stderr=subprocess.STDOUT) + print(" ".join(arguments + additional_args)) + subprocess.check_output( + arguments + additional_args, stderr=subprocess.STDOUT + ) except subprocess.CalledProcessError as e: - raise RuntimeError(f"The command\n{' '.join(e.cmd)}\n returned with an error (code {e.returncode}):\n{e.output.decode('utf-8')}") + raise RuntimeError( + f"The command\n{' '.join(e.cmd)}\n returned with an error (code {e.returncode}):\n{e.output.decode('utf-8')}" + ) - f.write('#include "' + arch + '/' + name + '.h"\n') + f.write('#include "' + arch + "/" + name + '.h"\n') testcases += [ - setup_single_testcase.format( - m=kern.m, n=kern.n, k=kern.k, lda=kern.lda, ldb=kern.ldb, ldc=kern.ldc, alpha=kern.alpha, - beta=kern.beta, delta=kern.delta, name=name, - amtx=kern.amtx or '', bmtx = kern.bmtx or '', - asparse=1 if kern.lda == 0 else 0, bsparse=3 if kern.ldb == 0 else 2, - precision=kern.precision.ctype()) + setup_single_testcase.format( + m=kern.m, + n=kern.n, + k=kern.k, + lda=kern.lda, + ldb=kern.ldb, + ldc=kern.ldc, + alpha=kern.alpha, + beta=kern.beta, + delta=kern.delta, + name=name, + amtx=kern.amtx or "", + bmtx=kern.bmtx or "", + asparse=1 if kern.lda == 0 else 0, + bsparse=3 if kern.ldb == 0 else 2, + precision=kern.precision.ctype(), + ) ] - f.write('\n') + f.write("\n") f.write(function_definitions) f.write(setup_main.format(arch=arch)) for testcase in testcases: - f.write(testcase) + f.write(testcase) f.write(end_of_testsuite) diff --git a/tests/unit_test.py b/tests/unit_test.py index 
12575d3..4247a1e 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -1,30 +1,40 @@ #!/usr/bin/env python3 -import testsuite_generator as generator +import random +import re +import sys from importlib import import_module -from pypspamm.codegen.precision import * +import testsuite_generator as generator -import sys -import re -import random +from pypspamm.codegen.precision import * arch = sys.argv[1] -parsedarch = re.fullmatch(r'(?P[a-zA-Z_]+)(?P\d+)', arch) +parsedarch = re.fullmatch(r"(?P[a-zA-Z_]+)(?P\d+)", arch) -archname = parsedarch.group('name') -archprec = parsedarch.group('prec') +archname = parsedarch.group("name") +archprec = parsedarch.group("prec") blocksize = import_module("pypspamm.codegen.architectures." + archname + ".blocksize") scripts = { - "arm": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxK, blocksize.Cube], + "arm": lambda blocksize: [ + blocksize.Old, + blocksize.Max, + blocksize.MaxK, + blocksize.Cube, + ], "arm_sve": lambda blocksize: [blocksize.Max, blocksize.MaxK, blocksize.Cube], - "knl": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxBn, blocksize.CubeBn], + "knl": lambda blocksize: [ + blocksize.Old, + blocksize.Max, + blocksize.MaxBn, + blocksize.CubeBn, + ], "hsw": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.Cube], "rvv": lambda blocksize: [blocksize.MaxBn, blocksize.CubeBn], - "lsx": lambda blocksize: [blocksize.Max] + "lsx": lambda blocksize: [blocksize.Max], } blocksize_algs = scripts[archname](blocksize) + [blocksize.Default] @@ -36,128 +46,1838 @@ # define the maximum allowed difference between elements of our solution and the reference solution for # double and single precision delta_hp = 1e-2 -delta_sp = 1e-4 # epsilon is around e-7 => /2 ... For most cases, 1e-6 is enough -delta_dp = 1e-6 # epsilon is around e-15 => /2 +delta_sp = 1e-4 # epsilon is around e-7 => /2 ... 
For most cases, 1e-6 is enough +delta_dp = 1e-6 # epsilon is around e-15 => /2 kernels = [] for precision, delta in zip((Precision.SINGLE, Precision.DOUBLE), (delta_sp, delta_dp)): v_size = v_size_fun(precision) - kernels.append(generator.TestKernel("test0m", precision, 8, 8, 8, 8, 8, 8, -1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - - kernels.append(generator.TestKernel("test0dv", precision, 8, 8, 8, 8, 8, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("test0bspv", precision, 8, 8, 8, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(8, 8, 8, 1, 1), delta)) - kernels.append(generator.TestKernel("test0aspv", precision, 8, 8, 8, 0, 8, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(8, 8, 8, v_size, 1), None, delta)) - kernels.append(generator.TestKernel("test0abspv", precision, 8, 8, 8, 0, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(8, 8, 8, v_size, 1), generator.generateMTX(8, 8, 8, 1, 1), delta)) - - kernels.append(generator.TestKernel("test1dv", precision, 64, 8, 56, 64, 56, 64, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("test1bspv", precision, 64, 8, 56, 64, 0, 64, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(56, 8, 20, 1, 1), delta)) - kernels.append(generator.TestKernel("test1aspv", precision, 64, 8, 56, 0, 56, 64, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(64, 56, 30, 
v_size, 1), None, delta)) - kernels.append(generator.TestKernel("test1abspv", precision, 64, 8, 56, 0, 0, 64, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(64, 56, 30, v_size, 1), generator.generateMTX(56, 8, 20, 1, 1), delta)) - - kernels.append(generator.TestKernel("testlarge", precision, 40, 100, 100, 100, 100, 100, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(10, 10, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(56, 56, 30), delta)) - kernels.append(generator.TestKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 2, 1), delta)) - kernels.append(generator.TestKernel("knl_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2,2), (16,7,2)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(40, 40, 20), delta)) - - kernels.append(generator.TestKernel("knl_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 2, 2), delta)) - kernels.append(generator.TestKernel("knl_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20,2), (24,3,2)] + [x.getBlocksize(8, 20, 2, v_size, 
precision) for x in blocksize_algs], None, generator.generateMTX(10, 20, 1), delta)) - kernels.append(generator.TestKernel("knl_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2,2), (8,14,2)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(10, 5, 1), delta)) - kernels.append(generator.TestKernel("knl_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1,2)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 1, 1), delta)) - kernels.append(generator.TestKernel("knl_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24,2), (8,1,2)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(40, 24, 1), delta)) - - kernels.append(generator.TestKernel("knl_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2,2), (16,7,2)] + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28,2)], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20,2), (8,3,2)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2,2), (8,14,2)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1,2)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - 
kernels.append(generator.TestKernel("knl_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24,2)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - - kernels.append(generator.TestKernel("hswtest1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(56, 56, 30), delta)) - kernels.append(generator.TestKernel("hswtest2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hswtest3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 2, 1), delta)) - kernels.append(generator.TestKernel("hsw_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(40, 40, 20), delta)) - - kernels.append(generator.TestKernel("hsw_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 2, 2), delta)) - kernels.append(generator.TestKernel("hsw_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(10, 20, 1), delta)) - kernels.append(generator.TestKernel("hsw_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(10, 5, 1), delta)) - 
kernels.append(generator.TestKernel("hsw_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 1, 1), delta)) - kernels.append(generator.TestKernel("hsw_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(40, 24, 1), delta)) - - kernels.append(generator.TestKernel("hsw_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - - kernels.append(generator.TestKernel("itest4", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4), (4,4,2), (4,4,4), (4,4,8)], None, None, 
delta)) - - kernels.append(generator.TestKernel("itest1", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(56, 56, 30), delta)) - kernels.append(generator.TestKernel("itest2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("itest3", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - - kernels.append(generator.TestKernel("arm_only_test1", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(4, 3, 5), delta)) - kernels.append(generator.TestKernel("arm_only_test2", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(4, 3, 5), delta)) - kernels.append(generator.TestKernel("arm_only_test3", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(50, 80, 294), delta)) - kernels.append(generator.TestKernel("arm_only_test4", precision, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(32, 32, 24), delta)) - kernels.append(generator.TestKernel("arm_only_test5", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 1, 1), delta)) - kernels.append(generator.TestKernel("arm_only_test6", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs], None, 
generator.generateMTX(2, 2, 1), delta)) - kernels.append(generator.TestKernel("arm_only_test7", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(7, 5, 35), delta)) - - kernels.append(generator.TestKernel("arm_only_test8", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test9", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test10", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test11", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test12", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test13", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test14", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - - kernels.append(generator.TestKernel("sve_mixed_test1", precision, 9, 9, 9, 9, 9, 9, 1.0, 0.0, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_mixed_test2", precision, 9, 9, 9, 9, 
0, 9, 4.0, 2.5, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(9, 9, 20), delta_dp)) - kernels.append(generator.TestKernel("sve_mixed_test3", precision, 18, 18, 18, 18, 0, 18, 3.4, -2.5, [(1, 1), (3, 3), (6, 6)] + [x.getBlocksize(18, 18, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(18, 18, 59), delta_dp)) - kernels.append(generator.TestKernel("sve_mixed_test4", precision, 80, 80, 80, 80, 0, 80, 0.0, -2.5, [(4, 4), (8, 8)] + [x.getBlocksize(80, 80, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(80, 80, 312), delta_dp)) - kernels.append(generator.TestKernel("sve_mixed_test5", precision, 8, 8, 8, 10, 0, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(8, 8, 6), delta_dp)) - kernels.append(generator.TestKernel("sve_mixed_test6", precision, 8, 8, 8, 10, 8, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - - kernels.append(generator.TestKernel("sve_test4", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], None, None, delta_dp)) - - kernels.append(generator.TestKernel("sve_test1", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(56, 56, 30), delta_dp)) - kernels.append(generator.TestKernel("sve_test2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_test3", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - - kernels.append(generator.TestKernel("sve_arm_only_test1", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + 
[x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(4, 3, 5), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test2", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(4, 3, 5), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test3", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(50, 80, 294), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test4", precision, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(32, 32, 24), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test5", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 1, 1), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test6", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(2, 2, 1), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test7", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(7, 5, 35), delta_dp)) - - kernels.append(generator.TestKernel("sve_arm_only_test8", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test9", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - 
kernels.append(generator.TestKernel("sve_arm_only_test10", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test11", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test12", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test13", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test14", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - - kernels.append(generator.TestKernel("sve_arm_only_test15", precision, 23, 29, 31, 23, 31, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test16", precision, 23, 29, 31, 23, 0, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(31, 29, 61), delta_dp)) - - kernels.append(generator.TestKernel("sve_single_prec_test_S1", precision, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], None, None, delta_sp)) - kernels.append(generator.TestKernel("sve_single_prec_test_S2", precision, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs], None, None, delta_sp)) - kernels.append(generator.TestKernel("sve_single_prec_test_S3", precision, 23, 23, 23, 23, 23, 
23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs], None, None, delta_sp)) - kernels.append(generator.TestKernel("sve_single_prec_test_S4", precision, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs], None, None, delta_sp)) - kernels.append(generator.TestKernel("sve_single_prec_test_S5", precision, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(9, 9, 8), delta_sp)) - kernels.append(generator.TestKernel("sve_single_prec_test_S6", precision, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(15, 15, 22), delta_sp)) - kernels.append(generator.TestKernel("sve_single_prec_test_S7", precision, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(23, 23, 52), delta_sp)) - kernels.append(generator.TestKernel("sve_single_prec_test_S8", precision, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(13, 31, 40), delta_sp)) + kernels.append( + generator.TestKernel( + "test0m", + precision, + 8, + 8, + 8, + 8, + 8, + 8, + -1.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "test0dv", + precision, + 8, + 8, + 8, + 8, + 8, + 8, + 2.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "test0bspv", + precision, + 8, + 8, + 8, + 8, + 0, + 8, + 2.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(8, 8, 8, 1, 1), + delta, + ) + ) + 
kernels.append( + generator.TestKernel( + "test0aspv", + precision, + 8, + 8, + 8, + 0, + 8, + 8, + 2.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], + generator.generateMTX(8, 8, 8, v_size, 1), + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "test0abspv", + precision, + 8, + 8, + 8, + 0, + 0, + 8, + 2.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], + generator.generateMTX(8, 8, 8, v_size, 1), + generator.generateMTX(8, 8, 8, 1, 1), + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "test1dv", + precision, + 64, + 8, + 56, + 64, + 56, + 64, + 2.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "test1bspv", + precision, + 64, + 8, + 56, + 64, + 0, + 64, + 2.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(56, 8, 20, 1, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "test1aspv", + precision, + 64, + 8, + 56, + 0, + 56, + 64, + 2.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], + generator.generateMTX(64, 56, 30, v_size, 1), + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "test1abspv", + precision, + 64, + 8, + 56, + 0, + 0, + 64, + 2.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], + generator.generateMTX(64, 56, 30, v_size, 1), + generator.generateMTX(56, 8, 20, 1, 1), + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "testlarge", + precision, + 40, + 100, + 100, + 100, + 100, + 100, + 2.5, + 1.0, + [(8, 5), (8, 2)] + + [x.getBlocksize(10, 10, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + 
generator.TestKernel( + "test1", + precision, + 8, + 56, + 56, + 8, + 0, + 8, + 2.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(56, 56, 30), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "test2", + precision, + 8, + 40, + 40, + 8, + 40, + 8, + 2.5, + 1.0, + [(8, 5), (8, 2)] + + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "test3", + precision, + 8, + 56, + 56, + 8, + 56, + 8, + 1.0, + 5.0, + [(8, 3), (8, 5)] + + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test1", + precision, + 8, + 2, + 1, + 8, + 0, + 8, + 1.0, + 0.0, + [(8, 1, 2)] + + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(1, 2, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test2", + precision, + 24, + 40, + 40, + 32, + 0, + 24, + 1000, + 1.0, + [(8, 2, 2), (16, 7, 2)] + + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(40, 40, 20), + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "knl_only_test3", + precision, + 8, + 2, + 1, + 8, + 0, + 16, + -2.0, + 0.0, + [(8, 1, 2)] + + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(1, 2, 2), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test4", + precision, + 24, + 20, + 10, + 40, + 0, + 24, + 35.222, + 0.0, + [(8, 20, 2), (24, 3, 2)] + + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(10, 20, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test5", + precision, + 64, + 5, + 10, + 64, + 0, + 64, + 2.3, + 0.0, + [(32, 2, 2), (8, 14, 2)] + + 
[x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(10, 5, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test6", + precision, + 8, + 1, + 1, + 16, + 0, + 56, + 1.0, + 0.0, + [(8, 1, 2)] + + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(1, 1, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test7", + precision, + 8, + 24, + 40, + 8, + 0, + 8, + 1.0, + 333333.2222222, + [(8, 24, 2), (8, 1, 2)] + + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(40, 24, 1), + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "knl_only_test8", + precision, + 8, + 2, + 1, + 8, + 1, + 8, + 2.5, + 0.0, + [(8, 1, 2)] + + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test9", + precision, + 32, + 40, + 40, + 32, + 60, + 32, + 2.0, + -4.33, + [(8, 2, 2), (16, 7, 2)] + + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test10", + precision, + 56, + 28, + 56, + 56, + 56, + 56, + 0.1, + 3.0, + [(8, 28, 2)], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test11", + precision, + 8, + 20, + 8, + 40, + 10, + 8, + 234234.123123, + 0.0, + [(8, 20, 2), (8, 3, 2)] + + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test12", + precision, + 64, + 5, + 10, + 64, + 12, + 64, + 1.0, + 1.0, + [(32, 2, 2), (8, 14, 2)] + + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test13", + precision, + 8, + 1, + 1, + 16, + 
1, + 56, + 0.0, + 123.0, + [(8, 1, 2)] + + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "knl_only_test14", + precision, + 8, + 24, + 40, + 8, + 41, + 8, + 2.0, + 1.0, + [(8, 24, 2)] + + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "hswtest1", + precision, + 8, + 56, + 56, + 8, + 0, + 8, + 2.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(56, 56, 30), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hswtest2", + precision, + 8, + 40, + 40, + 8, + 40, + 8, + 2.5, + 1.0, + [(8, 2)] + + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hswtest3", + precision, + 8, + 56, + 56, + 8, + 56, + 8, + 1.0, + 5.0, + [(8, 3)] + + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hsw_only_test1", + precision, + 8, + 2, + 1, + 8, + 0, + 8, + 1.0, + 0.0, + [(8, 1)] + + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(1, 2, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hsw_only_test2", + precision, + 24, + 40, + 40, + 32, + 0, + 24, + 1000, + 1.0, + [(8, 2)] + + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(40, 40, 20), + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "hsw_only_test3", + precision, + 8, + 2, + 1, + 8, + 0, + 16, + -2.0, + 0.0, + [(8, 1)] + + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(1, 2, 2), + delta, + ) + ) + kernels.append( + generator.TestKernel( + 
"hsw_only_test4", + precision, + 24, + 20, + 10, + 40, + 0, + 24, + 35.222, + 0.0, + [] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(10, 20, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hsw_only_test5", + precision, + 64, + 5, + 10, + 64, + 0, + 64, + 2.3, + 0.0, + [] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(10, 5, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hsw_only_test6", + precision, + 8, + 1, + 1, + 16, + 0, + 56, + 1.0, + 0.0, + [(8, 1)] + + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(1, 1, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hsw_only_test7", + precision, + 8, + 24, + 40, + 8, + 0, + 8, + 1.0, + 333333.2222222, + [(8, 1)] + + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(40, 24, 1), + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "hsw_only_test8", + precision, + 8, + 2, + 1, + 8, + 1, + 8, + 2.5, + 0.0, + [(8, 1)] + + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hsw_only_test9", + precision, + 32, + 40, + 40, + 32, + 60, + 32, + 2.0, + -4.33, + [(8, 2)] + + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hsw_only_test10", + precision, + 56, + 28, + 56, + 56, + 56, + 56, + 0.1, + 3.0, + [x.getBlocksize(56, 28, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hsw_only_test11", + precision, + 8, + 20, + 8, + 40, + 10, + 8, + 234234.123123, + 0.0, + [(8, 3)] + + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + 
kernels.append( + generator.TestKernel( + "hsw_only_test12", + precision, + 64, + 5, + 10, + 64, + 12, + 64, + 1.0, + 1.0, + [] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hsw_only_test13", + precision, + 8, + 1, + 1, + 16, + 1, + 56, + 0.0, + 123.0, + [(8, 1)] + + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "hsw_only_test14", + precision, + 8, + 24, + 40, + 8, + 41, + 8, + 2.0, + 1.0, + [] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "itest4", + precision, + 4, + 4, + 4, + 4, + 4, + 4, + 2.0, + 2.0, + [(4, 4), (4, 4, 2), (4, 4, 4), (4, 4, 8)], + None, + None, + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "itest1", + precision, + 8, + 56, + 56, + 8, + 0, + 8, + 1.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(56, 56, 30), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "itest2", + precision, + 8, + 40, + 40, + 8, + 40, + 8, + 3.0, + 2.0, + [(8, 5), (8, 2)] + + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "itest3", + precision, + 8, + 56, + 56, + 8, + 56, + 8, + 0.0, + 0.0, + [(8, 3), (8, 5)] + + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "arm_only_test1", + precision, + 2, + 3, + 4, + 2, + 0, + 2, + 1.1233, + 0.0, + [(2, 1), (2, 3)] + + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(4, 3, 5), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test2", + 
precision, + 2, + 3, + 4, + 20, + 0, + 14, + 1.0, + 1.0, + [(2, 2), (2, 3)] + + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(4, 3, 5), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test3", + precision, + 32, + 80, + 50, + 32, + 0, + 32, + 1.0, + 3.0, + [(8, 5)] + + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(50, 80, 294), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test4", + precision, + 32, + 32, + 32, + 34, + 0, + 32, + 1.0, + 0.0, + [(4, 4), (4, 3)] + + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(32, 32, 24), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test5", + precision, + 2, + 1, + 1, + 2, + 0, + 8, + 1.0, + -1.0, + [(2, 1)] + + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(1, 1, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test6", + precision, + 2, + 2, + 2, + 2, + 0, + 2, + 2.0, + 234234.123, + [(2, 1)] + + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(2, 2, 1), + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test7", + precision, + 16, + 5, + 7, + 16, + 0, + 16, + 0.0, + -1.123, + [(8, 1), (8, 2)] + + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(7, 5, 35), + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "arm_only_test8", + precision, + 2, + 3, + 4, + 2, + 4, + 2, + 1.0, + 0.0, + [(2, 1), (2, 3)] + + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test9", + precision, + 2, + 3, + 4, + 20, + 12, + 14, + 2.0, + 1.123, + [(2, 2), (2, 3)] + + [x.getBlocksize(2, 3, 1, 
v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test10", + precision, + 32, + 80, + 50, + 32, + 50, + 32, + 0.0, + 0.2, + [(8, 5)] + + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test11", + precision, + 32, + 32, + 32, + 33, + 68, + 32, + 1231.0, + 14443.0, + [(4, 4), (4, 3)] + + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test12", + precision, + 2, + 1, + 1, + 2, + 1, + 8, + 1.0, + 3.0, + [(2, 1)] + + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test13", + precision, + 2, + 3, + 3, + 2, + 3, + 2, + 1.0, + 0.0, + [(2, 1)] + + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + kernels.append( + generator.TestKernel( + "arm_only_test14", + precision, + 16, + 5, + 7, + 16, + 7, + 16, + 1.0, + 1.0, + [(8, 1), (8, 2)] + + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta, + ) + ) + + kernels.append( + generator.TestKernel( + "sve_mixed_test1", + precision, + 9, + 9, + 9, + 9, + 9, + 9, + 1.0, + 0.0, + [(3, 3)] + + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_mixed_test2", + precision, + 9, + 9, + 9, + 9, + 0, + 9, + 4.0, + 2.5, + [(3, 3)] + + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(9, 9, 20), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_mixed_test3", + precision, + 18, + 18, + 18, + 18, + 0, + 18, + 3.4, + -2.5, + [(1, 1), (3, 3), (6, 6)] + + [x.getBlocksize(18, 
18, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(18, 18, 59), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_mixed_test4", + precision, + 80, + 80, + 80, + 80, + 0, + 80, + 0.0, + -2.5, + [(4, 4), (8, 8)] + + [x.getBlocksize(80, 80, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(80, 80, 312), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_mixed_test5", + precision, + 8, + 8, + 8, + 10, + 0, + 8, + 3.0, + -0.9, + [(2, 2), (4, 4)] + + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(8, 8, 6), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_mixed_test6", + precision, + 8, + 8, + 8, + 10, + 8, + 8, + 3.0, + -0.9, + [(2, 2), (4, 4)] + + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + + kernels.append( + generator.TestKernel( + "sve_test4", + precision, + 4, + 4, + 4, + 4, + 4, + 4, + 2.0, + 2.0, + [(4, 4)], + None, + None, + delta_dp, + ) + ) + + kernels.append( + generator.TestKernel( + "sve_test1", + precision, + 8, + 56, + 56, + 8, + 0, + 8, + 1.0, + 0.0, + [(8, 4), (8, 1)] + + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(56, 56, 30), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_test2", + precision, + 8, + 40, + 40, + 8, + 40, + 8, + 3.0, + 2.0, + [(8, 5), (8, 2)] + + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_test3", + precision, + 8, + 56, + 56, + 8, + 56, + 8, + 0.0, + 0.0, + [(8, 3), (8, 5)] + + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + + kernels.append( + generator.TestKernel( + "sve_arm_only_test1", + precision, + 2, + 3, + 4, + 2, + 0, + 2, + 1.1233, + 0.0, + 
[(2, 1), (2, 3)] + + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(4, 3, 5), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test2", + precision, + 2, + 3, + 4, + 20, + 0, + 14, + 1.0, + 1.0, + [(2, 2), (2, 3)] + + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(4, 3, 5), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test3", + precision, + 32, + 80, + 50, + 32, + 0, + 32, + 1.0, + 3.0, + [(8, 5)] + + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(50, 80, 294), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test4", + precision, + 32, + 32, + 32, + 34, + 0, + 32, + 1.0, + 0.0, + [(4, 4), (4, 3)] + + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(32, 32, 24), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test5", + precision, + 2, + 1, + 1, + 2, + 0, + 8, + 1.0, + -1.0, + [(2, 1)] + + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(1, 1, 1), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test6", + precision, + 2, + 2, + 2, + 2, + 0, + 2, + 2.0, + 234234.123, + [(2, 1)] + + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(2, 2, 1), + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test7", + precision, + 16, + 5, + 7, + 16, + 0, + 16, + 0.0, + -1.123, + [(8, 1), (8, 2)] + + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(7, 5, 35), + delta_dp, + ) + ) + + kernels.append( + generator.TestKernel( + "sve_arm_only_test8", + precision, + 2, + 3, + 4, + 2, + 4, + 2, + 1.0, + 0.0, + [(2, 1), (2, 3)] + + 
[x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test9", + precision, + 2, + 3, + 4, + 20, + 12, + 14, + 2.0, + 1.123, + [(2, 2), (2, 3)] + + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test10", + precision, + 32, + 80, + 50, + 32, + 50, + 32, + 0.0, + 0.2, + [(8, 5)] + + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test11", + precision, + 32, + 32, + 32, + 33, + 68, + 32, + 1231.0, + 14443.0, + [(4, 4), (4, 3)] + + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test12", + precision, + 2, + 1, + 1, + 2, + 1, + 8, + 1.0, + 3.0, + [(2, 1)] + + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test13", + precision, + 2, + 3, + 3, + 2, + 3, + 2, + 1.0, + 0.0, + [(2, 1)] + + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test14", + precision, + 16, + 5, + 7, + 16, + 7, + 16, + 1.0, + 1.0, + [(8, 1), (8, 2)] + + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + + kernels.append( + generator.TestKernel( + "sve_arm_only_test15", + precision, + 23, + 29, + 31, + 23, + 31, + 23, + 1.32, + 0.96, + [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_dp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_arm_only_test16", + precision, + 23, + 29, + 31, + 23, + 0, + 23, + 1.32, + 
0.96, + [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(31, 29, 61), + delta_dp, + ) + ) + + kernels.append( + generator.TestKernel( + "sve_single_prec_test_S1", + precision, + 9, + 9, + 9, + 9, + 9, + 9, + 1.24, + 0.87, + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_sp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_single_prec_test_S2", + precision, + 15, + 15, + 15, + 15, + 15, + 15, + -3.14, + 6.28, + [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_sp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_single_prec_test_S3", + precision, + 23, + 23, + 23, + 23, + 23, + 23, + 1.5, + -0.66, + [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_sp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_single_prec_test_S4", + precision, + 23, + 31, + 13, + 23, + 13, + 23, + 2.0, + 0.0, + [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs], + None, + None, + delta_sp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_single_prec_test_S5", + precision, + 9, + 9, + 9, + 9, + 0, + 9, + 1.24, + 0.87, + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(9, 9, 8), + delta_sp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_single_prec_test_S6", + precision, + 15, + 15, + 15, + 15, + 0, + 15, + -3.14, + 6.28, + [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(15, 15, 22), + delta_sp, + ) + ) + kernels.append( + generator.TestKernel( + "sve_single_prec_test_S7", + precision, + 23, + 23, + 23, + 23, + 0, + 23, + 1.5, + -0.66, + [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(23, 23, 52), + delta_sp, + ) + ) + kernels.append( + generator.TestKernel( + 
"sve_single_prec_test_S8", + precision, + 23, + 31, + 13, + 23, + 0, + 23, + 2.0, + 0.0, + [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs], + None, + generator.generateMTX(13, 31, 40), + delta_sp, + ) + ) generator.make(kernels, arch) From e286f90b04bfe83c9874bd99400eb0789816770f Mon Sep 17 00:00:00 2001 From: David Schneller Date: Tue, 29 Jul 2025 19:18:25 +0200 Subject: [PATCH 4/9] Correct some install bugs --- pypspamm/__init__.py | 2 +- pypspamm/codegen/__init__.py | 0 pypspamm/codegen/architectures/arm/__init__.py | 0 pypspamm/codegen/architectures/arm_sve/__init__.py | 0 pypspamm/codegen/architectures/hsw/__init__.py | 0 pypspamm/codegen/architectures/knl/__init__.py | 0 pypspamm/codegen/architectures/lsx/__init__.py | 0 pypspamm/codegen/architectures/rvv/__init__.py | 0 pypspamm/cursors/densecursor.py | 3 ++- pypspamm/metagen/__init__.py | 0 requirements.txt | 2 -- setup.py | 4 ++-- 12 files changed, 5 insertions(+), 6 deletions(-) create mode 100644 pypspamm/codegen/__init__.py create mode 100644 pypspamm/codegen/architectures/arm/__init__.py create mode 100644 pypspamm/codegen/architectures/arm_sve/__init__.py create mode 100644 pypspamm/codegen/architectures/hsw/__init__.py create mode 100644 pypspamm/codegen/architectures/knl/__init__.py create mode 100644 pypspamm/codegen/architectures/lsx/__init__.py create mode 100644 pypspamm/codegen/architectures/rvv/__init__.py create mode 100644 pypspamm/metagen/__init__.py diff --git a/pypspamm/__init__.py b/pypspamm/__init__.py index b802516..2aae60a 100644 --- a/pypspamm/__init__.py +++ b/pypspamm/__init__.py @@ -1 +1 @@ -from pspamm import * +from pypspamm import * diff --git a/pypspamm/codegen/__init__.py b/pypspamm/codegen/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pypspamm/codegen/architectures/arm/__init__.py b/pypspamm/codegen/architectures/arm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/pypspamm/codegen/architectures/arm_sve/__init__.py b/pypspamm/codegen/architectures/arm_sve/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pypspamm/codegen/architectures/hsw/__init__.py b/pypspamm/codegen/architectures/hsw/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pypspamm/codegen/architectures/knl/__init__.py b/pypspamm/codegen/architectures/knl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pypspamm/codegen/architectures/lsx/__init__.py b/pypspamm/codegen/architectures/lsx/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pypspamm/codegen/architectures/rvv/__init__.py b/pypspamm/codegen/architectures/rvv/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pypspamm/cursors/densecursor.py b/pypspamm/cursors/densecursor.py index eefa3ba..a0cc44d 100644 --- a/pypspamm/cursors/densecursor.py +++ b/pypspamm/cursors/densecursor.py @@ -1,7 +1,8 @@ from typing import List, Tuple, cast from pypspamm.codegen.sugar import * -from pypspamm.cursors import * +from pypspamm.cursors.matrix import * +from pypspamm.cursors.abstractcursor import * class DenseCursor(Cursor): diff --git a/pypspamm/metagen/__init__.py b/pypspamm/metagen/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index 2a378b7..4562c82 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,2 @@ numpy>=1.14.0 scipy>=1.0.0 -setuptools>=61.0.0 -wheel diff --git a/setup.py b/setup.py index 39846fa..e936f71 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ install_requires = [s.strip() for s in fh.readlines() if s.strip() != ""] setuptools.setup( - name="PspaMM", + name="PSpaMM", version=current_version, license="BSD-3-Clause", author="Peter Wauligmann, Nathan Brei, Alex Puscas, David Schneller", @@ -24,7 +24,7 @@ "License :: OSI Approved :: BSD License", "Operating System :: OS Independent", ], - url="https://github.com/pspamm/pspamm", + 
url="https://github.com/seissol/pspamm", python_requires=">=3.7", install_requires=install_requires, include_package_data=True, From a37e78f47e61c6a269b527fd206c5b5550b7693b Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 27 Sep 2025 16:44:21 +0200 Subject: [PATCH 5/9] Fix most pre-commit issues --- pypspamm/architecture.py | 0 .../codegen/architectures/arm/generator.py | 2 +- .../architectures/arm_sve/generator.py | 2 +- .../codegen/architectures/rvv/generator.py | 2 +- pypspamm/cursors/densecursor.py | 2 +- tests/README.md | 88 +++++++++++++------ tests/testsuite_generator.py | 10 +-- tests/unit_test.py | 0 8 files changed, 69 insertions(+), 37 deletions(-) mode change 100755 => 100644 pypspamm/architecture.py mode change 100755 => 100644 tests/testsuite_generator.py mode change 100644 => 100755 tests/unit_test.py diff --git a/pypspamm/architecture.py b/pypspamm/architecture.py old mode 100755 new mode 100644 diff --git a/pypspamm/codegen/architectures/arm/generator.py b/pypspamm/codegen/architectures/arm/generator.py index bd81201..4986fca 100644 --- a/pypspamm/codegen/architectures/arm/generator.py +++ b/pypspamm/codegen/architectures/arm/generator.py @@ -13,7 +13,7 @@ class Generator(AbstractGenerator): __asm__ __volatile__( {body_text} : : {args} : {clobbered}); - + #ifndef NDEBUG #ifdef _OPENMP #pragma omp atomic diff --git a/pypspamm/codegen/architectures/arm_sve/generator.py b/pypspamm/codegen/architectures/arm_sve/generator.py index 5e76da7..eba412e 100644 --- a/pypspamm/codegen/architectures/arm_sve/generator.py +++ b/pypspamm/codegen/architectures/arm_sve/generator.py @@ -13,7 +13,7 @@ class Generator(AbstractGenerator): {init_registers} {body_text} : : {args} : {clobbered}); - + #ifndef NDEBUG #ifdef _OPENMP #pragma omp atomic diff --git a/pypspamm/codegen/architectures/rvv/generator.py b/pypspamm/codegen/architectures/rvv/generator.py index 8bb3124..f341e1f 100644 --- a/pypspamm/codegen/architectures/rvv/generator.py +++ 
b/pypspamm/codegen/architectures/rvv/generator.py @@ -12,7 +12,7 @@ class Generator(AbstractGenerator): __asm__ __volatile__( {body_text} : : {args} : {clobbered}); - + #ifndef NDEBUG #ifdef _OPENMP #pragma omp atomic diff --git a/pypspamm/cursors/densecursor.py b/pypspamm/cursors/densecursor.py index a0cc44d..7a800ab 100644 --- a/pypspamm/cursors/densecursor.py +++ b/pypspamm/cursors/densecursor.py @@ -1,8 +1,8 @@ from typing import List, Tuple, cast from pypspamm.codegen.sugar import * -from pypspamm.cursors.matrix import * from pypspamm.cursors.abstractcursor import * +from pypspamm.cursors.matrix import * class DenseCursor(Cursor): diff --git a/tests/README.md b/tests/README.md index a21a9eb..deac52a 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,44 +1,69 @@ -# Guidelines on how to execute the NEON & SVE tests -### DISCLAIMER: -Some unit tests for SVE fail when including the gcc compiler flag "-mcpu=a64fx". -We assume that this flag makes the compiler optimize the unit tests in a way that breaks them. -Specifically, the values for `ldb`, `alpha`, and `beta` are sometimes set to 0 when calculating a reference solution which we compare to the solution of the PSpaMM kernel. -To fix this, the generated testsuite for Arm NEON and SVE saves certain values as variables before passing them to specific functions +# Guidelines on how to execute the NEON & SVE tests + +## DISCLAIMER + +Some unit tests for SVE fail when including the gcc +compiler flag "-mcpu=a64fx". +We assume that this flag makes the compiler optimize +the unit tests in a way that breaks them. +Specifically, the values for `ldb`, `alpha`, and `beta` +are sometimes set to 0 when calculating a reference solution +which we compare to the solution of the PSpaMM kernel. +To fix this, the generated testsuite for Arm NEON and SVE saves +certain values as variables before passing them to specific functions instead of passing them as constant values. 
+ ## Compiling with gcc -A Makefile is provided, however only NEON and SVE related unit tests can be compiled at the moment. -Naturally, other compiler flags than the ones provided may be used. -Compiling the SVE testsuite with gcc 11.0.0 seems to break some test cases. Within the provided test setup, the values of certain parameters are overwritten after -specific tests, namely ```sve_arm_only_test15_23_6.h``` and ```sve_arm_only_test16_23_6.h```. This leads to a wrong reference solution which is then compared to -the one calculated by our generated kernel. + +A Makefile is provided, however only NEON and SVE related +unit tests can be compiled at the moment. +Naturally, other compiler flags than the ones provided may be used. +Compiling the SVE testsuite with gcc 11.0.0 seems to break +some test cases. Within the provided test setup, the +values of certain parameters are overwritten after +specific tests, namely ```sve_arm_only_test15_23_6.h``` +and ```sve_arm_only_test16_23_6.h```. This leads to a +wrong reference solution which is then compared to +the one calculated by our generated kernel. Example output when using GDB: -``` + +```bash Program received signal SIGSEGV, Segmentation fault. 
-#0 0x0000000000213dc4 in post (M=7, M@entry=23, N=N@entry=29, K=K@entry=31, LDA=7, LDA@entry=23, LDB=0x3feeb851eb851eb8, LDB@entry=0xffffffffa5ec, LDC=7, LDC@entry=23, A=A@entry=0x2fa8c0, - B=B@entry=0x2fbf40, C=C@entry=0x300cc0, Cref=Cref@entry=0x2ff7c0, DELTA=DELTA@entry=9.9999999999999995e-08, BETA=, ALPHA=, BETA=, +#0 0x0000000000213dc4 in post (M=7, M@entry=23, N=N@entry=29, K=K@entry=31, LDA=7, LDA@entry=23, LDB=0x3feeb851eb851eb8, LDB@entry=0xffffffffa5ec, LDC=7, LDC@entry=23, A=A@entry=0x2fa8c0, + B=B@entry=0x2fbf40, C=C@entry=0x300cc0, Cref=Cref@entry=0x2ff7c0, DELTA=DELTA@entry=9.9999999999999995e-08, BETA=, ALPHA=, BETA=, ALPHA=) at sve_testsuite.cpp:179 #1 0x00000000002aecf4 in main () at sve_testsuite.cpp:619 ``` -Additional testing of gcc-based compilation is needed. Meanwhile, the SVE testsuite should be compiled with clang. + +Additional testing of gcc-based compilation is needed. +Meanwhile, the SVE testsuite should be compiled with clang. + ## Unit Tests -Unit tests for all 3 architectures (KNL, Arm NEON, Arm SVE) are provided. -The testsuite that corresponds to a unit test needs to be executed on the respective processor/architecture. -How to generate and execute a specific testsuite is shown below. -If nothing breaks, the generated testsuite reports the number of successful test case executions. + +Unit tests for all 3 architectures (KNL, Arm NEON, Arm SVE) are provided. +The testsuite that corresponds to a unit test needs +to be executed on the respective processor/architecture. +How to generate and execute a specific testsuite is shown below. +If nothing breaks, the generated testsuite reports +the number of successful test case executions. + ### KNL + 1. Generate the testsuite by calling ```python3 unit_tests_knl.py``` 2. Adjust the Makefile as needed and compile the generated ```testsuite.cpp``` 3. Run the compiled executable ### Arm NEON + 1. Generate the testsuite by calling ```python3 unit_tests_arm.py``` -2. 
Adjust the Makefile as needed and compile the generated ```testsuite.cpp``` by calling +2. Adjust the Makefile as needed and compile the generated ```testsuite.cpp``` by calling ```make neon_testsuite``` 3. Run the compiled executable with ```./neon_testsuite``` -### Arm SVE +### Arm SVE + 1. Generate the testsuite by calling ```python3 unit_tests_arm_sve.py``` -2. Adjust the Makefile as needed and compile the generated ```sve_testsuite.cpp``` by calling +2. Adjust the Makefile as needed and compile the generated ```sve_testsuite.cpp``` by calling ```make sve_testsuite``` 3. Run the compiled executable with ```./sve_testsuite``` @@ -47,7 +72,8 @@ If nothing breaks, the generated testsuite reports the number of successful test Run `runall-sve.sh` which tests a bunch of configurations already. For a bit length `BITLEN`, it executes the following commands: -``` + +```bash # generate tests python unit_tests_arm_sve.py $BITLEN @@ -58,14 +84,20 @@ aarch64-linux-gnu-g++ -static -march=armv8.2-a+sve -msve-vector-bits=${BITLEN} a qemu-aarch64-static -cpu max,sve${BITLEN}=on,sve-default-vector-length=-1 ./sve${BITLEN}-test ``` +For debugging, for example for vector length 512 +(cf. ): -For debugging, for example for vector length 512 (cf. https://mariokartwii.com/showthread.php?tid=1998 ): -``` +```bash aarch64-linux-gnu-g++ -g -ggdb -static -march=armv9-a+sve -msve-vector-bits=512 sve_testsuite.cpp qemu-aarch64-static -g 1234 -cpu max,sve512=on ./a.out ``` + (we use 1234 as port here, and a.out as filename) -In a separate window, run `aarch64-linux-gnu-gdb --ex "target remote localhost:1234" --ex "file a.out"`. -The extra commands already connect you with QEMU and attach you to the compiled binary file, so method names etc. are printed correctly. -To run the program, just type `continue`. You may maybe want to set up breakpoints etc. before you do that. +In a separate window, run +`aarch64-linux-gnu-gdb --ex "target remote localhost:1234" --ex "file a.out"`. 
+The extra commands already connect you with +QEMU and attach you to the compiled binary file, +so method names etc. are printed correctly. +To run the program, just type `continue`. You +may maybe want to set up breakpoints etc. before you do that. diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py old mode 100755 new mode 100644 index b631f17..9a77356 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -83,9 +83,9 @@ T* C; int resA = posix_memalign(reinterpret_cast(&A), 64, LDA*LDB*sizeof(T)); - int resAsparse = posix_memalign(reinterpret_cast(&Asparse), 64, LDA*LDB*sizeof(T)); + int resAsparse = posix_memalign(reinterpret_cast(&Asparse), 64, LDA*LDB*sizeof(T)); int resB = posix_memalign(reinterpret_cast(&B), 64, LDB*N*sizeof(T)); - int resBsparse = posix_memalign(reinterpret_cast(&Bsparse), 64, LDB*N*sizeof(T)); + int resBsparse = posix_memalign(reinterpret_cast(&Bsparse), 64, LDB*N*sizeof(T)); int resCref = posix_memalign(reinterpret_cast(&Cref), 64, LDC*N*sizeof(T)); int resC = posix_memalign(reinterpret_cast(&C), 64, LDC*N*sizeof(T)); @@ -198,13 +198,13 @@ } gemm_ref(M, N, K, *LDA, *LDB, LDC, *ALPHA, *BETA, A, B, Cref); - + double diffAbsMax = 0; double diffRelMax = 0; int failedCount = 0; for(int i = 0; i < M; i++) { for(int j = 0; j < N; j++) { - // we use the relative error instead of the absolute error because of an issue we found for sparse single precision + // we use the relative error instead of the absolute error because of an issue we found for sparse single precision // kernels presumably due to limited precision of floats const double diffAbs = std::abs((static_cast(C[i + j * LDC]) - static_cast(Cref[i + j * LDC]))); const double diffRel = diffAbs / std::abs(static_cast(Cref[i + j * LDC])); @@ -252,7 +252,7 @@ setup_prefetch(prefetch, std::get<4>(pointers), {n}, {ldc}); {name}(std::get<{asparse}>(pointers), std::get<{bsparse}>(pointers), std::get<4>(pointers), {alpha}, {beta}, prefetch); const auto result 
= post<{precision}>({m}, {n}, {k}, &lda, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<2>(pointers), std::get<4>(pointers), std::get<5>(pointers), {delta:.15e}); - + if (result) {{ ++correct; }} diff --git a/tests/unit_test.py b/tests/unit_test.py old mode 100644 new mode 100755 From fe06b10d5e84568afa4f4abd18f99132d74b4169 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 27 Sep 2025 16:47:04 +0200 Subject: [PATCH 6/9] Add a first pre-commit config --- .markdownlint.yaml | 1 + .pre-commit-config.yaml | 80 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 .markdownlint.yaml create mode 100644 .pre-commit-config.yaml diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 0000000..5b824f7 --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1 @@ +line-length: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..d2ac0c4 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: 2025 SeisSol Group +# +# SPDX-License-Identifier: BSD-3-Clause +# SPDX-LicenseComments: Full text under /LICENSE and /LICENSES/ +# +# SPDX-FileContributor: Author lists in /AUTHORS and /CITATION.cff + +--- + +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-merge-conflict + name: '[GENERIC] merge conflict check' + - id: check-symlinks + name: '[GENERIC] symlink check' + - id: destroyed-symlinks + name: '[GENERIC] detect broken symlinks' + - id: detect-private-key + name: '[GENERIC] detect private keys uploaded by accident' + - id: check-case-conflict + name: '[GENERIC] detect OS file naming case conflicts' + - id: check-executables-have-shebangs + name: '[GENERIC] check for shebangs in executable files' + - id: check-illegal-windows-names + name: '[GENERIC] detect illegal Windows file names' + - id: check-json + name: '[JSON] check' + - id: check-xml + name: '[XML] check' + - id: 
check-shebang-scripts-are-executable + name: '[GENERIC] check that shebang-containing files are executable' + +- repo: https://github.com/DavidAnson/markdownlint-cli2 + rev: v0.18.1 + hooks: + - id: markdownlint-cli2 + name: '[MARKDOWN] lint' + +#- repo: https://github.com/fsfe/reuse-tool +# rev: v5.1.1 +# hooks: +# - id: reuse +# name: '[GENERIC] REUSE compatibility' + +- repo: https://github.com/psf/black-pre-commit-mirror + rev: 25.1.0 + hooks: + - id: black + language_version: python3.13 + files: ^(?!preprocessing|postprocessing) + name: '[PYTHON] black' +- repo: https://github.com/pycqa/isort + rev: 6.0.1 + hooks: + - id: isort + files: ^(?!preprocessing|postprocessing) + args: ["--profile", "black"] + name: '[PYTHON] isort' +- repo: https://github.com/pycqa/bandit + rev: 1.8.6 + hooks: + - id: bandit + args: ["--confidence-level", "high", "--severity-level", "high"] + name: '[PYTHON] bandit' +#- repo: https://github.com/pycqa/flake8 +# rev: '7.3.0' +# hooks: +# - id: flake8 +# files: ^(?!preprocessing|postprocessing) +# name: '[PYTHON] Flake8' + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: end-of-file-fixer + name: '[GENERIC] newline eof' + - id: trailing-whitespace + name: '[GENERIC] remove trailing whitespace' From 44160720802c44ea480324083e3eb598165ef818 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 27 Sep 2025 16:48:35 +0200 Subject: [PATCH 7/9] Add a pre-commit action --- .github/workflows/pre-commit.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/pre-commit.yml diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000..5eefd96 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: 2025 SeisSol Group +# +# SPDX-License-Identifier: BSD-3-Clause +# SPDX-LicenseComments: Full text under /LICENSE and /LICENSES/ +# +# SPDX-FileContributor: Author lists 
in /AUTHORS and /CITATION.cff + +name: pre-commit +on: + - push + +jobs: + pre-commit: + name: pre-commit + runs-on: ubuntu-24.04 + steps: + - name: setup-python + uses: actions/setup-python@v6 + with: + python-version: '3.13' + + - uses: actions/checkout@v5 + + - uses: pre-commit/action@v3.0.1 From ac90f69925bb4c7ef815b82f401376379ff87c04 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sun, 28 Sep 2025 21:02:36 +0200 Subject: [PATCH 8/9] Remove the old init file --- pspamm.py | 6 ------ 1 file changed, 6 deletions(-) delete mode 100755 pspamm.py diff --git a/pspamm.py b/pspamm.py deleted file mode 100755 index bbf1583..0000000 --- a/pspamm.py +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python3 - -if __name__ == "__main__": - import pypspamm.cli - - pypspamm.cli.main() From a7e03982e5e4673936f14252617a6a7e5859afef Mon Sep 17 00:00:00 2001 From: David Schneller Date: Fri, 9 Jan 2026 11:27:37 +0100 Subject: [PATCH 9/9] Fix test readme --- tests/README.md | 88 ++++--------------------------------------------- 1 file changed, 6 insertions(+), 82 deletions(-) diff --git a/tests/README.md b/tests/README.md index deac52a..2abea55 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,90 +1,14 @@ -# Guidelines on how to execute the NEON & SVE tests +# Test Running Guidelines -## DISCLAIMER +Run `runall-sve.sh` which tests all configurations using software emulation. Note that on a processor that does not support AVX512/AVX10, you might need to comment out some tests, since there is no known emulation method for them yet. -Some unit tests for SVE fail when including the gcc -compiler flag "-mcpu=a64fx". -We assume that this flag makes the compiler optimize -the unit tests in a way that breaks them. -Specifically, the values for `ldb`, `alpha`, and `beta` -are sometimes set to 0 when calculating a reference solution -which we compare to the solution of the PSpaMM kernel. 
-To fix this, the generated testsuite for Arm NEON and SVE saves -certain values as variables before passing them to specific functions -instead of passing them as constant values. +## Running a single test -## Compiling with gcc +Use `runlocal.sh` with the PSpaMM architecture of your choice (e.g. `knl512`). The script will also automatically execute the tests; unless you give it the `norun` flag as second argument. -A Makefile is provided, however only NEON and SVE related -unit tests can be compiled at the moment. -Naturally, other compiler flags than the ones provided may be used. -Compiling the SVE testsuite with gcc 11.0.0 seems to break -some test cases. Within the provided test setup, the -values of certain parameters are overwritten after -specific tests, namely ```sve_arm_only_test15_23_6.h``` -and ```sve_arm_only_test16_23_6.h```. This leads to a -wrong reference solution which is then compared to -the one calculated by our generated kernel. -Example output when using GDB: +## Debugging -```bash -Program received signal SIGSEGV, Segmentation fault. -#0 0x0000000000213dc4 in post (M=7, M@entry=23, N=N@entry=29, K=K@entry=31, LDA=7, LDA@entry=23, LDB=0x3feeb851eb851eb8, LDB@entry=0xffffffffa5ec, LDC=7, LDC@entry=23, A=A@entry=0x2fa8c0, - B=B@entry=0x2fbf40, C=C@entry=0x300cc0, Cref=Cref@entry=0x2ff7c0, DELTA=DELTA@entry=9.9999999999999995e-08, BETA=, ALPHA=, BETA=, - ALPHA=) at sve_testsuite.cpp:179 -#1 0x00000000002aecf4 in main () at sve_testsuite.cpp:619 -``` - -Additional testing of gcc-based compilation is needed. -Meanwhile, the SVE testsuite should be compiled with clang. - -## Unit Tests - -Unit tests for all 3 architectures (KNL, Arm NEON, Arm SVE) are provided. -The testsuite that corresponds to a unit test needs -to be executed on the respective processor/architecture. -How to generate and execute a specific testsuite is shown below. -If nothing breaks, the generated testsuite reports -the number of successful test case executions. 
- -### KNL - -1. Generate the testsuite by calling ```python3 unit_tests_knl.py``` -2. Adjust the Makefile as needed and compile the generated ```testsuite.cpp``` -3. Run the compiled executable - -### Arm NEON - -1. Generate the testsuite by calling ```python3 unit_tests_arm.py``` -2. Adjust the Makefile as needed and compile the generated ```testsuite.cpp``` by calling -```make neon_testsuite``` -3. Run the compiled executable with ```./neon_testsuite``` - -### Arm SVE - -1. Generate the testsuite by calling ```python3 unit_tests_arm_sve.py``` -2. Adjust the Makefile as needed and compile the generated ```sve_testsuite.cpp``` by calling -```make sve_testsuite``` -3. Run the compiled executable with ```./sve_testsuite``` - -#### Notes Running SVE with QEMU user-static - -Run `runall-sve.sh` which tests a bunch of configurations already. - -For a bit length `BITLEN`, it executes the following commands: - -```bash -# generate tests -python unit_tests_arm_sve.py $BITLEN - -# compile: we use AVM V8.2 and SVE; the SVE vector length is set explicitly -aarch64-linux-gnu-g++ -static -march=armv8.2-a+sve -msve-vector-bits=${BITLEN} arm_sve${BITLEN}_testsuite.cpp -o sve${BITLEN}-test - -# run using QEMU, this way we may run on x86-64 as well; enable all features and constrain to sve${BITLEN} SVE registers maximum length (cf. https://qemu-project.gitlab.io/qemu/system/arm/cpu-features.html); the sve-default-vector-length=-1 parameter is needed for 1024 and 2048 bit SVE to work correctly (otherwise, QEMU will assume 512 bit maximum) -qemu-aarch64-static -cpu max,sve${BITLEN}=on,sve-default-vector-length=-1 ./sve${BITLEN}-test -``` - -For debugging, for example for vector length 512 +For debugging, for example for SVE with vector length 512 (cf. ): ```bash