diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
new file mode 100644
index 0000000..5eefd96
--- /dev/null
+++ b/.github/workflows/pre-commit.yml
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: 2025 SeisSol Group
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SPDX-LicenseComments: Full text under /LICENSE and /LICENSES/
+#
+# SPDX-FileContributor: Author lists in /AUTHORS and /CITATION.cff
+
+name: pre-commit
+on:
+  - push
+
+jobs:
+  pre-commit:
+    name: pre-commit
+    runs-on: ubuntu-24.04
+    steps:
+      - name: setup-python
+        uses: actions/setup-python@v6
+        with:
+          python-version: '3.13'
+
+      - uses: actions/checkout@v5
+
+      - uses: pre-commit/action@v3.0.1
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
new file mode 100644
index 0000000..5b824f7
--- /dev/null
+++ b/.markdownlint.yaml
@@ -0,0 +1 @@
+line-length: false
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..d2ac0c4
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: 2025 SeisSol Group
+#
+# SPDX-License-Identifier: BSD-3-Clause
+# SPDX-LicenseComments: Full text under /LICENSE and /LICENSES/
+#
+# SPDX-FileContributor: Author lists in /AUTHORS and /CITATION.cff
+
+---
+
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v6.0.0
+  hooks:
+  - id: check-merge-conflict
+    name: '[GENERIC] merge conflict check'
+  - id: check-symlinks
+    name: '[GENERIC] symlink check'
+  - id: destroyed-symlinks
+    name: '[GENERIC] detect broken symlinks'
+  - id: detect-private-key
+    name: '[GENERIC] detect private keys uploaded by accident'
+  - id: check-case-conflict
+    name: '[GENERIC] detect OS file naming case conflicts'
+  - id: check-executables-have-shebangs
+    name: '[GENERIC] check for shebangs in executable files'
+  - id: check-illegal-windows-names
+    name: '[GENERIC] detect illegal Windows file names'
+  - id: check-json
+    name: '[JSON] check'
+  - id: check-xml
+    name: '[XML] check'
+  - id: check-shebang-scripts-are-executable
+    name: '[GENERIC] check that shebang-containing files are executable'
+
+- repo: https://github.com/DavidAnson/markdownlint-cli2
+  rev: v0.18.1
+  hooks:
+  - id: markdownlint-cli2
+    name: '[MARKDOWN] lint'
+
+#- repo: https://github.com/fsfe/reuse-tool
+#  rev: v5.1.1
+#  hooks:
+#  - id: reuse
+#    name: '[GENERIC] REUSE compatibility'
+
+- repo: https://github.com/psf/black-pre-commit-mirror
+  rev: 25.1.0
+  hooks:
+  - id: black
+    language_version: python3.13
+    files: ^(?!preprocessing|postprocessing)
+    name: '[PYTHON] black'
+- repo: https://github.com/pycqa/isort
+  rev: 6.0.1
+  hooks:
+  - id: isort
+    files: ^(?!preprocessing|postprocessing)
+    args: ["--profile", "black"]
+    name: '[PYTHON] isort'
+- repo: https://github.com/pycqa/bandit
+  rev: 1.8.6
+  hooks:
+  - id: bandit
+    args: ["--confidence-level", "high", "--severity-level", "high"]
+    name: '[PYTHON] bandit'
+#- repo: https://github.com/pycqa/flake8
+#  rev: '7.3.0'
+#  hooks:
+#  - id: flake8
+#    files: ^(?!preprocessing|postprocessing)
+#    name: '[PYTHON] Flake8'
+
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v6.0.0
+  hooks:
+  - id: end-of-file-fixer
+    name: '[GENERIC] newline eof'
+  - id: trailing-whitespace
+    name: '[GENERIC] remove trailing whitespace'
diff --git a/LICENSE b/LICENSE
index 4a67232..5b240a5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,5 +1,6 @@
 BSD 3-Clause License
 
+Copyright (c) 2018-2025 SeisSol Group
 Copyright (c) 2018, Peter Wauligmann, Nathan Brei
 All rights reserved.
 
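Note: the files: ^(?!preprocessing|postprocessing) pattern on the black and isort hooks above uses a regex negative lookahead, so those formatters run on the whole tree except the legacy preprocessing/ and postprocessing/ directories. A minimal Python sketch of how such a pattern classifies paths (the example paths are hypothetical, and pre-commit's exact matching semantics are simplified here):

import re

# Negative lookahead: match only paths that do NOT start with one of the
# excluded directory names.
pattern = re.compile(r"^(?!preprocessing|postprocessing)")

# Hypothetical repository paths, for illustration only.
for path in ["pspamm/matmul.py", "preprocessing/convert.py", "postprocessing/plot.py"]:
    status = "checked" if pattern.match(path) else "skipped"
    print(f"{path}: {status}")
# pspamm/matmul.py: checked
# preprocessing/convert.py: skipped
# postprocessing/plot.py: skipped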
diff --git a/README.md b/README.md
index 198b3b4..2f793c6 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Currently supported:
 
 ## Installation
 
-PspaMM is a Python package. I.e. after cloning, may install it via pip.
+PSpaMM is a Python package, i.e., after cloning, you may install it via pip.
 
 Alternatively, you can install it directly by running
diff --git a/pspamm.py b/pspamm.py
deleted file mode 100755
index c6feffc..0000000
--- a/pspamm.py
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/usr/bin/env python3
-
-if __name__=='__main__':
-    import pspamm.cli
-    pspamm.cli.main()
diff --git a/pspamm/__init__.py b/pspamm/__init__.py
deleted file mode 100644
index b802516..0000000
--- a/pspamm/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from pspamm import *
diff --git a/pspamm/codegen/architectures/__init__.py b/pspamm/codegen/architectures/__init__.py
deleted file mode 100644
index 6962d5a..0000000
--- a/pspamm/codegen/architectures/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from pspamm.codegen.architectures.arm_sve.generator import *
-from pspamm.codegen.architectures.arm_sve.inlineprinter import *
-from pspamm.codegen.architectures.arm_sve.operands import *
-
-from pspamm.codegen.architectures.arm.generator import *
-from pspamm.codegen.architectures.arm.inlineprinter import *
-from pspamm.codegen.architectures.arm.operands import *
-
-from pspamm.codegen.architectures.knl.generator import *
-from pspamm.codegen.architectures.knl.inlineprinter import *
-from pspamm.codegen.architectures.knl.operands import *
-
-from pspamm.codegen.architectures.hsw.generator import *
-from pspamm.codegen.architectures.hsw.inlineprinter import *
-from pspamm.codegen.architectures.hsw.operands import *
diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py
deleted file mode 100644
index ebeb71c..0000000
--- a/pspamm/codegen/architectures/arm/generator.py
+++ /dev/null
@@ -1,313 +0,0 @@
-from pspamm.cursors import *
-
-from pspamm.codegen.architectures.arm.operands import *
-from pspamm.codegen.ast import *
-from pspamm.codegen.sugar import *
-from pspamm.codegen.generator import *
-from pspamm.codegen.precision import *
-
-
-class Generator(AbstractGenerator):
-
-    template = """
-void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, {real_type} alpha, {real_type} beta, const {real_type}* prefetch) {{
-    __asm__ __volatile__(
-{body_text}
-    : : {args} : {clobbered});
-
-    #ifndef NDEBUG
-    #ifdef _OPENMP
-    #pragma omp atomic
-    #endif
-    pspamm_num_total_flops += {flop};
-    #endif
-}}
-"""
-
-    def get_v_size(self):
-        return 16 // self.precision.size()
-
-    def get_template(self):
-        return Generator.template
-
-    def use_broadcast(self):
-        return True
-
-    def has_masks(self):
-        return False
-
-    def init_mask(self, m, bm, v_size, tempreg, maskregs):
-        return block("")
-
-    def make_argument_load(self, starting_regs, prefetch):
-        asm = block("Load arguments")
-        asm.add(ld(InputOperand(f'0', 'm', 'A'), starting_regs[0], False))
-        asm.add(ld(InputOperand(f'1', 'm', 'B'), starting_regs[1], False))
-        asm.add(ld(InputOperand(f'2', 'm', 'C'), starting_regs[2], False))
-        asm.add(ld(InputOperand(f'3', 'm', 'alpha'), starting_regs[3], False))
-        asm.add(ld(InputOperand(f'4', 'm', 'beta'), starting_regs[4], False))
-        if prefetch:
-            asm.add(ld(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False))
-        return asm
-
-    def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int, prefetch: str):
-        assert(bm % v_size == 0)
-        vm = bm//v_size
-
elem128 = 16 // self.get_precision().size() - vk = -(bk // -elem128) - assert((bn+bk) * vm + bn * vk <= 32) # Needs to fit in NEON v registers - - prec = { - Precision.DOUBLE: "2d", - Precision.SINGLE: "4s", - Precision.HALF: "8h", - }[self.get_precision()] - - A_regs = Matrix([[v(vm*c + r, prec) for c in range(bk)] for r in range(vm)]) - B_regs = Matrix([[v(vm*bk + bn * r + c, prec) for c in range(bn)] for r in range(vk)]) - C_regs = Matrix([[v(32 - vm*bn + vm*c + r, prec) for c in range(bn)] - for r in range(vm)]) - - # get vector register number of the first vector in B_regs - b_reg = vm*bk - alpha_reg = [v(b_reg, prec), v(b_reg, prec)] - beta_reg = [v(b_reg + 1, prec), v(b_reg + 1, prec)] - - - starting_regs = [r(0), r(1), r(2), r(3), r(4), r(5), r(11)] - - additional_regs = [r(8), xzr, r(10)] - - loop_regs = [r(12), r(13), r(14)] - - prefetch_reg = prefetch is not None - - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [], prefetch_reg - - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: - - asm = block("No register based scaling") - return asm - - def init_block(self, size): - return block("") - - class LoadStoreLocation: - def __init__(self, addr, register, comment, pfaddr=None): - self.addr = addr - self.register = register - self.comment = comment - self.pfaddr = pfaddr - - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None - ) -> Block: - - rows, cols = registers.shape - - locations = [] - for ic in range(cols): - for ir in range(rows): - if (mask is None) or (mask[ir,ic]): - all_coords = [Coords(down=ir*v_size+i,right=ic) for i in range(v_size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] - if not any(has_nonzero): - continue - elif any(has_nonzero) and not all(has_nonzero): - raise NotImplementedError("Element-wise sparsity in A is not yet implemented.") - - cell_offset = Coords(down=ir*v_size, right=ic) - addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.size() * load_offset - - if prefetching: - pfaddr, _ = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) - pfaddr.disp += self.precision.size() * load_offset - else: - pfaddr = None - locations += [self.LoadStoreLocation(addr, registers[ir,ic], comment, pfaddr)] - - return self.fuse_loadstore_block(locations, store, cursor.name, block_offset, additional_regs) - - def fuse_loadstore_block(self, locations, store, name, block_offset, additional_regs): - offsets = list(sorted([(location.addr.disp,location) for location in locations])) - - action = "Store" if store else "Load" - asm = block(f"{action} {name} register block @ {block_offset}") - - curpf = 0 - cur11 = -1000 - fuse_cache = [] - def try_flush_cache(force, cur11): - if len(fuse_cache) == 0: - return - - if force: - op1 = fuse_cache[0] - op2 = fuse_cache[1] if len(fuse_cache) > 1 else None - op3 = fuse_cache[2] if len(fuse_cache) > 2 else None - op4 = fuse_cache[3] if len(fuse_cache) > 3 else None - - max_offset = [65520, 1008, 48, 64][len(fuse_cache) - 1] - div_offset = [16, 16, 24, 32][len(fuse_cache) - 1] - - comment = f'{op1.comment}' - if op2 is not None: comment += f', {op2.comment}' - if 
op3 is not None: comment += f', {op3.comment}' - if op4 is not None: comment += f', {op4.comment}' - - offset = op1.addr.disp - cur11 if cur11 >= 0 else op1.addr.disp - - if cur11 >= 0: - op1.addr.disp = offset - op1.addr.base = additional_regs[0] - - if offset > max_offset or offset % div_offset != 0: - if cur11 < 0: - asm.add(add(offset, additional_regs[0], "", op1.addr.base)) - cur11 = offset - else: - asm.add(add(offset, additional_regs[0], "")) - cur11 += offset - op1.addr.disp = 0 - op1.addr.base = additional_regs[0] - - op1r = op1.register - op2r = op2.register if op2 is not None else None - op3r = op3.register if op3 is not None else None - op4r = op4.register if op4 is not None else None - - if store: - asm.add(st(op1r, op1.addr, True, comment, src2=op2r, src3=op3r, src4=op4r)) - else: - asm.add(ld(op1.addr, op1r, True, comment, dest2=op2r, dest3=op3r, dest4=op4r)) - - fuse_cache.clear() - - return cur11 - - for _,location in offsets: - if len(fuse_cache) > 0: - can_fuse = location.addr.disp == fuse_cache[-1].addr.disp + 16 - - # TODO: extend to 4? - max_length = len(fuse_cache) == 2 - - cur11 = try_flush_cache(not can_fuse or max_length, cur11) - - fuse_cache += [location] - - if location.pfaddr is not None: - if location.pfaddr.disp - curpf >= 32768: - asm.add(add(location.pfaddr.disp, additional_regs[2], "increment the prefetch register", location.pfaddr.base)) - curpf = location.pfaddr.disp - if curpf > 0: - reg = additional_regs[2] - disp = location.pfaddr.disp - curpf - else: - reg = location.pfaddr.base - disp = location.pfaddr.disp - asm.add(prefetch(mem(reg, disp), "", access_type="LD", closeness="L2", temporality="KEEP")) - - cur11 = try_flush_cache(True, cur11) - - return asm - - def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: - - rows, cols = registers.shape - asm = block("zero registers") - - for ic in range(cols): - for ir in range(rows): - asm.add(mov(additional_regs[1], registers[ir,ic], True)) - - return asm - - - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size:int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. 
- """ - - asm = block("Block GEMM microkernel") - bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) - bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) - assert(bm % v_size == 0) - - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) - - elem128 = 16 // self.get_precision().size() - vk = -(bk // -elem128) - - # TODO: fuse loads here as well - bs = [] - firstloc = {} - locations = [] - for Vmi in range(bm//v_size): - for bni in range(bn): # inside this n-block - for bki in range(bk): # inside this k-block - bki_reg = bki // elem128 - to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): - if (bki_reg, bni) not in firstloc: - B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) - firstloc[(bki_reg, bni)] = self.LoadStoreLocation(B_cell_addr, B_regs[bki_reg, bni], B_comment) - if A.has_nonzero_cell(A_ptr, to_A_block, to_acell) and B_regs[bki_reg, bni] not in bs: - locations += [firstloc[(bki_reg, bni)]] - bs.append(B_regs[bki_reg, bni]) - asm.add(self.fuse_loadstore_block(locations, False, B.name, to_B_block, additional_regs)) - - cell_indices = {} - for bki in range(bk): # inside this k-block - # TODO: refactor cell_indices into the cursors/blocks - for Vmi in range(bm//v_size): - for bni in range(bn): # inside this n-block - to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - - bki_reg = bki // elem128 - if (Vmi, bki_reg, bni) not in cell_indices: - cell_indices[(Vmi, bki_reg, bni)] = 0 - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): - _, B_comment = B.look(B_ptr, to_B_block, to_bcell) - comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" - asm.add(fma(B_regs[bki_reg, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=cell_indices[(Vmi, bki_reg, bni)], sub=sub)) - - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): - cell_indices[(Vmi, bki_reg, bni)] += 1 - - return asm diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py deleted file mode 100644 index bafa122..0000000 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ /dev/null @@ -1,453 +0,0 @@ -from pspamm.cursors import * - -from pspamm.codegen.architectures.arm_sve.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * - - -class Generator(AbstractGenerator): - template = """ -void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, const {real_type} alpha, const {real_type} beta, const {real_type}* prefetch) {{{{ - __asm__ __volatile__( -{init_registers} -{body_text} - : : {args} : {clobbered}); - - #ifndef NDEBUG - #ifdef _OPENMP - #pragma omp atomic - #endif - pspamm_num_total_flops += {flop}; - #endif - -}}}} -""" - - prefetch_count = 0 - is_sparse = False - v_len = 4 # vector register length: v_len * 128 bit - predicates = {} - - def get_v_size(self): - return (16 // self.precision.size()) * self.v_len - - def get_precision(self): - return self.precision - - def get_template(self): - return self.template - - def use_broadcast(self): - return True - - def has_masks(self): - return True - - def make_argument_load(self, 
starting_regs, prefetch): - asm = block("Load arguments") - asm.add(ld(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - asm.add(ld(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(ld(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) - asm.add(ld(InputOperand(f'3', 'm', 'alpha'), starting_regs[3], False)) - asm.add(ld(InputOperand(f'4', 'm', 'beta'), starting_regs[4], False)) - if prefetch: - asm.add(ld(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) - return asm - - def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Register_ARM: - """pred takes num_trues=num of true elements and suffix=type of predicate (m or z) for merging or zeroing - we only use p7 as all-true predicate and p0 as overhead predicate - e.g. pred_n_trues(n=4, v_size=8, suffix="m") returns the predicate p0/m with the first 4 elements - set to true""" - assert (num_trues > 0) - assert (suffix == "m" or suffix == "z" or suffix is None) - - # we only use p7 or p0 as predicates (1 == p0, 8 == p7) - index = 7 if num_trues >= v_size else self.predicates[num_trues] - - if suffix is None: - s = f"p{index}" - else: - s = f"p{index}/{suffix}" - return Register_ARM(AsmType.p64x8, s) - - # is called at most one time in matmul.py - def set_sparse(self): - self.is_sparse = True - - def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: int, n: int, k: int, prefetch:str): - vm = self.ceil_div(bm, v_size) # vm can be 0 if bm < v_size -> makes ceil_div necessary - - # k-broadcasting only works in 128-bit lanes - elem128 = 16 // self.get_precision().size() - vkext = -(bk // -elem128) - - # inline broadcasting is only allowed for the lower-numbered registers - self.inline_broadcast = False - if bn*vkext <= 16 if self.get_precision().size() == 8 else bn*vkext <= 8: - self.inline_broadcast = True - if bk == 1: - self.inline_broadcast = False - - if self.inline_broadcast: - vk = vkext - else: - vk = bk - - assert ((bn + bk) * vm + bn * vk <= 32) # Needs to fit in SVE z registers - - prec = { - Precision.DOUBLE: "d", - Precision.SINGLE: "s", - Precision.HALF: "h", - Precision.BFLOAT16: "h", - }[self.get_precision()] - - # make place for the two broadcasting registers - a_offset = 1 if bn * vk == 1 else 0 - assert ((bn + bk) * vm + bn * vk + a_offset <= 32) - - A_regs = Matrix([[z(vm * c + r + bn * vk + a_offset, prec) for c in range(bk)] for r in range(vm)]) - B_regs = Matrix([[z(bn * r + c, prec) for c in range(bn)] for r in range(vk)]) - C_regs = Matrix([[z(32 - vm * bn + vm * c + r, prec) for c in range(bn)] for r in range(vm)]) - - b_reg = 0 - alpha_reg = [z(b_reg, prec), z(b_reg, prec)] - beta_reg = [z(b_reg + 1, prec), z(b_reg + 1, prec)] - - starting_regs = [r(0), r(1), r(2), r(3), r(4), r(5), r(6), r(11)] # r6 is needed for predicate creation, r5 is added in init_prefetching() - - additional_regs = [r(8), l("0.0"), r(10), r(6)] # r10 used for scaling offsets - - loop_regs = [r(12), r(13), r(14)] - - mask_regs = [p(0), p(7)] - - self.init_registers(m, bm, k, bk, v_size, nnz) - - prefetch_reg = prefetch is not None - - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs, prefetch_reg - - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: - - asm = block("No register based scaling") - return asm - - def init_block(self, size): - return block("") - - def init_mask(self, - m: int, - bm: int, - v_size: int, - tempreg, - maskreg - ) -> Block: - - asm = 
block("No register based scaling") - return asm - - def init_registers(self, - m: int, - bm: int, - k: int, - bk: int, - v_size: int, - nnz: int - ) -> None: - - bmmod = bm % v_size - elem128 = 16 // self.get_precision().size() - bkmod = bk % elem128 if self.inline_broadcast else 0 - kmod = (k % bk) % elem128 if self.inline_broadcast else 0 - mmod = (m % bm) % v_size - - eol = "\\n\\t" # define the "end of line" sequence for easy assembly - # determine the predicate suffix - p_suffix = { - Precision.DOUBLE: "d", - Precision.SINGLE: "s", - Precision.HALF: "h", - Precision.BFLOAT16: "h", - }[self.get_precision()] - # determine length of 'dup' registers - gen_reg = "w" if self.get_precision().size() <= 4 else "x" - overhead_counter = 6 - - comment = "// p7 denotes the 'all-true' predicate\n\t" - comment += "// if given, p0 denotes the 'bm % v_size' predicate\n\t" - comment += "// if given, p1 denotes the 'bk % elem128' predicate\n\t" - comment += "// if given, p2 denotes the 'k % elem128' predicate\n\t" - comment += "// if given, p4 denotes the 'k % v_size' predicate\n\t" - - self.has_k_overhead = kmod != 0 - self.has_bk_overhead = bkmod != 0 - self.has_nnz_overhead = nnz % elem128 != 0 - - # specification for ptrue: https://developer.arm.com/documentation/ddi0596/2021-12/SVE-Instructions/PTRUE--Initialise-predicate-from-named-constraint- - # search for 'DecodePredCount' for the explanation of how the pattern in 'ptrue p{d}.{suffix}, #pattern' is decoded: - # https://developer.arm.com/documentation/ddi0596/2020-12/Shared-Pseudocode/AArch64-Functions?lang=en#impl-aarch64.DecodePredCount.2 - # 'ptrue' doesnt work for initialising overhead predicate when using single precision -> see valid patterns from above - # overhead = "\"ptrue p0.{suffix}, #{overhead}{eol}\"\n\t" if bm != 0 else "" # define overhead predicate - overhead_bm = "\"mov {gen_reg}{overhead_counter}, #{overhead_bm}{eol}\"\n\t\"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if bmmod != 0 else "" - overhead_bk = "\"mov {gen_reg}{overhead_counter}, #{overhead_bk}{eol}\"\n\t\"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_bk_overhead else "" - overhead_k = "\"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}\"\n\t\"whilelo p2.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_k_overhead else "" - overhead_nnz = "\"mov {gen_reg}{overhead_counter}, #{overhead_nnz}{eol}\"\n\t\"whilelo p3.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if self.has_nnz_overhead else "" - overhead_m = "\"mov {gen_reg}{overhead_counter}, #{overhead_m}{eol}\"\n\t\"whilelo p4.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}\"\n\t" if mmod != 0 else "" - all_true = "\"ptrue p7.{suffix}, #31{eol}\"" # define all true predicate - init_registers = (comment + overhead_bm + overhead_bk + overhead_k + overhead_nnz + overhead_m + all_true).format(suffix=p_suffix, - gen_reg=gen_reg, - overhead_counter=overhead_counter, - v_size=v_size, - overhead_bm=bmmod, - overhead_bk=bkmod, - overhead_k=kmod, - overhead_m=mmod, - overhead_nnz=nnz % elem128, - eol=eol) - - self.predicates[v_size] = 7 - if bmmod != 0: self.predicates[bmmod] = 0 - if bkmod != 0: self.predicates[bkmod] = 1 - if kmod != 0: self.predicates[kmod] = 2 - if mmod != 0: self.predicates[mmod] = 4 - - # since .format() doesn't allow partial formatting, we need to re-include the - # placeholders that are replaced at the end of generating a kernel - self.template = 
self.get_template().format(init_registers=init_registers, - funcName="{funcName}", - body_text="{body_text}", - clobbered="{clobbered}", - flop="{flop}", - real_type="{real_type}", - args="{args}") - - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None, - is_B: bool = False - ) -> Block: - - rows, cols = registers.shape - action = "Store" if store else "Load" - asm = block(f"{action} {cursor.name} register block @ {block_offset}") - prec = self.get_precision() - - b_row, b_col, i, _ = cursor.get_block(cursor_ptr, block_offset) - - cur11 = 0 - #TODO: figure out appropriate threshold (the 16 // self.v_len may still not be optimal; especially if 16 % self.v_len != 0, e.g. 384 bit) - threshold = 1 if self.is_sparse else (16 // self.v_len) # uses whole 256 byte cache line, as one SVE-512 vector = 64 bytes - - # DONE if another CPU implements SVE at VL != 64 bytes, rewrite mul_vl (maybe do this dynamically) - mul_vl = 16 * self.v_len # e.g. A64FX has VL of 64 bytes in memory (thus, use v_len==4) - max_mem_ins_mult = 7 # A64FX allows a maximum positive offset of 7 in memory instructions, e.g. ld1d z1.d, p0/z, [x0, 7, MUL VL] (TODO: tune, if ever different) - max_offset = mul_vl * max_mem_ins_mult # ld1d/st1d instruction encodes the immediate offset using 4 bits, multiplies it with MUL VL - - prev_disp = 0 - prev_overhead = True - prev_base = None - - process_size = min(v_size, cursor.br) - - for ic in range(cols): - for ir in range(rows): - if (mask is None) or (mask[ir, ic]): - processed = ir * process_size - size = min(process_size, b_row - processed) - all_coords = [Coords(down=ir*process_size+i,right=ic) for i in range(size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] - if not any(has_nonzero): - continue - elif any(has_nonzero) and not all(has_nonzero) and not is_B: - raise NotImplementedError("Element-wise sparsity in A is not yet implemented.") - - p = self.pred_n_trues(size, v_size) if not is_B else self.pred_n_trues(process_size, v_size) - p_zeroing = self.pred_n_trues(size, v_size, "z") if not is_B else self.pred_n_trues(process_size, v_size, "z") - cell_offset = Coords(down=ir * process_size, right=ic) - - # addr = base "pointer" + relative offset in bytes - addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.size() * load_offset - - offset = addr.disp - prev_disp - - # count how many elements we have processed between last step and this step - cont_counter = (offset // mul_vl) - larger_max_offset = cont_counter > max_mem_ins_mult - non_dividing_offset = offset % mul_vl != 0 - - if larger_max_offset or (prev_overhead and addr.disp > 0) or non_dividing_offset: - offset_comment = f"disp > {max_offset}" if larger_max_offset else ("disp % VL != 0" if non_dividing_offset else "previous mem. instr. 
used p0") - asm.add(add(addr.disp, additional_regs[0], offset_comment, addr.base)) - prev_disp = addr.disp - addr.base = additional_regs[0] - prev_base = addr.base - - # adjust addr.disp to a multiple of a SVE vector's length - if prev_base is None: - prev_base = addr.base - - addr.base = prev_base - addr.disp = (addr.disp - prev_disp) // mul_vl - - if store: - asm.add(st(registers[ir, ic], addr, True, comment, pred=p, scalar_offs=False, - add_reg=additional_regs[2])) - # perform prefetching after a store instruction, similar to KNL case - if prefetching: - addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.size() * load_offset - if prev_disp > 0: - asm.add(add(prev_disp, additional_regs[3], "increment the prefetch register", addr.base)) - asm.add(prefetch(mem(additional_regs[3] if prev_disp > 0 else addr.base, (addr.disp - prev_disp) // mul_vl), - "", p, prec, access_type="LD", closeness="L2", temporality="KEEP")) - else: - asm.add(ld(addr, registers[ir, ic], True, comment, pred=p_zeroing, is_B=is_B, scalar_offs=False, - add_reg=additional_regs[2])) - - prev_overhead = p is None or int(p.ugly[1]) == 0 # determine if we previously used p0 (overhead predicate) - - return asm - - def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: - - rows, cols = registers.shape - asm = block("zero registers") - - for ic in range(cols): - for ir in range(rows): - asm.add(mov(additional_regs[1], registers[ir, ic], True)) - - return asm - - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size: int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. 
- """ - - asm = block("Block GEMM microkernel") - """block_row, block_col, (start)index, pattern_matrix (true/false)""" - bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) - bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) - - # tell sparse_mask() that we use sve - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) - - # x = 0; - bs = [] - cur11 = -1000 - Vm = max(self.ceil_div(bm, v_size), 1) - - multiple = self.precision.size() - # for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252 - # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 - # in both cases: instruction encodes the immediate offset within 6 bits - if not self.inline_broadcast: - max_offs = (2 ** 6 - 1) * multiple - divider = 1 - elem128 = 1 - vk = bk - else: - max_offs = 127 - divider = 16 - elem128 = 16 // self.get_precision().size() - vk = -(bk // -elem128) - - preg = self.pred_n_trues(elem128, elem128, 'z') - preg_last = preg if bk % elem128 == 0 else self.pred_n_trues(bk % elem128, elem128, 'z') - firstloc = {} - for Vmi in range(Vm): - # set to all v_size predicates to true, we want to replicate a B element into a whole vector - for bni in range(bn): # inside this n-block - for bki in range(bk): # inside this k-block - bki_reg = bki // elem128 - to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): - if (bki_reg, bni) not in firstloc: - B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) - firstloc[(bki_reg, bni)] = (B_cell_addr, B_comment) - if A.has_nonzero_cell(A_ptr, to_A_block, to_acell) and B_regs[bki_reg, bni] not in bs: - p_zeroing = preg_last if bki_reg + 1 == vk else preg - - B_cell_addr = firstloc[(bki_reg, bni)][0] - B_comment = firstloc[(bki_reg, bni)][1] - - # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value - if B_cell_addr.disp > max_offs or B_cell_addr.disp % divider != 0: - moved = B_cell_addr.disp - cur11 - if moved > 0 and moved <= max_offs and moved % divider == 0: - B_cell_addr.disp = moved - else: - asm.add(add(B_cell_addr.disp, additional_regs[0], "", B_cell_addr.base)) - cur11 = B_cell_addr.disp - B_cell_addr.disp = 0 - - B_cell_addr.base = additional_regs[0] - - if not self.inline_broadcast: - asm.add(ld(B_cell_addr, B_regs[bki_reg, bni], True, B_comment, pred=p_zeroing, is_B=True)) - else: - asm.add(ld(B_cell_addr, B_regs[bki_reg, bni], True, B_comment, pred=p_zeroing, sub128=True)) - bs.append(B_regs[bki_reg, bni]) - - # TODO: refactor cell_indices into the cursors/blocks - cell_indices = {} - for bki in range(bk): # inside this k-block - for Vmi in range(Vm): - p_merging = self.pred_n_trues(bm - Vmi * v_size, v_size, "m") - end_index = bm if Vmi + 1 == Vm else Vmi * v_size + v_size # end_index helps us print the right index ranges - for bni in range(bn): # inside this n-block - to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - bki_reg = bki // elem128 - if (Vmi, bki_reg, bni) not in cell_indices: - cell_indices[(Vmi, bki_reg, bni)] = 0 - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): - _, B_comment = B.look(B_ptr, to_B_block, to_bcell) - comment = f"C[{Vmi * v_size}:{end_index},{bni}] += A[{Vmi * v_size}:{end_index},{bki}]*{B_comment}" - - if 
not self.inline_broadcast: - bcast = None - else: - bcast = cell_indices[(Vmi, bki_reg, bni)] - asm.add(fma(B_regs[bki_reg, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging, bcast=bcast, sub=sub)) - - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): - cell_indices[(Vmi, bki_reg, bni)] += 1 - return asm diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py deleted file mode 100644 index d1caba2..0000000 --- a/pspamm/codegen/architectures/hsw/generator.py +++ /dev/null @@ -1,301 +0,0 @@ -from pspamm.cursors import * - -from pspamm.codegen.architectures.hsw.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * -from pspamm.codegen.regcache import * - -class Generator(AbstractGenerator): - template = """ -void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, {real_type} alpha, {real_type} beta, {real_type} const* prefetch) {{ - {real_type}* alpha_p = α - {real_type}* beta_p = β - __asm__ __volatile__( -{body_text} - : : {args} : {clobbered}); - - #ifndef NDEBUG - #ifdef _OPENMP - #pragma omp atomic - #endif - pspamm_num_total_flops += {flop}; - #endif -}} -""" - v_len = 2 - - def get_v_size(self): - return (16 // self.precision.size()) * self.v_len - - def get_template(self): - return Generator.template - - def use_broadcast(self): - return True - - def has_masks(self): - return False - - def init_mask(self, m, bm, v_size, tempreg, maskregs): - return block("") - - def scale_base(self): - return 256 - - def pred_n_trues(self, count, v_size, mode): - # hacked in right now: we set a number as predicate if we need it - if count < v_size: - return (1 << count) - 1 - else: - return None - - def make_argument_load(self, starting_regs, prefetch): - asm = block("Load arguments") - asm.add(mov(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - asm.add(mov(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(mov(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) - asm.add(mov(InputOperand(f'3', 'm', 'alpha_p'), starting_regs[3], False)) - asm.add(mov(InputOperand(f'4', 'm', 'beta_p'), starting_regs[4], False)) - if prefetch: - asm.add(mov(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) - return asm - - def make_expand_predicate(self, mask): - combined = 0 - offset = 0 - for i, value in enumerate(mask): - if value: - combined |= offset << (8*i) - offset += 1 - else: - combined |= 255 << (8*i) - return combined - - def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int, prefetch: str): - assert(bm % v_size == 0) - vm = self.ceil_div(bm, v_size) - - # Needs to fit in AVX/AVX2 ymm registers - if (bn + bk) * vm + bn * bk <= 16: - self.preloadA = True - else: - self.preloadA = False - assert(bn * vm + bn * bk + 1 <= 16) - - vmm = { - 1: xmm, - 2: ymm - }[self.v_len] - - if self.preloadA: - A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) - Aoffset = vm*bk - else: - A_regs = Matrix([[vmm(0) for c in range(bk)] for r in range(vm)]) - Aoffset = 1 - - B_regs = Matrix([[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[vmm(16 - vm*bn + vm*c + r) for c in range(bn)] - for r in range(vm)]) - starting_regs = [rdi, rsi, rdx, rbx, rcx] - - b_reg = Aoffset - alpha_reg = [xmm(b_reg), vmm(b_reg)] - beta_reg = [xmm(b_reg + 1), vmm(b_reg + 1)] - - additional_regs = 
[r(9),r(10),r(11),r(15),rax] # ,r(13),r(14) - - prefetch_reg = prefetch == 'BL2viaC' - if prefetch_reg: - starting_regs += [r(8)] - else: - additional_regs += [r(8)] - - loop_regs = [r(12), r(13), r(14)] - - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [], prefetch_reg - - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: - - asm = block("Optimize usage of offsets when accessing B Matrix") - - for i in range(1, len(additional_regs)): - asm.add(mov(c(self.scale_base() * (2*i - 1)), additional_regs[i], False)) - - return asm - - def init_block(self, size): - return block("") - - def reg_based_scaling(self, asm, addr: MemoryAddress, additional_regs: List[Register]): - halfscale = self.scale_base() // 2 - if addr.disp >= halfscale: - base = (addr.disp + halfscale) // self.scale_base() - scaling = 1 - while base % 2 == 0: - base //= 2 - scaling *= 2 - register = base // 2 + 1 - - if register < len(additional_regs) and scaling <= 8: - addr.index = additional_regs[register] - addr.scaling = scaling - addr.disp = ((addr.disp + halfscale) % self.scale_base()) - halfscale - - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None, - temp = None - ) -> Block: - - rows, cols = registers.shape - action = "Store" if store else "Load" - asm = block(f"{action} {cursor.name} register block @ {block_offset}") - - for ic in range(cols): - for ir in range(rows): - if (mask is None) or (mask[ir,ic]): - all_coords = [Coords(down=ir*v_size+i,right=ic) for i in range(v_size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] - if all(has_nonzero): - cell_offset = all_coords[0] - addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.size() * load_offset - self.reg_based_scaling(asm, addr, additional_regs) - if store: - asm.add(mov(registers[ir,ic], addr, True, comment)) - if prefetching == 'BL2viaC' and pf_cursor is not None: - addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.size() * load_offset - self.reg_based_scaling(asm, addr, additional_regs) - asm.add(prefetch(addr, closeness="L2")) - else: - asm.add(mov(addr, registers[ir,ic], True, comment)) - elif any(has_nonzero): - raise NotImplementedError("Element-wise sparsity in A is not yet fully implemented.") - firsti = 0 - for i in range(v_size): - if has_nonzero[i]: - firsti = i - break - addr, comment = cursor.look(cursor_ptr, block_offset, all_coords[firsti]) - # assume contiguous memory here - - asm.add(mov(self.make_expand_predicate(all_coords), additional_regs[0], False)) - return asm - - def move_register_single(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - ir, - ic, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0 - ) -> Block: - - asm = block("") - - if (mask is None) or (mask[ir,ic]): - cell_offset = Coords(down=ir*v_size, right=ic) - addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.size() * load_offset - asm.add(mov(addr, registers[ir,ic], 
True, comment)) - return asm - - def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: - - rows, cols = registers.shape - asm = block("zero registers") - - for ic in range(cols): - for ir in range(rows): - asm.add(mov(0, registers[ir,ic], True)) - - return asm - - - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size:int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. - """ - asm = block("Block GEMM microkernel") - bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) - bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) - assert(bm % v_size == 0) - - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) - if self.preloadA: - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False, temp=B_regs[0,0])) - else: - asm.add(self.move_register_single(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, 0, 0, mask, store=False)) - - Vm = self.ceil_div(bm, v_size) - - bs = [] - bsv = [] - for Vmi in range(Vm): - for bni in range(bn): # inside this n-block - for bki in range(bk): # inside this k-block - to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): - B_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) - self.reg_based_scaling(asm, B_addr, additional_regs) - if B_regs[bki, bni] not in bs: - asm.add(bcst(B_addr, B_regs[bki, bni], comment=B_comment)) - bs.append(B_regs[bki, bni]) - bsv.append(B_addr) - else: - # just to make sure we do not use registers differently in a block - assert bsv[bs.index(B_regs[bki, bni])].ugly == B_addr.ugly - - for bki in range(bk): # inside this k-block - for Vmi in range(Vm): - if not self.preloadA and not (Vmi, bki) == (0,0): - asm.add(self.move_register_single(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, Vmi, bki, mask, store=False)) - for bni in range(bn): # inside this n-block - to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): - _, B_comment = B.look(B_ptr, to_B_block, to_bcell) - comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=None, sub=sub)) - return asm diff --git a/pspamm/codegen/architectures/lsx/generator.py b/pspamm/codegen/architectures/lsx/generator.py deleted file mode 100644 index 836277d..0000000 --- a/pspamm/codegen/architectures/lsx/generator.py +++ /dev/null @@ -1,236 +0,0 @@ -from pspamm.cursors import * - -from pspamm.codegen.architectures.lsx.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * -from pspamm.codegen.regcache import * - 
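Note: every per-architecture make_microkernel in this diff emits the same register-blocked outer-product update, C[m-vector, n] += A[m-vector, k] * broadcast(B[k, n]); only the load, broadcast, and fma instructions differ between ISAs. A minimal pure-Python sketch of that blocking (block sizes and names here are illustrative, not the generator's actual API):

def microkernel(A, B, C, bm, bn, bk, v_size):
    # Mirrors the fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni])
    # calls: each C "register" accumulates one A vector times a B scalar.
    for bki in range(bk):                    # inside this k-block
        for Vmi in range(bm // v_size):      # one m-vector of A/C at a time
            for bni in range(bn):            # inside this n-block
                for lane in range(v_size):   # the SIMD lanes of one vector
                    m = Vmi * v_size + lane
                    C[m][bni] += A[m][bki] * B[bki][bni]

# Tiny example with hypothetical block sizes bm=2, bn=2, bk=1, v_size=2.
A = [[1.0], [2.0]]
B = [[3.0, 4.0]]
C = [[0.0, 0.0], [0.0, 0.0]]
microkernel(A, B, C, bm=2, bn=2, bk=1, v_size=2)
print(C)  # [[3.0, 4.0], [6.0, 8.0]]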
-class Generator(AbstractGenerator): - template = """ -void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, {real_type} alpha, {real_type} beta, {real_type} const* prefetch) {{ - __asm__ __volatile__( -{body_text} - : : {args} : {clobbered}); - - #ifndef NDEBUG - #ifdef _OPENMP - #pragma omp atomic - #endif - pspamm_num_total_flops += {flop}; - #endif -}} -""" - v_len = 2 - - def get_v_size(self): - return (16 // self.precision.size()) * self.v_len - - def get_template(self): - return Generator.template - - def use_broadcast(self): - return True - - def has_masks(self): - return False - - def init_mask(self, m, bm, v_size, tempreg, maskregs): - return block("") - - def make_argument_load(self, starting_regs, prefetch): - asm = block("Load arguments") - asm.add(ld(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - asm.add(ld(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(ld(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) - asm.add(ld(InputOperand(f'3', 'm', 'alpha'), starting_regs[3], False)) - asm.add(ld(InputOperand(f'4', 'm', 'beta'), starting_regs[4], False)) - if prefetch: - asm.add(ld(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) - return asm - - def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int, prefetch: str): - assert(bm % v_size == 0) - vm = self.ceil_div(bm, v_size) - - assert (bn + bk) * vm + bn * bk <= 32 - - vmm = { - 1: vr, - 2: xr - }[self.v_len] - - A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) - Aoffset = vm*bk - - B_regs = Matrix([[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[vmm(32 - vm*bn + vm*c + r) for c in range(bn)] - for r in range(vm)]) - - b_reg = Aoffset - alpha_reg = [vmm(b_reg)] * 2 - beta_reg = [vmm(b_reg + 1)] * 2 - - starting_regs = [r(10), r(11), r(12), r(13), r(14), r(6), r(5)] - - additional_regs = [r(15), r(16), r(17), r(31), r(7)] - - loop_regs = [r(28), r(29), r(30)] - - prefetch_reg = prefetch == 'BL2viaC' - - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [], prefetch_reg - - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: - return block("") - - def init_block(self, size): - return block("") - - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None, - temp = None - ) -> Block: - - rows, cols = registers.shape - action = "Store" if store else "Load" - asm = block(f"{action} {cursor.name} register block @ {block_offset}") - - max_offs = 2047 - cur11 = 0 - - for ic in range(cols): - for ir in range(rows): - if (mask is None) or (mask[ir,ic]): - all_coords = [Coords(down=ir*v_size+i,right=ic) for i in range(v_size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] - if all(has_nonzero): - cell_offset = all_coords[0] - addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.size() * load_offset - needsmove = False - if addr.disp > max_offs: - moved = addr.disp - cur11 - if moved > 0 and moved <= max_offs: - addr.disp = moved - else: - asm.add(add(addr.disp, additional_regs[0], "", addr.base)) - cur11 = addr.disp - 
addr.disp = 0 - needsmove = True - - addr.base = additional_regs[0] - if store: - asm.add(st(registers[ir,ic], addr, True, comment)) - if prefetching == 'BL2viaC' and pf_cursor is not None: - addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.size() * load_offset - if addr.disp > max_offs: - moved = addr.disp - cur11 - if needsmove: - asm.add(add(addr.disp, additional_regs[3], "", addr.base)) - addr.disp = 0 - else: - addr.disp = moved - addr.base = additional_regs[3] - asm.add(prefetch(addr, closeness="L2")) - else: - asm.add(ld(addr, registers[ir,ic], True, comment)) - elif any(has_nonzero): - raise NotImplementedError("Element-wise sparsity in A is not yet fully implemented.") - return asm - - def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: - - rows, cols = registers.shape - asm = block("zero registers") - - for ic in range(cols): - for ir in range(rows): - asm.add(mov(0, registers[ir,ic], True)) - - return asm - - - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size:int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. - """ - asm = block("Block GEMM microkernel") - bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) - bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) - assert(bm % v_size == 0) - - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False, temp=B_regs[0,0])) - - Vm = self.ceil_div(bm, v_size) - cur11 = 0 - max_offs = 2047 - - bs = [] - for Vmi in range(Vm): - for bni in range(bn): # inside this n-block - for bki in range(bk): # inside this k-block - to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): - B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) - if B_regs[bki, bni] not in bs: - # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value - if B_cell_addr.disp > max_offs: - moved = B_cell_addr.disp - cur11 - if moved > 0 and moved <= max_offs: - B_cell_addr.disp = moved - else: - asm.add(add(B_cell_addr.disp, additional_regs[0], "", B_cell_addr.base)) - cur11 = B_cell_addr.disp - B_cell_addr.disp = 0 - - B_cell_addr.base = additional_regs[0] - - asm.add(bcst(B_cell_addr, B_regs[bki, bni], B_comment)) - bs.append(B_regs[bki, bni]) - - for bki in range(bk): # inside this k-block - for Vmi in range(Vm): - for bni in range(bn): # inside this n-block - to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): - _, B_comment = B.look(B_ptr, to_B_block, to_bcell) - comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" - 
asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=None, sub=sub)) - return asm diff --git a/pspamm/codegen/architectures/rvv/generator.py b/pspamm/codegen/architectures/rvv/generator.py deleted file mode 100644 index 1efe040..0000000 --- a/pspamm/codegen/architectures/rvv/generator.py +++ /dev/null @@ -1,311 +0,0 @@ -from pspamm.cursors import * - -from pspamm.codegen.architectures.rvv.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * - - -class Generator(AbstractGenerator): - template = """ -void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, const {real_type} alpha, const {real_type} beta, const {real_type}* prefetch) {{{{ - __asm__ __volatile__( - {body_text} - : : {args} : {clobbered}); - - #ifndef NDEBUG - #ifdef _OPENMP - #pragma omp atomic - #endif - pspamm_num_total_flops += {flop}; - #endif -}}}}; -""" - - is_sparse = False - v_len = 1 # vector register length: v_len * 128 bit - predicates = {} - - def get_v_size(self): - return (16 // self.precision.size()) * self.v_len - - def get_precision(self): - return self.precision - - def get_template(self): - return self.template - - def use_broadcast(self): - return False - - def has_masks(self): - return False # not yet - - def pred_n_trues(self, num_trues: int, v_size: int, suffix: str = None) -> Register_RV: - return None - - # is called at most one time in matmul.py - def set_sparse(self): - self.is_sparse = True - - def make_argument_load(self, starting_regs, prefetch): - asm = block("Load arguments") - asm.add(ld(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - asm.add(ld(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(ld(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) - asm.add(ld(InputOperand(f'3', 'm', 'alpha'), starting_regs[3], False)) - asm.add(ld(InputOperand(f'4', 'm', 'beta'), starting_regs[4], False)) - if prefetch: - asm.add(ld(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) - return asm - - def make_reg_blocks(self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: int, n: int, k: int, prefetch: str): - vm = self.ceil_div(bm, v_size) # vm can be 0 if bm < v_size -> makes ceil_div necessary - - assert bn * bk + 2 <= 32 - assert (bn + bk) * vm <= 32 - - prec = { - Precision.DOUBLE: "d", - Precision.SINGLE: "s", - Precision.HALF: "h", - Precision.BFLOAT16: "h", - }[self.get_precision()] - - A_regs = Matrix([[v(vm * c + r) for c in range(bk)] for r in range(vm)]) - B_regs = Matrix([[f(bn * r + c + 2) for c in range(bn)] for r in range(bk)]) - C_regs = Matrix([[v(32 - vm * bn + vm * c + r) for c in range(bn)] for r in range(vm)]) - - b_reg = 0 - alpha_reg = [f(0), f(0)] - beta_reg = [f(1), f(1)] - - # TODO: move x(5) out of here - starting_regs = [x(10), x(11), x(12), f(0), f(1), x(6), x(5)] - - additional_regs = [x(13), x(14), x(15), x(16), x(17), x(31), x(7)] - - loop_regs = [x(28), x(29), x(30)] - - mask_regs = [] - - prefetch_reg = prefetch is not None - - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs, prefetch_reg - - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: - - asm = block("No register based scaling") - return asm - - def init_mask(self, - m: int, - bm: int, - v_size: int, - tempreg, - maskreg - ) -> Block: - - asm = block("No register based scaling") - return asm - - def 
init_block(self, size): - if size < 32: - return rvsetvl(x(0), size) - else: - asm = block("Set vector length") - asm.add(mov(size, x(5), False)) - asm.add(rvsetvl(x(0), x(5))) - return asm - - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None, - is_B: bool = False - ) -> Block: - - rows, cols = registers.shape - action = "Store" if store else "Load" - asm = block(f"{action} {cursor.name} register block @ {block_offset}") - prec = self.get_precision() - - b_row, b_col, i, _ = cursor.get_block(cursor_ptr, block_offset) - - cur11 = 0 - #TODO: figure out appropriate threshold (the 16 // self.v_len may still not be optimal; especially if 16 % self.v_len != 0, e.g. 384 bit) - threshold = 1 if self.is_sparse else (16 // self.v_len) # uses whole 256 byte cache line, as one SVE-512 vector = 64 bytes - - # DONE if another CPU implements SVE at VL != 64 bytes, rewrite mul_vl (maybe do this dynamically) - mul_vl = 16 * self.v_len # e.g. A64FX has VL of 64 bytes in memory (thus, use v_len==4) - max_mem_ins_mult = 0 - max_offset = 0 # ld1d/st1d instruction encodes the immediate offset using 4 bits, multiplies it with MUL VL - - prev_disp = 0 - prev_base = None - - process_size = min(v_size, cursor.br) - - for ic in range(cols): - for ir in range(rows): - if (mask is None) or (mask[ir, ic]): - all_coords = [Coords(down=ir*v_size+i,right=ic) for i in range(process_size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] - if not any(has_nonzero): - continue - elif any(has_nonzero) and not all(has_nonzero) and not is_B: - raise NotImplementedError("Element-wise sparsity in A is not yet implemented.") - - processed = ir * process_size - if processed >= b_row: - continue - p = self.pred_n_trues(min(b_row - processed, process_size), v_size) if not is_B else self.pred_n_trues(process_size, v_size) - p_zeroing = self.pred_n_trues(min(b_row - processed, process_size), v_size, "z") if not is_B else self.pred_n_trues(process_size, v_size, "z") - cell_offset = Coords(down=ir * v_size, right=ic) - - # addr = base "pointer" + relative offset in bytes - addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.size() * load_offset - - offset = addr.disp - prev_disp - - # count how many elements we have processed between last step and this step - cont_counter = (offset // mul_vl) - larger_max_offset = cont_counter > max_mem_ins_mult - non_dividing_offset = offset % mul_vl != 0 - - # adjust addr.disp to a multiple of the RVV vector length - if prev_base is None: - prev_base = addr.base - - if larger_max_offset or addr.disp > 0 or non_dividing_offset: - offset_comment = f"move to new vector" - if offset < 2048 and offset >= -2048 and prev_base == additional_regs[0]: - asm.add(add(offset, additional_regs[0], offset_comment)) - else: - asm.add(add(addr.disp, additional_regs[0], offset_comment, addr.base)) - prev_disp = addr.disp - addr.base = additional_regs[0] - addr.disp = 0 - prev_base = additional_regs[0] - - if store: - asm.add(st(registers[ir, ic], addr, True, comment, pred=p, scalar_offs=False, - add_reg=additional_regs[2])) - # perform prefetching after a store instruction, similar to KNL case - if prefetching: - addr, comment = 
pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.size() * load_offset - if prev_disp > 0: - asm.add(add(prev_disp, additional_regs[3], "increment the prefetch register", addr.base)) - asm.add(prefetch(mem(additional_regs[3] if prev_disp > 0 else addr.base, addr.disp - prev_disp), - "", p, prec, access_type="r", closeness="L2", temporality="KEEP")) - else: - asm.add(ld(addr, registers[ir, ic], True, comment, pred=p_zeroing, is_B=is_B, scalar_offs=False, - add_reg=additional_regs[2])) - - return asm - - def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: - - rows, cols = registers.shape - asm = block("zero registers") - - for ic in range(cols): - for ir in range(rows): - asm.add(mov(0, registers[ir, ic], True)) - - return asm - - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size: int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. - """ - - asm = block("Block GEMM microkernel") - """block_row, block_col, (start)index, pattern_matrix (true/false)""" - bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) - bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) - - # tell sparse_mask() that we use sve - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) - - bs = [] - cur11 = -10000 - Vm = max(self.ceil_div(bm, v_size), 1) - - multiple = self.precision.size() - # for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252 - # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 - # in both cases: instruction encodes the immediate offset within 6 bits - max_offs = 2047 - - for Vmi in range(Vm): - # set to all v_size predicates to true, we want to replicate a B element into a whole vector - for bni in range(bn): # inside this n-block - for bki in range(bk): # inside this k-block - to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): - B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) - if B_regs[bki, bni] not in bs: - - # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value - if B_cell_addr.disp > max_offs: - moved = B_cell_addr.disp - cur11 - if moved > 0 and moved <= max_offs: - B_cell_addr.disp = moved - else: - asm.add(add(B_cell_addr.disp, additional_regs[0], "", B_cell_addr.base)) - cur11 = B_cell_addr.disp - B_cell_addr.disp = 0 - - B_cell_addr.base = additional_regs[0] - - asm.add(ld(B_cell_addr, B_regs[bki, bni], False, B_comment, pred=None, is_B=True)) - bs.append(B_regs[bki, bni]) - - for bki in range(bk): # inside this k-block - for Vmi in range(Vm): - p_merging = self.pred_n_trues(bm - Vmi * v_size, v_size, "m") - end_index = bm if Vmi + 1 == Vm else Vmi * v_size + v_size # 
end_index helps us print the right index ranges - for bni in range(bn): # inside this n-block - to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): - _, B_comment = B.look(B_ptr, to_B_block, to_bcell) - comment = f"C[{Vmi * v_size}:{end_index},{bni}] += A[{Vmi * v_size}:{end_index},{bki}]*{B_comment}" - - asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, pred=p_merging, bcast=True, sub=sub)) - return asm diff --git a/pspamm/codegen/ccode.py b/pspamm/codegen/ccode.py deleted file mode 100644 index 7ca2b6d..0000000 --- a/pspamm/codegen/ccode.py +++ /dev/null @@ -1,30 +0,0 @@ -from pspamm.codegen.ast import * -from pspamm.codegen.analysis import * -from pspamm.codegen.precision import * - -import pspamm.architecture - - -def make_cfunc(funcName:str, template:str, body:Block, flop:int, starting_regs:List[Register], precision: Precision) -> str: - Printer_class = pspamm.architecture.get_class("pspamm.codegen.architectures." + pspamm.architecture.arch + ".inlineprinter").InlinePrinter - - printer = Printer_class(precision) - printer.lmargin = 4 - body.accept(printer) - body_text = "\n".join(printer.output) - - analyzer = Analyzer(starting_regs) - analyzer.collect(body) - regs = set(f'"{reg.clobbered}"' for reg in analyzer.clobbered_registers if reg.clobbered is not None) - regs.add('"memory"') - regs.add('"cc"') - # TODO: maybe regs.add('"redzone"') ? - clobbered = ", ".join(sorted(regs)) - arglist = ", ".join(sorted(arg.arg for arg in analyzer.input_operands)) - return template.format(funcName = funcName, - body_text = body_text, - args = arglist, - clobbered = clobbered, - flop = flop, - real_type = Precision.getCType(precision)) - diff --git a/pspamm/codegen/forms.py b/pspamm/codegen/forms.py deleted file mode 100644 index 1b8c44a..0000000 --- a/pspamm/codegen/forms.py +++ /dev/null @@ -1,95 +0,0 @@ - -from typing import List -from pspamm.codegen.sugar import * - -# TODO: We might eventually want to make this part of our syntax tree -# in order to do unrolls and other fancy stuff with it -class Loop(Block): - - _labels = [] - def __init__(self, - iteration_var: Register, - final_val: int, - body_contents: Block = None, - unroll: int = 1, - overlap: bool = False - ) -> None: - - self.iteration_var = iteration_var - self.final_val = final_val - self.body_contents = body_contents - self.unroll = unroll - self.may_overlap = overlap - - self.comment = f'loop {self.iteration_var.ugly} in range({self.final_val}), unroll {self.unroll}' - - @property - def contents(self): - self.label = "loop_top_" + str(len(Loop._labels)) - Loop._labels.append(self.label) - - onestep = [*(self.body_contents.contents)] - body = [] - rest = [] - for _ in range(self.unroll): - body += onestep - - for _ in range(self.final_val % self.unroll): - rest += onestep - - true_final_val = (self.final_val // self.unroll) * self.unroll - - allcode = [] - if true_final_val == self.unroll: - allcode += body - elif true_final_val > self.unroll: - allcode += [mov(-true_final_val, self.iteration_var, vector=False), - label(self.label)] + body + [add(self.unroll, self.iteration_var), - jump(self.label, self.iteration_var, backwards=True)] - allcode += rest - - return allcode - - def body(self, *args): - self.body_contents = block("Loop body", *args) - return self - - def normalize(self): - yield loop(self.iteration_var, self.final_val, self.unroll, 
self.may_overlap).body(*[substmt for stmt in self.body_contents.contents for substmt in stmt.normalize()]) - - def __str__(self): - return f'loop {self.iteration_var.ugly} in range({self.final_val}), unroll {self.unroll}' + '{\n' + '\n'.join(str(content) for content in self.body_contents.contents) + '\n}' - -def loop(iter_var, final_val, unroll=1, overlap=False): - return Loop(iter_var, final_val, unroll=unroll, overlap=overlap) - -class Skip(Block): - - _labels = [] - def __init__(self, - skipreg: Register - ) -> None: - - self.skipreg = skipreg - - self.comment = f'if {self.checkreg} != 0' - - @property - def contents(self): - self.label = "skip_" + str(len(Loop._labels)) - Loop._labels.append(self.label) - - return [jump(self.label, self.skipreg, backwards=True)] + body + [label(self.label)] - - def body(self, *args): - self.body_contents = block("Skip body", *args) - return self - - def normalize(self): - yield skip(self.checkreg).body(*[substmt for stmt in self.body_contents.contents for substmt in stmt.normalize()]) - - def __str__(self): - return f'if {self.checkreg} != 0' + '{\n' + '\n'.join(str(content) for content in self.body_contents.contents) + '\n}' - -def skip(checkreg): - return Skip(checkreg) diff --git a/pspamm/codegen/generator.py b/pspamm/codegen/generator.py deleted file mode 100644 index 1eeeca5..0000000 --- a/pspamm/codegen/generator.py +++ /dev/null @@ -1,76 +0,0 @@ -from pspamm.cursors import * -from pspamm.codegen.ast import * -from pspamm.codegen.precision import * -from abc import ABC, abstractmethod - -class AbstractGenerator(ABC): - def __init__(self, precision: Precision): - self.precision = precision - - def get_precision(self): - return self.precision - - def set_sparse(self): - pass - - # taken from https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python - def ceil_div(self, n, d): - return -(n // -d) - - @abstractmethod - def init_mask(self, bm, v_size, tempreg, maskreg): - pass - - @abstractmethod - def use_broadcast(self): - pass - - @abstractmethod - def has_masks(self): - pass - - @abstractmethod - def get_v_size(self): - pass - - @abstractmethod - def get_template(self): - pass - - @abstractmethod - def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): - pass - - @abstractmethod - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False - ) -> Block: - pass - - @abstractmethod - def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: - pass - - @abstractmethod - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size:int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - pass diff --git a/pspamm/codegen/precision.py b/pspamm/codegen/precision.py deleted file mode 100644 index 417c9a6..0000000 --- a/pspamm/codegen/precision.py +++ /dev/null @@ -1,31 +0,0 @@ -from enum import Enum - -class Precision(Enum): - DOUBLE = 8 - SINGLE = 4 - HALF = 2 - BFLOAT16 = 2.1 - - @classmethod - def getCType(cls, precision): - ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float', cls.HALF: 'uint16_t', cls.BFLOAT16: 'uint16_t'} - return ctype[precision] - - def ctype(self): - return self.getCType(self) - - def size(self): 
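# Editor's note on the mapping below: the Precision enum values above double
# as byte sizes, except BFLOAT16, whose sentinel value 2.1 exists only to keep
# it distinct from HALF; size() therefore restores its true 2-byte width
# (and the raise after the dict lookup below is unreachable).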
- return { - self.DOUBLE: 8, - self.SINGLE: 4, - self.HALF: 2, - self.BFLOAT16: 2 - }[self] - raise NotImplementedError() - - def __repr__(self): - return self.getCType(self) - - def __str__(self): - return self.getCType(self) - diff --git a/pspamm/cursors/__init__.py b/pspamm/cursors/__init__.py deleted file mode 100644 index 5805965..0000000 --- a/pspamm/cursors/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from pspamm.cursors.matrix import Matrix -from pspamm.cursors.coords import Coords -from pspamm.cursors.abstractcursor import BlockInfo, CursorLocation, Cursor -from pspamm.cursors.blockcursor import BlockCursor, sparse_mask -from pspamm.cursors.densecursor import DenseCursor - diff --git a/pspamm/cursors/blockcursor.py b/pspamm/cursors/blockcursor.py deleted file mode 100644 index b9109b4..0000000 --- a/pspamm/cursors/blockcursor.py +++ /dev/null @@ -1,215 +0,0 @@ -from pspamm.cursors.abstractcursor import * -from pspamm.cursors.matrix import Matrix -from pspamm.cursors.coords import Coords - -from pspamm.codegen.sugar import * -from typing import cast - -class BlockCursor(Cursor): - - blocks = None - patterns = None - offsets = None - - def __init__(self, - name: str, - base_ptr: Register, - rows: int, - cols: int, - ld: int, - block_rows: int, - block_cols: int, - scalar_bytes:int, - blocks: Matrix[int], - patterns: List[Matrix[bool]], - mtx_overhead) -> None: - - self.name = name - self.base_ptr = base_ptr - self.scalar_bytes = scalar_bytes - self.r = rows - self.c = cols - self.ld = ld - self.br = block_rows - self.bc = block_cols - self.blocks = blocks - self.patterns = patterns - - self.offsets = Matrix.full(rows, cols, -1) - x = 0 - for i in range(self.c): - for j in range(self.r): - Bci = i // self.bc - Bri = j // self.br - index = cast(int, blocks[Bri, Bci]) - pattern = patterns[index] - if pattern[j % self.br,i % self.bc]: - self.offsets[j, i] = x - x += 1 - if ld != 0: - x += self.ld - self.r - x += mtx_overhead[i] - - def offset(self, - src_loc: CursorLocation, - dest_loc: CursorLocation - ) -> int: - - src_block = src_loc.current_block - src_cell = src_loc.current_cell - dest_block = dest_loc.current_block - dest_cell = dest_loc.current_cell - - if not dest_block.absolute: - dest_block += src_block - - assert(src_block.absolute) - assert(not src_cell.absolute) - assert(not dest_cell.absolute) - - src_cell += Coords(src_block.down*self.br, src_block.right*self.bc, True) - dest_cell += Coords(dest_block.down*self.br, dest_block.right*self.bc, True) - - src_offset = self.offsets[src_cell.down, src_cell.right] - dest_offset = self.offsets[dest_cell.down, dest_cell.right] - - if (src_offset == -1 or dest_offset == -1): - raise Exception("Cursor location does not exist in memory!") - - return dest_offset - - - def move(self, - src_loc: CursorLocation, - dest_block: Coords - ) -> Tuple[AsmStmt, CursorLocation]: - - comment = f"Move {self.name} to {str(dest_block)}" - - if dest_block.absolute: - dest_loc = self.start_location(dest_block) - else: - dest_loc = self.start_location(dest_block + src_loc.current_block) - - offset_bytes = self.offset(src_loc, dest_loc) * self.scalar_bytes - - return add(offset_bytes, self.base_ptr, comment), dest_loc - - - def look(self, - src_loc: CursorLocation, - dest_block: Coords, - dest_cell: Coords - ) -> Tuple[MemoryAddress, str]: - - dest_loc = CursorLocation(dest_block, dest_cell) - offset_bytes = self.offset(src_loc, dest_loc) * self.scalar_bytes - comment = 
f"{self.name}[{dest_block.down},{dest_block.right}][{dest_cell.down},{dest_cell.right}]" - - addr = pspamm.architecture.operands.mem(self.base_ptr, offset_bytes) - - return (addr, comment) - - - def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockInfo: - - if src is None: # Have dest_block but no src - assert(dest_block is not None) - assert(dest_block.absolute == True) - block_abs = dest_block - - elif dest_block is None: # Have src but no dest_block - assert(src.current_block.absolute == True) - block_abs = src.current_block - - elif dest_block.absolute: # Have src and absolute dest_block - block_abs = dest_block - - else: # Have both src and relative dest_block - assert(src.current_block.absolute == True) - block_abs = dest_block + src.current_block - - - br = self.br if block_abs.down < self.Br else self.brf #TODO: Verify these - bc = self.bc if block_abs.right < self.Bc else self.bcf - index = self.blocks[block_abs.down, block_abs.right] - index = cast(int, index) # TODO: Overload functions correctly - pattern = self.patterns[index][0:br, 0:bc] - pattern = cast(Matrix[bool], pattern) - return BlockInfo(br, bc, index, pattern) - - - def has_nonzero_cell(self, - src_loc: CursorLocation, - dest_block: Coords, - dest_cell: Coords - ) -> bool: - - assert(not dest_cell.absolute) - if not dest_block.absolute: - dest_block += src_loc.current_block - - dest_cell += Coords(dest_block.down*self.br, dest_block.right*self.bc, True) - return self.offsets.shape[0] > dest_cell.down and self.offsets.shape[1] > dest_cell.right and self.offsets[dest_cell.down, dest_cell.right] != -1 - - - def has_nonzero_block(self, src: CursorLocation, dest_block: Coords) -> bool: - nonzero = False - br,bc,idx,pat = self.get_block(src, dest_block) - for bci in range(bc): - for bri in range(br): - if pat[bri,bci]: - nonzero = True - return nonzero - - - def start_location(self, dest_block: Coords = Coords(absolute=True)) -> CursorLocation: - - assert(dest_block.absolute == True) - br,bc,idx,pat = self.get_block(dest_block=dest_block) - for bci in range(bc): - for bri in range(br): - if pat[bri,bci]: - return CursorLocation(dest_block, Coords(down=bri, right=bci, absolute=False)) - - raise Exception(f"Block {dest_block} has no starting location because it is empty!") - - - def start(self) -> CursorLocation: - - Br, Bc = self.blocks.shape - for Bci in range(Bc): - for Bri in range(Br): - target_block = Coords(down=Bri, right=Bci, absolute=True) - if self.has_nonzero_block(None, target_block): - return self.start_location(target_block) - raise Exception("Matrix is completely empty!") - - -def sparse_mask(A_regs: Matrix[Register], - A: Cursor, - A_ptr: CursorLocation, - A_block_offset: Coords, - B: Cursor, - B_ptr: CursorLocation, - B_block_offset: Coords, - v_size: int, - has_mask: bool = False - ) -> Matrix[bool]: - - Vr, Vc = A_regs.shape - mask = Matrix.full(Vr, Vc, False) - A_br, A_bc, A_idx, A_pat = A.get_block(A_ptr, A_block_offset) - B_br, B_bc, B_idx, B_pat = B.get_block(B_ptr, B_block_offset) - - if not has_mask: - assert (A_br % v_size == 0) # bm must tile m exactly for now in non-mask-supporting ISAs - assert(Vc >= A_bc) # Matrix block must fit in register block - assert(A_bc == B_br) # Matrix blocks are compatible - - # Mask out registers not used in current block, including zero-rows of B and A - for Vci in range(A_bc): - if B_pat[Vci,:].any(axis=1): - mask[:,Vci] = A_pat[:,Vci] - - return mask diff --git a/pspamm/matmul.py b/pspamm/matmul.py deleted file mode 100644 index 
e856ba6..0000000 --- a/pspamm/matmul.py +++ /dev/null @@ -1,482 +0,0 @@ -from typing import Tuple - -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.forms import * -from pspamm.codegen.precision import * - -from pspamm.cursors import * - -from pspamm.codegen.virtual import * -from pspamm.codegen.prune import * - -import pspamm.architecture -import numpy - - -def decompose_pattern(k, n, pattern:Matrix[bool], bk:int, bn:int) -> Tuple[Matrix[int], List[Matrix[bool]]]: - Bk,Bn = k//bk, n//bn - patterns = [] - x = 0 - - n_overhead = n % bn - k_overhead = k % bk - - if n_overhead > 0: - Bn += 1 - if k_overhead > 0: - Bk += 1 - - blocks = Matrix.full(Bk,Bn,-1) - - for Bni in range(Bn): - for Bki in range(Bk): - if Bni + 1 == Bn and n_overhead > 0 and Bki + 1 == Bk and k_overhead > 0: - block = pattern[(Bki*bk):((Bki+1)*bk+k_overhead), (Bni*bn):((Bni)*bn+n_overhead)] - elif Bni + 1 == Bn and n_overhead > 0: - block = pattern[(Bki*bk):((Bki+1)*bk), (Bni*bn):((Bni)*bn+n_overhead)] - elif Bki + 1 == Bk and k_overhead > 0: - block = pattern[(Bki*bk):((Bki+1)*bk+k_overhead), (Bni*bn):((Bni+1)*bn)] - else: - block = pattern[(Bki*bk):((Bki+1)*bk), (Bni*bn):((Bni+1)*bn)] - - blocks[Bki,Bni] = x - x += 1 - patterns.append(block) - - mtx_overhead = [0] * n - - for i in range(n): - for j in range(k, pattern.rows): - if pattern[j, i]: - mtx_overhead[i] += 1 - - return blocks, patterns, mtx_overhead - -class MatMul: - def __init__(self, - m: int, - n: int, - k: int, - lda: int, - ldb: int, - ldc: int, - alpha: str, - beta: str, - mtx_filename: str, - amtx_filename: str, - bmtx_filename: str, - mtx_format: str = 'any', - output_funcname: str = None, - output_filename: str = None, - output_overwrite: bool = False, - bm: int = None, - bn: int = None, - bk: int = None, - arch: str = 'knl', - precision: str = 'd', - prefetching: str = None, - **kwargs # Accept and ignore args which don't belong - ) -> None: - - self.m = m - self.n = n - self.k = k - - self.lda = lda - self.ldb = ldb - self.ldc = ldc - - try: - self.alpha = float(alpha) - except: - self.alpha = 'generic' - try: - self.beta = float(beta) - except: - self.beta = 'generic' - - if arch.startswith('skx'): - arch = 'knl' + arch[3:] - - # hacky implementation of multi-register length - if arch.startswith('arm_sve'): - if len(arch) == 7: - v_len_regs = 4 # compatibility: arm_sve == arm_sve512 - else: - v_len_bits = int(arch[7:]) - assert v_len_bits % 128 == 0 and v_len_bits <= 2048 - v_len_regs = v_len_bits // 128 - arch = 'arm_sve' - - if arch.startswith('knl'): - if len(arch) == 3: - v_len_regs = 4 - else: - v_len_bits = int(arch[3:]) - assert v_len_bits in (128, 256, 512) - v_len_regs = v_len_bits // 128 - arch = 'knl' - - if arch.startswith('hsw'): - if len(arch) == 3: - v_len_regs = 2 - else: - v_len_bits = int(arch[3:]) - assert v_len_bits in (128, 256) - v_len_regs = v_len_bits // 128 - arch = 'hsw' - - if arch.startswith('rvv'): - if len(arch) == 3: - v_len_regs = 1 - else: - v_len_bits = int(arch[3:]) - assert v_len_bits in (128, 256, 512, 1024, 2048, 4096, 8192) - v_len_regs = v_len_bits // 128 - arch = 'rvv' - - if arch.startswith('arm') and not arch.startswith('arm_sve'): - # only 128 supported - v_len_regs = 1 - arch = 'arm' - - if arch.startswith('lsx'): - if len(arch) == 3: - v_len_regs = 1 - else: - v_len_bits = int(arch[3:]) - assert v_len_bits in (128, 256) - v_len_regs = v_len_bits // 128 - arch = 'lsx' - - if arch.startswith('lasx'): - if len(arch) == 4: - v_len_regs = 2 - else: - v_len_bits = 
int(arch[4:]) - assert v_len_bits in (128, 256) - v_len_regs = v_len_bits // 128 - arch = 'lsx' - - self.arch = arch - assert precision.lower() in ['bf16', 'h', 's', 'd'] - self.precision = { - 'h' : Precision.HALF, - 's' : Precision.SINGLE, - 'd' : Precision.DOUBLE, - 'bf16' : Precision.BFLOAT16 - }[precision.lower()] - - pspamm.architecture.init() - pspamm.architecture.arch = arch - pspamm.architecture.Generator = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".generator").Generator - pspamm.architecture.operands = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".operands") - pspamm.architecture.blocksize = pspamm.architecture.get_class("pspamm.codegen.architectures." + arch + ".blocksize").Default - - self.generator = pspamm.architecture.Generator(self.precision) - - # flag that determines if a matmul kernel uses sve instructions -> needed for sve predicates - self.masks = self.generator.has_masks() - # define which architectures need to use an explicit broadcast, necessary for alpha/beta values - self.use_bcst = self.generator.use_broadcast() - - self.generator.v_len = v_len_regs - - self.v_size = self.generator.get_v_size() - - if bk == None: - bk = 2 if arch == 'knl' else 1 - - if bm == None or bn == None: - (self.bm, self.bn, self.bk) = pspamm.architecture.blocksize.getBlocksize(m, n, bk, self.v_size, self.precision) - else: - self.bm = bm - self.bn = bn - self.bk = bk - - self.prefetching = prefetching - - self.output_funcname = output_funcname - self.output_filename = output_filename - self.output_overwrite = output_overwrite - - if ldb == 0: - if bmtx_filename is None or bmtx_filename == '': - bmtx_filename = mtx_filename - bpattern = Matrix.load(bmtx_filename) - self.generator.set_sparse() - else: - bpattern = Matrix.full(k, n, True) - assert self.k <= ldb - - if lda == 0: - apattern = Matrix.load(amtx_filename) - self.generator.set_sparse() - else: - apattern = Matrix.full(m, k, True) - assert self.m <= lda - - self.bmtx_filename = bmtx_filename - self.amtx_filename = amtx_filename - self.mtx_format = mtx_format - - assert self.m <= ldc - - self.bnnz = bpattern.nnz() - self.annz = apattern.nnz() - - # compute flops by splitting into outer products over k - kannz = apattern.nnz(1) - kbnnz = bpattern.nnz(0) - self.flop = 2 * sum(ka * kb for ka,kb in zip(kannz, kbnnz)) - - # if matrices are always padded to multiple of v_size, we can remove the if-part and execute the assert for SVE too - if not self.masks: - assert(self.m % self.v_size == 0) - - self.A_regs, self.B_regs, self.C_regs, self.starting_regs, self.alpha_reg, self.beta_reg, self.loop_regs, self.additional_regs, self.mask_regs, self.prefetch_reg = self.generator.make_reg_blocks(self.bm, self.bn, self.bk, self.v_size, self.bnnz, self.m, self.n, self.k, self.prefetching) - - self.A_pool = RegisterPool([self.A_regs[i,j] for i in range(self.A_regs.shape[0]) for j in range(self.A_regs.shape[1])]) - self.B_pool = RegisterPool([self.B_regs[i,j] for i in range(self.B_regs.shape[0]) for j in range(self.B_regs.shape[1])]) - self.C_pool = RegisterPool([self.C_regs[i,j] for i in range(self.C_regs.shape[0]) for j in range(self.C_regs.shape[1])]) - - self.alpha_bcst_reg, self.beta_bcst_reg = self.starting_regs[3], self.starting_regs[4] - - if lda == 0: - blocks, patterns, mtx_overhead = decompose_pattern(self.m, self.k, apattern, self.bm, self.bk) - self.A = BlockCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size(), blocks, patterns, 
mtx_overhead) - self.annz += sum(mtx_overhead) - else: - self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size()) - if ldb == 0: - blocks, patterns, mtx_overhead = decompose_pattern(self.k, self.n, bpattern, self.bk, self.bn) - self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size(), blocks, patterns, mtx_overhead) - self.bnnz += sum(mtx_overhead) - else: - self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size()) - self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) - self.C_pf = DenseCursor("C_pf", self.starting_regs[5], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) if self.prefetch_reg else None - - self.unroll_n = ldb == 0 - self.unroll_m = lda == 0 - - # use unused loop registers for scaling instead - if self.unroll_m: - self.additional_regs += [self.loop_regs[0]] - if self.unroll_n: - self.additional_regs += [self.loop_regs[1]] - if self.unroll_m or self.unroll_n: - self.additional_regs += [self.loop_regs[2]] - - def microkernel(self, asm, Bmi, Bni, unroll, A_ptr, B_ptr, C_ptr, C_pf_ptr): - Bn = self.n // self.bn - Bk = self.k // self.bk - Bm = self.m // self.bm - - vm = self.generator.ceil_div(self.bm, self.v_size) - - n_overhead = self.n % self.bn - k_overhead = self.k % self.bk - m_overhead = self.m % self.bm - vm_overhead = -(m_overhead // -self.v_size) - - if n_overhead > 0: - Bn += 1 - if k_overhead > 0: - Bk += 1 - if m_overhead > 0: - Bm += 1 - - regs = Matrix([[VirtualRegister(self.C_regs[0,0].typeinfo, self.C_pool) for _ in range(self.C_regs.shape[1])] for _ in range(self.C_regs.shape[0])]) - - BnEnd = Bni + 1 == Bn - BmEnd = Bmi + 1 == Bm - - if BnEnd and n_overhead > 0: - regs = regs[:, :n_overhead] - if BmEnd and m_overhead > 0: - regs = regs[:vm_overhead, :] - - C_ptr_in = CursorLocation(Coords(right=Bni, down=Bmi, absolute=True)) - to_C = Coords() - C_ptr_pf_in = C_ptr_in - - if self.alpha in [-1.0, 1.0] and self.beta != 0.0: - asm.add(self.generator.move_register_block(self.C, C_ptr_in, to_C, regs, self.v_size, self.additional_regs, None, False)) - if self.beta != 1.0: - if self.use_bcst: - asm.add(bcst(self.beta_bcst_reg, self.beta_reg[1], "Broadcast beta")) - for ic in range(regs.shape[1]): - for ir in range(regs.shape[0]): - pred_m = None if not self.masks else self.generator.pred_n_trues(self.bm - ir * self.v_size, self.v_size, "m") - asm.add(mul(regs[ir,ic], self.beta_reg[1], regs[ir,ic], "C = beta * C", pred=pred_m)) - else: - asm.add(self.generator.make_zero_block(regs, self.additional_regs)) - - def kernelK(asm, Bki): - if unroll: - # adjust registers if necessary for the last operation - - if BmEnd and m_overhead > 0 and not self.unroll_m: - A_ptr_in = CursorLocation(Coords(right=0, down=Bmi, absolute=True)) - else: - A_ptr_in = A_ptr - to_A = Coords(right=Bki, down=Bmi, absolute=True) if self.unroll_m else Coords(right=Bki) - - if BnEnd and n_overhead > 0 and not self.unroll_n: - B_ptr_in = CursorLocation(Coords(down=0, right=Bni, absolute=True)) - else: - B_ptr_in = B_ptr - to_B = Coords(right=Bni, down=Bki, absolute=True) if self.unroll_n else Coords(down=Bki) - keep = (not self.unroll_n or self.B.has_nonzero_block(B_ptr_in, to_B)) and (not self.unroll_m or self.A.has_nonzero_block(A_ptr_in, to_A)) - else: - # setting A_ptr, B_ptr here may be a bit too hacky... 
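# In this non-unrolled branch, kernelK re-points both cursors at absolute
# block coordinates on every call (right=Bki, down=Bmi for A; right=Bni,
# down=Bki for B), so the relative offsets to_A/to_B stay empty and keep
# is unconditionally True.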
- A_ptr_in = CursorLocation(Coords(right=Bki, down=Bmi, absolute=True)) - B_ptr_in = CursorLocation(Coords(right=Bni, down=Bki, absolute=True)) - to_A = Coords() - to_B = Coords() - keep = True - - sub = self.alpha == -1.0 - - if keep: - A_regs = Matrix([[VirtualRegister(self.A_regs[0,0].typeinfo, self.A_pool) for _ in range(self.A_regs.shape[1])] for _ in range(self.A_regs.shape[0])]) - B_regs = Matrix([[VirtualRegister(self.B_regs[0,0].typeinfo, self.B_pool) for _ in range(self.B_regs.shape[1])] for _ in range(self.B_regs.shape[0])]) - asm.add(self.generator.make_microkernel(self.A, self.B, A_ptr_in, B_ptr_in, A_regs, B_regs, regs, self.v_size, self.additional_regs, to_A, to_B, sub)) - - self.loopwrap(asm, kernelK, Bk, k_overhead > 0, unroll, self.loop_regs[2], [self.A, self.B], [A_ptr, B_ptr], ['right', 'down'], loopunroll=1, overlap=True) - - if self.alpha not in [-1.0, 1.0]: - store_block = block("") - - if self.use_bcst: - store_block.add(bcst(self.alpha_bcst_reg, self.alpha_reg[1], "Broadcast alpha")) - if self.beta != 0.0 and self.beta != 1.0: - store_block.add(bcst(self.beta_bcst_reg, self.beta_reg[1], "Broadcast beta")) - - for x in range(0, regs.shape[1], self.A_regs.shape[1]): - A_regs = Matrix([[VirtualRegister(self.A_regs[0,0].typeinfo, self.A_pool) for _ in range(self.A_regs.shape[1])] for _ in range(self.A_regs.shape[0])]) - A_regs_cut = A_regs[0:min(self.A_regs.shape[0], regs.shape[0]), 0:regs.shape[1]-x] - if self.beta != 0.0: - store_block.add(self.generator.move_register_block(self.C, C_ptr_in, to_C, A_regs_cut, self.v_size, self.additional_regs, None, False, None, self.ldc * x)) - - for ir in range(A_regs_cut.shape[0]): - for ic in range(A_regs_cut.shape[1]): - pred_m = None if not self.masks else self.generator.pred_n_trues(self.bm - ir*self.v_size, self.v_size, "m") - if self.beta != 0.0 and self.beta != 1.0: - store_block.add(mul(A_regs_cut[ir,ic], self.beta_reg[1], A_regs_cut[ir,ic], "C = beta * C + alpha * AB", pred=pred_m)) - - if self.beta == 0.0: - store_block.add(mul(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = alpha * AB", pred=pred_m)) - else: - store_block.add(fma(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", None, pred=pred_m)) - store_block.add(self.generator.move_register_block(self.C, C_ptr_in, to_C, A_regs_cut, self.v_size, self.additional_regs, None, True, self.prefetching, self.ldc * x, self.C_pf, C_pf_ptr)) - asm.add(store_block) - else: - asm.add(self.generator.move_register_block(self.C, C_ptr_in, to_C, regs, self.v_size, self.additional_regs, None, True, self.prefetching, 0, self.C_pf, C_pf_ptr)) - - def blockloop(self, asm, A_ptr, B_ptr, C_ptr, C_pf_ptr): - Bn = self.n // self.bn - Bk = self.k // self.bk - Bm = self.m // self.bm - - vm = self.generator.ceil_div(self.bm, self.v_size) - - n_overhead = self.n % self.bn - k_overhead = self.k % self.bk - m_overhead = self.m % self.bm - vm_overhead = -(m_overhead // -self.v_size) - - if n_overhead > 0: - Bn += 1 - if k_overhead > 0: - Bk += 1 - if m_overhead > 0: - Bm += 1 - - argsA = [Bm, m_overhead > 0, self.unroll_m, self.loop_regs[0], [self.A], [A_ptr], ['down']] - argsB = [Bn, n_overhead > 0, self.unroll_n, self.loop_regs[1], [self.B], [B_ptr], ['right']] - - if self.unroll_n and not self.unroll_m: - # swap loops - outerArgs, innerArgs = (argsB, argsA) - dirC, dirC2 = ('down', 'right') - args = lambda i,j: (j,i) - else: - outerArgs, innerArgs = (argsA, argsB) - dirC, dirC2 = ('right', 'down') - args = lambda i,j: (i,j) - - unroll_k = self.unroll_m 
| self.unroll_n - - def outerLoop(asm, i): - def innerLoop(asm, j): - Bmi, Bni = args(i, j) - self.microkernel(asm, Bmi, Bni, unroll_k, A_ptr, B_ptr, C_ptr, C_pf_ptr) - if j < innerArgs[0] - 1: - move_C, _ = self.C.move(C_ptr, Coords(**{dirC:1})) - asm.add(move_C) - if self.C_pf: - move_C_pf, _ = self.C_pf.move(C_pf_ptr, Coords(**{dirC:1})) - asm.add(move_C_pf) - overhead = self.loopwrap(asm, innerLoop, *innerArgs) - moveLength = 1-innerArgs[0] if overhead else -innerArgs[0] - asm.add(self.C.move(C_ptr, Coords(**{dirC2:1, dirC:moveLength}))[0]) - if self.C_pf: - asm.add(self.C_pf.move(C_pf_ptr, Coords(**{dirC2:1, dirC:moveLength}))[0]) - - self.loopwrap(asm, outerLoop, *outerArgs) - - def loopwrap(self, asm, inner, length, overhead, unroll, loopreg, matrices, ptrs, directions, loopunroll=1, overlap=False): - if unroll: - for i in range(length): - inner(asm, i) - return True - else: - def makeMove(dist): - asm = block(f"move by {dist}") - for matrix, ptr, direction in zip(matrices, ptrs, directions): - asm.add(matrix.move(ptr, Coords(**{direction:dist}))[0]) - return asm - def makeLoop(until): - loopblock = block("kernel") - inner(loopblock, 0) - loopblock.add(makeMove(1)) - return loop(loopreg, until, unroll=loopunroll, overlap=overlap).body(loopblock) - if length == 1: - inner(asm, 0) - return True - elif overhead: - if length > 1: - asm.add(makeLoop(length - 1)) - inner(asm, length - 1) - asm.add(makeMove(1-length)) - return True - else: - asm.add(makeLoop(length)) - asm.add(makeMove(-length)) - return False - - def make(self): - A_ptr = self.A.start() - B_ptr = self.B.start() - C_ptr = self.C.start() - C_pf_ptr = self.C_pf.start() if self.C_pf else None - - asm = block("kernel") - - asm.add(self.generator.make_argument_load(self.starting_regs, self.C_pf is not None)) - - asm.add(block("header", - self.generator.make_scaling_offsets(self.additional_regs, self.bnnz), - self.generator.init_mask(self.m, self.bm, self.v_size, self.loop_regs[0], self.mask_regs) - )) - - asm.add(self.generator.init_block(self.v_size)) - - self.blockloop(asm, A_ptr, B_ptr, C_ptr, C_pf_ptr) - - assignVirtualRegisters(asm, [self.A_pool, self.B_pool, self.C_pool]) - - return asm diff --git a/pspamm/metagen/arm.py b/pspamm/metagen/arm.py deleted file mode 100644 index 6c7436a..0000000 --- a/pspamm/metagen/arm.py +++ /dev/null @@ -1,10 +0,0 @@ - -def arm_basic(): - generator = MetaGenerator() - - generator.add_condition('', 'arm128') - generator.add_condition('svcntb() == 16', 'arm_sve128') - generator.add_condition('svcntb() == 32', 'arm_sve256') - generator.add_condition('svcntb() == 64', 'arm_sve512') - generator.add_condition('svcntb() == 128', 'arm_sve1024') - generator.add_condition('svcntb() == 256', 'arm_sve2048') diff --git a/pyproject.toml b/pyproject.toml index 7c04566..2a439aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,8 +12,6 @@ authors = [ { name = "Nathan Brei", email = "nathan.w.brei@gmail.com" }, { name = "Alex Puscas", email = "alex-puscas@gmx.de" }, { name = "David Schneller", email = "david.schneller@tum.de" }, - { name = "Lukas Krenz", email = "lukas@krenz.land" }, - { name = "Carsten Uphoff", email = "uphoff@in.tum.de" }, ] maintainers = [ { name = "David Schneller", email = "david.schneller@tum.de" }, @@ -27,8 +25,8 @@ dynamic = ["version", "readme", "dependencies"] [tool.setuptools.dynamic] readme = {file = ["README.md"], content-type = "text/markdown"} -version = {file = ["pspamm/VERSION"]} +version = {file = ["pypspamm/VERSION"]} dependencies = {file = ["requirements.txt"]} 
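# Only the module path changes below: the installed command is still invoked
# as pspamm-generator, but it now resolves into the renamed pypspamm package.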
[project.scripts] -pspamm-generator = "pspamm.cli:main" +pspamm-generator = "pypspamm.cli:main" diff --git a/pspamm/VERSION b/pypspamm/VERSION similarity index 100% rename from pspamm/VERSION rename to pypspamm/VERSION diff --git a/pypspamm/__init__.py b/pypspamm/__init__.py new file mode 100644 index 0000000..2aae60a --- /dev/null +++ b/pypspamm/__init__.py @@ -0,0 +1 @@ +from pypspamm import * diff --git a/pspamm/architecture.py b/pypspamm/architecture.py old mode 100755 new mode 100644 similarity index 89% rename from pspamm/architecture.py rename to pypspamm/architecture.py index c6f8f35..03e55b7 --- a/pspamm/architecture.py +++ b/pypspamm/architecture.py @@ -1,5 +1,6 @@ from importlib import import_module + def init(): global arch global generator @@ -8,5 +9,6 @@ def init(): generator = None operands = None -def get_class( kls ): + +def get_class(kls): return import_module(kls) diff --git a/pspamm/cli.py b/pypspamm/cli.py similarity index 51% rename from pspamm/cli.py rename to pypspamm/cli.py index ab594f1..1255acb 100755 --- a/pspamm/cli.py +++ b/pypspamm/cli.py @@ -2,17 +2,13 @@ import argparse -import pspamm.architecture +import pypspamm.architecture +from pypspamm.codegen.architectures import * +from pypspamm.codegen.ccode import * +from pypspamm.matmul import * +from pypspamm.metagen.metagen import * -from pspamm.matmul import * - -from pspamm.codegen.ccode import * -from pspamm.codegen.architectures import * - -from pspamm.metagen.metagen import * - - -mtx_formats = ['any','csc','csr','bsc','bsr','bcsc','bcsr'] +mtx_formats = ["any", "csc", "csr", "bsc", "bsr", "bcsc", "bcsr"] def generate(alg: MatMul) -> None: @@ -29,16 +25,21 @@ def generate(alg: MatMul) -> None: f.write(text) - def main() -> None: - parser = argparse.ArgumentParser(description='Generate a sparse matrix multiplication algorithm for C = alpha * A * B + beta * C.') + parser = argparse.ArgumentParser( + description="Generate a sparse matrix multiplication algorithm for C = alpha * A * B + beta * C." 
+ ) parser.add_argument("m", type=int, help="Number of rows of A and C") parser.add_argument("n", type=int, help="Number of cols of B and C") parser.add_argument("k", type=int, help="Number of cols of A, rows of B") - parser.add_argument("lda", type=int, help="Leading dimension of A (zero if A is sparse)") - parser.add_argument("ldb", type=int, help="Leading dimension of B (zero if B is sparse)") + parser.add_argument( + "lda", type=int, help="Leading dimension of A (zero if A is sparse)" + ) + parser.add_argument( + "ldb", type=int, help="Leading dimension of B (zero if B is sparse)" + ) parser.add_argument("ldc", type=int, help="Leading dimension of C") parser.add_argument("alpha", type=str, help="alpha, 1.0 or generic") @@ -50,23 +51,41 @@ def main() -> None: parser.add_argument("--bk", type=int, help="Size of k-blocks") parser.add_argument("--arch", help="Architecture", default="knl") - parser.add_argument("--precision", help="Precision of the matrix multiplication, either half (h), single (s), or double (d)", default="d") + parser.add_argument( + "--precision", + help="Precision of the matrix multiplication, either half (h), single (s), or double (d)", + default="d", + ) parser.add_argument("--prefetching", help="Prefetching") - parser.add_argument("--mtx_filename", help="Path to MTX file describing the sparse matrix") - parser.add_argument("--mtx_format", help="Constraint on sparsity pattern", choices=mtx_formats, default="Any") - - parser.add_argument("--amtx_filename", help="Path to MTX file describing the sparse matrix") - parser.add_argument("--bmtx_filename", help="Path to MTX file describing the sparse matrix") + parser.add_argument( + "--mtx_filename", help="Path to MTX file describing the sparse matrix" + ) + parser.add_argument( + "--mtx_format", + help="Constraint on sparsity pattern", + choices=mtx_formats, + default="Any", + ) + + parser.add_argument( + "--amtx_filename", help="Path to MTX file describing the sparse matrix" + ) + parser.add_argument( + "--bmtx_filename", help="Path to MTX file describing the sparse matrix" + ) parser.add_argument("--output_funcname", help="Name for generated C++ function") parser.add_argument("--output_filename", help="Path to destination C++ file") - parser.add_argument("--output_overwrite", action="store_true", help="Overwrite output file") + parser.add_argument( + "--output_overwrite", action="store_true", help="Overwrite output file" + ) args = parser.parse_args() alg = MatMul(**args.__dict__) generate(alg) + if __name__ == "__main__": main() diff --git a/pypspamm/codegen/__init__.py b/pypspamm/codegen/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pspamm/codegen/analysis.py b/pypspamm/codegen/analysis.py similarity index 81% rename from pspamm/codegen/analysis.py rename to pypspamm/codegen/analysis.py index 0157744..5ae84ca 100644 --- a/pspamm/codegen/analysis.py +++ b/pypspamm/codegen/analysis.py @@ -1,8 +1,9 @@ -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.sugar import * - from typing import List, Set +from pypspamm.codegen.sugar import * +from pypspamm.codegen.visitor import Visitor + + class Analyzer: def __init__(self, starting_regs: List[Register] = None): self.clobbered_registers = set(starting_regs) diff --git a/pypspamm/codegen/architectures/__init__.py b/pypspamm/codegen/architectures/__init__.py new file mode 100644 index 0000000..340b8ea --- /dev/null +++ b/pypspamm/codegen/architectures/__init__.py @@ -0,0 +1,12 @@ +from pypspamm.codegen.architectures.arm.generator import * +from 
pypspamm.codegen.architectures.arm.inlineprinter import * +from pypspamm.codegen.architectures.arm.operands import * +from pypspamm.codegen.architectures.arm_sve.generator import * +from pypspamm.codegen.architectures.arm_sve.inlineprinter import * +from pypspamm.codegen.architectures.arm_sve.operands import * +from pypspamm.codegen.architectures.hsw.generator import * +from pypspamm.codegen.architectures.hsw.inlineprinter import * +from pypspamm.codegen.architectures.hsw.operands import * +from pypspamm.codegen.architectures.knl.generator import * +from pypspamm.codegen.architectures.knl.inlineprinter import * +from pypspamm.codegen.architectures.knl.operands import * diff --git a/pypspamm/codegen/architectures/arm/__init__.py b/pypspamm/codegen/architectures/arm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pspamm/codegen/architectures/arm/blocksize.py b/pypspamm/codegen/architectures/arm/blocksize.py similarity index 69% rename from pspamm/codegen/architectures/arm/blocksize.py rename to pypspamm/codegen/architectures/arm/blocksize.py index c8e3740..2d98520 100644 --- a/pspamm/codegen/architectures/arm/blocksize.py +++ b/pypspamm/codegen/architectures/arm/blocksize.py @@ -1,20 +1,19 @@ - class Old: @classmethod - def getBlocksize(cls, m , n, bk, v_size, prec): + def getBlocksize(cls, m, n, bk, v_size, prec): bm = m bn = n - + if cls.ARM_condition(bm, bn, bk, v_size): - while cls.ARM_condition(bm, bn, bk+1, v_size): + while cls.ARM_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) while not cls.ARM_condition(bm, bn, bk, v_size): bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) - while cls.ARM_condition(bm, bn, bk+1, v_size): + while cls.ARM_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -36,7 +35,8 @@ def lowerToNextDiv(cls, m, n, bm, bn, v_size): def ARM_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm + bn*bk <= 32 + return (bn + bk) * vm + bn * bk <= 32 + class Max: @classmethod @@ -45,25 +45,25 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = 1 maxval = 0 - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): if cls.ARM_condition(i, j, bk, v_size): - if i*j > maxval: - maxval = i*j + if i * j > maxval: + maxval = i * j bm = i bn = j - while cls.ARM_condition(bm, bn, bk+1, v_size): + while cls.ARM_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) - @classmethod def ARM_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm + bn*bk <= 32 + return (bn + bk) * vm + bn * bk <= 32 + class MaxK: @classmethod @@ -74,15 +74,15 @@ def getBlocksize(cls, m, n, bk, v_size, prec): elem128 = 16 // prec.size() - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): if cls.ARM_condition(i, j, bk, v_size, elem128): - if i*j > maxval: - maxval = i*j + if i * j > maxval: + maxval = i * j bm = i bn = j - while cls.ARM_condition(bm, bn, bk+1, v_size, elem128): + while cls.ARM_condition(bm, bn, bk + 1, v_size, elem128): bk += 1 return (bm, bn, bk) @@ -92,7 +92,8 @@ def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division vm = -(bm // -v_size) vk = -(bk // -elem128) - return (bn+bk) * vm + bn*vk <= 32 + return (bn + bk) * vm + bn * vk <= 32 + class Cube: @classmethod @@ -103,12 +104,12 @@ def getBlocksize(cls, m, n, bk, v_size, prec): elem128 = 16 // prec.size() - for i in 
range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): for k in range(1, 200): if cls.ARM_condition(i, j, k, v_size, elem128): - if i*j*k > maxval: - maxval = i*j*k + if i * j * k > maxval: + maxval = i * j * k bm = i bn = j bk = k @@ -120,6 +121,7 @@ def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division vm = -(bm // -v_size) vk = -(bk // -elem128) - return (bn+bk) * vm + bn*vk <= 32 + return (bn + bk) * vm + bn * vk <= 32 + Default = MaxK diff --git a/pypspamm/codegen/architectures/arm/generator.py b/pypspamm/codegen/architectures/arm/generator.py new file mode 100644 index 0000000..4986fca --- /dev/null +++ b/pypspamm/codegen/architectures/arm/generator.py @@ -0,0 +1,413 @@ +from pypspamm.codegen.architectures.arm.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * + + +class Generator(AbstractGenerator): + + template = """ +void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, {real_type} alpha, {real_type} beta, const {real_type}* prefetch) {{ + __asm__ __volatile__( +{body_text} + : : {args} : {clobbered}); + + #ifndef NDEBUG + #ifdef _OPENMP + #pragma omp atomic + #endif + pspamm_num_total_flops += {flop}; + #endif +}} +""" + + def get_v_size(self): + return 16 // self.precision.size() + + def get_template(self): + return Generator.template + + def use_broadcast(self): + return True + + def has_masks(self): + return False + + def init_mask(self, m, bm, v_size, tempreg, maskregs): + return block("") + + def make_argument_load(self, starting_regs, prefetch): + asm = block("Load arguments") + asm.add(ld(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(ld(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(ld(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(ld(InputOperand(f"3", "m", "alpha"), starting_regs[3], False)) + asm.add(ld(InputOperand(f"4", "m", "beta"), starting_regs[4], False)) + if prefetch: + asm.add(ld(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) + return asm + + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): + assert bm % v_size == 0 + vm = bm // v_size + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) + assert (bn + bk) * vm + bn * vk <= 32 # Needs to fit in NEON v registers + + prec = { + Precision.DOUBLE: "2d", + Precision.SINGLE: "4s", + Precision.HALF: "8h", + }[self.get_precision()] + + A_regs = Matrix([[v(vm * c + r, prec) for c in range(bk)] for r in range(vm)]) + B_regs = Matrix( + [[v(vm * bk + bn * r + c, prec) for c in range(bn)] for r in range(vk)] + ) + C_regs = Matrix( + [[v(32 - vm * bn + vm * c + r, prec) for c in range(bn)] for r in range(vm)] + ) + + # get vector register number of the first vector in B_regs + b_reg = vm * bk + alpha_reg = [v(b_reg, prec), v(b_reg, prec)] + beta_reg = [v(b_reg + 1, prec), v(b_reg + 1, prec)] + + starting_regs = [r(0), r(1), r(2), r(3), r(4), r(5), r(11)] + + additional_regs = [r(8), xzr, r(10)] + + loop_regs = [r(12), r(13), r(14)] + + prefetch_reg = prefetch is not None + + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + [], + prefetch_reg, + ) + + def make_scaling_offsets(self, additional_regs: List[Register], nnz: 
int) -> Block: + + asm = block("No register based scaling") + return asm + + def init_block(self, size): + return block("") + + class LoadStoreLocation: + def __init__(self, addr, register, comment, pfaddr=None): + self.addr = addr + self.register = register + self.comment = comment + self.pfaddr = pfaddr + + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + ) -> Block: + + rows, cols = registers.shape + + locations = [] + for ic in range(cols): + for ir in range(rows): + if (mask is None) or (mask[ir, ic]): + all_coords = [ + Coords(down=ir * v_size + i, right=ic) for i in range(v_size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] + if not any(has_nonzero): + continue + elif any(has_nonzero) and not all(has_nonzero): + raise NotImplementedError( + "Element-wise sparsity in A is not yet implemented." + ) + + cell_offset = Coords(down=ir * v_size, right=ic) + addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) + addr.disp += self.precision.size() * load_offset + + if prefetching: + pfaddr, _ = pf_cursor.look( + pf_cursor_ptr, block_offset, cell_offset + ) + pfaddr.disp += self.precision.size() * load_offset + else: + pfaddr = None + locations += [ + self.LoadStoreLocation(addr, registers[ir, ic], comment, pfaddr) + ] + + return self.fuse_loadstore_block( + locations, store, cursor.name, block_offset, additional_regs + ) + + def fuse_loadstore_block( + self, locations, store, name, block_offset, additional_regs + ): + offsets = list( + sorted([(location.addr.disp, location) for location in locations]) + ) + + action = "Store" if store else "Load" + asm = block(f"{action} {name} register block @ {block_offset}") + + curpf = 0 + cur11 = -1000 + fuse_cache = [] + + def try_flush_cache(force, cur11): + if len(fuse_cache) == 0: + return + + if force: + op1 = fuse_cache[0] + op2 = fuse_cache[1] if len(fuse_cache) > 1 else None + op3 = fuse_cache[2] if len(fuse_cache) > 2 else None + op4 = fuse_cache[3] if len(fuse_cache) > 3 else None + + max_offset = [65520, 1008, 48, 64][len(fuse_cache) - 1] + div_offset = [16, 16, 24, 32][len(fuse_cache) - 1] + + comment = f"{op1.comment}" + if op2 is not None: + comment += f", {op2.comment}" + if op3 is not None: + comment += f", {op3.comment}" + if op4 is not None: + comment += f", {op4.comment}" + + offset = op1.addr.disp - cur11 if cur11 >= 0 else op1.addr.disp + + if cur11 >= 0: + op1.addr.disp = offset + op1.addr.base = additional_regs[0] + + if offset > max_offset or offset % div_offset != 0: + if cur11 < 0: + asm.add(add(offset, additional_regs[0], "", op1.addr.base)) + cur11 = offset + else: + asm.add(add(offset, additional_regs[0], "")) + cur11 += offset + op1.addr.disp = 0 + op1.addr.base = additional_regs[0] + + op1r = op1.register + op2r = op2.register if op2 is not None else None + op3r = op3.register if op3 is not None else None + op4r = op4.register if op4 is not None else None + + if store: + asm.add( + st( + op1r, + op1.addr, + True, + comment, + src2=op2r, + src3=op3r, + src4=op4r, + ) + ) + else: + asm.add( + ld( + op1.addr, + op1r, + True, + comment, + dest2=op2r, + dest3=op3r, + dest4=op4r, + ) + ) + + fuse_cache.clear() + + return cur11 + + for _, location in offsets: + 
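# Walk the displacement-sorted locations: a location may join the current
# fuse group only if it sits exactly 16 bytes (one NEON register) past the
# previous one; the group is then emitted as a single ldr/ldp/ld1 (or the
# store equivalents) once the run breaks or hits the 3-register cap that
# the TODO below would raise to 4.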
if len(fuse_cache) > 0: + can_fuse = location.addr.disp == fuse_cache[-1].addr.disp + 16 + + # TODO: extend to 4? + max_length = len(fuse_cache) == 2 + + cur11 = try_flush_cache(not can_fuse or max_length, cur11) + + fuse_cache += [location] + + if location.pfaddr is not None: + if location.pfaddr.disp - curpf >= 32768: + asm.add( + add( + location.pfaddr.disp, + additional_regs[2], + "increment the prefetch register", + location.pfaddr.base, + ) + ) + curpf = location.pfaddr.disp + if curpf > 0: + reg = additional_regs[2] + disp = location.pfaddr.disp - curpf + else: + reg = location.pfaddr.base + disp = location.pfaddr.disp + asm.add( + prefetch( + mem(reg, disp), + "", + access_type="LD", + closeness="L2", + temporality="KEEP", + ) + ) + + cur11 = try_flush_cache(True, cur11) + + return asm + + def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: + + rows, cols = registers.shape + asm = block("zero registers") + + for ic in range(cols): + for ir in range(rows): + asm.add(mov(additional_regs[1], registers[ir, ic], True)) + + return asm + + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. + """ + + asm = block("Block GEMM microkernel") + bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) + bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) + assert bm % v_size == 0 + + mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) + asm.add( + self.move_register_block( + A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False + ) + ) + + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) + + # TODO: fuse loads here as well + bs = [] + firstloc = {} + locations = [] + for Vmi in range(bm // v_size): + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block + bki_reg = bki // elem128 + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): + if (bki_reg, bni) not in firstloc: + B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) + firstloc[(bki_reg, bni)] = self.LoadStoreLocation( + B_cell_addr, B_regs[bki_reg, bni], B_comment + ) + if ( + A.has_nonzero_cell(A_ptr, to_A_block, to_acell) + and B_regs[bki_reg, bni] not in bs + ): + locations += [firstloc[(bki_reg, bni)]] + bs.append(B_regs[bki_reg, bni]) + asm.add( + self.fuse_loadstore_block( + locations, False, B.name, to_B_block, additional_regs + ) + ) + + cell_indices = {} + for bki in range(bk): # inside this k-block + # TODO: refactor cell_indices into the cursors/blocks + for Vmi in range(bm // v_size): + for bni in range(bn): # inside this n-block + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi * v_size, right=bki) + + bki_reg = bki // elem128 + if (Vmi, bki_reg, bni) not in cell_indices: + cell_indices[(Vmi, bki_reg, bni)] = 0 + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and 
A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + _, B_comment = B.look(B_ptr, to_B_block, to_bcell) + comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" + asm.add( + fma( + B_regs[bki_reg, bni], + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + bcast=cell_indices[(Vmi, bki_reg, bni)], + sub=sub, + ) + ) + + if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): + cell_indices[(Vmi, bki_reg, bni)] += 1 + + return asm diff --git a/pspamm/codegen/architectures/arm/inlineprinter.py b/pypspamm/codegen/architectures/arm/inlineprinter.py similarity index 80% rename from pspamm/codegen/architectures/arm/inlineprinter.py rename to pypspamm/codegen/architectures/arm/inlineprinter.py index a8a8bc5..6a83336 100644 --- a/pspamm/codegen/architectures/arm/inlineprinter.py +++ b/pypspamm/codegen/architectures/arm/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * + +from pypspamm.codegen.ast import * +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -16,7 +17,6 @@ class InlinePrinter(Visitor): output = None stack = None - def __init__(self, precision: Precision): self.output = [] self.stack = [] @@ -26,10 +26,9 @@ def __init__(self, precision: Precision): def show(self): print("\n".join(self.output)) - def addLine(self, stmt: str, comment: str): - line = " "*self.lmargin + self.indent*self.depth + line = " " * self.lmargin + self.indent * self.depth if stmt is not None and comment is not None and self.show_comments: stmt = '"' + stmt + '\\r\\n"' @@ -43,8 +42,6 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) - - def visitFma(self, stmt: FmaStmt): b = stmt.bcast_src.ugly m = stmt.mult_src.ugly @@ -65,7 +62,11 @@ def visitMul(self, stmt: MulStmt): self.addLine(s, stmt.comment) def visitBcst(self, stmt: BcstStmt): - b = stmt.bcast_src.ugly if self.precision == Precision.DOUBLE else stmt.bcast_src.ugly_b32 + b = ( + stmt.bcast_src.ugly + if self.precision == Precision.DOUBLE + else stmt.bcast_src.ugly_b32 + ) a = stmt.dest.ugly s = f"dup {a}, {b}" self.addLine(s, stmt.comment) @@ -74,25 +75,35 @@ def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: # avoid 0 instructions return - if isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): + if isinstance(stmt.src, Constant) and ( + stmt.src.value > 4095 or stmt.src.value < -4095 + ): if (stmt.src.value >> 16) & 0xFFFF > 0 and stmt.src.value < 0: s = "mov x11, #-1" val1 = (stmt.src.value) & 0xFFFF s1 = f"movk x11, #{val1}" - val2 = ((stmt.src.value >> 16) & 0xFFFF) + val2 = (stmt.src.value >> 16) & 0xFFFF s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s, "") - self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") - self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") + self.addLine( + s1, "load lower 16 bit of immediate that requires more than 16 bit" + ) + self.addLine( + s2, "load upper 16 bit of immediate that requires more than 16 bit" + ) elif (stmt.src.value >> 16) & 0xFFFF: val1 = (stmt.src.value) & 0xFFFF s1 = f"mov x11, #{val1}" - val2 = ((stmt.src.value >> 16) & 0xFFFF) + val2 = (stmt.src.value >> 16) & 0xFFFF s2 = f"movk x11, #{val2}, lsl #16" - self.addLine(s1, 
"load lower 16 bit of immediate that requires more than 16 bit") - self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") + self.addLine( + s1, "load lower 16 bit of immediate that requires more than 16 bit" + ) + self.addLine( + s2, "load upper 16 bit of immediate that requires more than 16 bit" + ) else: s = f"mov x11, {stmt.src.ugly}" self.addLine(s, "load lower 16 bit of immediate ") @@ -131,7 +142,6 @@ def visitMov(self, stmt: MovStmt): s = f"mov {stmt.dest.ugly}, {src_str}" self.addLine(s, stmt.comment) - def visitLoad(self, stmt: LoadStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly @@ -142,10 +152,10 @@ def visitLoad(self, stmt: LoadStmt): s = f"ldr {stmt.dest.ugly}, {src_str}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if stmt.dest4 is not None: - dispadd = '' if stmt.src.disp == 0 else f', {stmt.src.disp}' + dispadd = "" if stmt.src.disp == 0 else f", {stmt.src.disp}" s = f"ld1 {{ {stmt.dest.ugly},{stmt.dest2.ugly},{stmt.dest3.ugly},{stmt.dest4.ugly} }}, {stmt.src.ugly_base}{dispadd}" elif stmt.dest3 is not None: - dispadd = '' if stmt.src.disp == 0 else f', {stmt.src.disp}' + dispadd = "" if stmt.src.disp == 0 else f", {stmt.src.disp}" s = f"ld1 {{ {stmt.dest.ugly},{stmt.dest2.ugly},{stmt.dest3.ugly} }}, {stmt.src.ugly_base}{dispadd}" elif stmt.dest2 is not None: s = f"ldp {stmt.dest.ugly_scalar}, {stmt.dest2.ugly_scalar}, {src_str}" @@ -155,7 +165,6 @@ def visitLoad(self, stmt: LoadStmt): raise NotImplementedError() self.addLine(s, stmt.comment) - def visitStore(self, stmt: StoreStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly @@ -166,10 +175,10 @@ def visitStore(self, stmt: StoreStmt): s = f"str {src_str}, {stmt.dest.ugly}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if stmt.src4 is not None: - dispadd = '' if stmt.dest.disp == 0 else f', {stmt.dest.disp}' + dispadd = "" if stmt.dest.disp == 0 else f", {stmt.dest.disp}" s = f"ld1 {{ {stmt.src.ugly},{stmt.src2.ugly},{stmt.src3.ugly},{stmt.src4.ugly} }}, {stmt.dest.ugly_base}{dispadd}" elif stmt.src3 is not None: - dispadd = '' if stmt.dest.disp == 0 else f', {stmt.dest.disp}' + dispadd = "" if stmt.dest.disp == 0 else f", {stmt.dest.disp}" s = f"ld1 {{ {stmt.src.ugly},{stmt.src2.ugly},{stmt.src3.ugly} }}, {stmt.dest.ugly_base}{dispadd}" elif stmt.src2 is not None: s = f"stp {stmt.src.ugly_scalar}, {stmt.src2.ugly_scalar}, {stmt.dest.ugly}" @@ -178,13 +187,13 @@ def visitStore(self, stmt: StoreStmt): else: raise NotImplementedError() self.addLine(s, stmt.comment) - + def visitPrefetch(self, stmt: PrefetchStmt): cache_level = stmt.closeness temporality = stmt.temporality src_string = stmt.dest.ugly - s = f'prfm P{stmt.access_type}{cache_level}{temporality}, {src_string}' + s = f"prfm P{stmt.access_type}{cache_level}{temporality}, {src_string}" self.addLine(s, stmt.comment) def visitBlock(self, block: Block): diff --git a/pspamm/codegen/architectures/arm/operands.py b/pypspamm/codegen/architectures/arm/operands.py similarity index 90% rename from pspamm/codegen/architectures/arm/operands.py rename to pypspamm/codegen/architectures/arm/operands.py index 9efa435..405194e 100644 --- a/pspamm/codegen/architectures/arm/operands.py +++ b/pypspamm/codegen/architectures/arm/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_ARM: @@ -48,25 +48,21 @@ class Register_ARM(Register): @property def ugly(self): return self.value - + @property def ugly_precision(self): return self.value.split(".")[1] - 
+ @property def ugly_lsl_shift(self): - return { - "d": 3, - "s": 2, - "h": 1 - }[self.ugly_precision] + return {"d": 3, "s": 2, "h": 1}[self.ugly_precision] @property def clobbered(self): if self.value == "xzr": return None # removed [this comment should stay here for now---in case there's some compiler expecting it]: .replace("x", "r") - return (self.value.split(".")[0]) + return self.value.split(".")[0] @property def ugly_scalar(self): @@ -75,7 +71,7 @@ def ugly_scalar(self): @property def ugly_scalar_1d(self): return (self.value.split(".")[0]).replace("v", "d") - + @property def ugly_b32(self): return (self.value.split(".")[0]).replace("x", "w") @@ -91,15 +87,16 @@ class MemoryAddress_ARM(MemoryAddress): @property def ugly(self): return f"[{self.base.ugly}, {self.disp}]" - + @property def ugly_base(self): return f"[{self.base.ugly}]" - + @property def ugly_offset(self): # TODO: is this already dynamic? -> if precision is single, we need LSL #2 return str(self.disp) + def mem(base, offset): return MemoryAddress_ARM(base, offset) diff --git a/pypspamm/codegen/architectures/arm_sve/__init__.py b/pypspamm/codegen/architectures/arm_sve/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pspamm/codegen/architectures/arm_sve/blocksize.py b/pypspamm/codegen/architectures/arm_sve/blocksize.py similarity index 76% rename from pspamm/codegen/architectures/arm_sve/blocksize.py rename to pypspamm/codegen/architectures/arm_sve/blocksize.py index 3a46568..6b7ae65 100644 --- a/pspamm/codegen/architectures/arm_sve/blocksize.py +++ b/pypspamm/codegen/architectures/arm_sve/blocksize.py @@ -18,9 +18,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = j if maxval == 0: - raise RuntimeError("Could not find an appropriate block size. We suggest padding the matrix dimensions") + raise RuntimeError( + "Could not find an appropriate block size. We suggest padding the matrix dimensions" + ) - while cls.ARM_condition(bm, bn, bk+1, v_size): + while cls.ARM_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -28,13 +30,14 @@ def getBlocksize(cls, m, n, bk, v_size, prec): @classmethod def ARM_condition(cls, bm, bn, bk, v_size): # ceiling division - vm = -(bm // -v_size) - return (bn + bk) * vm + bn*bk <= 32 + vm = -(bm // -v_size) + return (bn + bk) * vm + bn * bk <= 32 @classmethod def tileable(cls, m, bm): return m % bm == 0 + class MaxK: @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): @@ -55,9 +58,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = j if maxval == 0: - raise RuntimeError("Could not find an appropriate block size. We suggest padding the matrix dimensions") + raise RuntimeError( + "Could not find an appropriate block size. 
We suggest padding the matrix dimensions" + ) - while cls.ARM_condition(bm, bn, bk+1, v_size, elem128): + while cls.ARM_condition(bm, bn, bk + 1, v_size, elem128): bk += 1 return (bm, bn, bk) @@ -66,15 +71,16 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division vkext = -(bk // -elem128) - isvkext = bn*vkext <= 16 if elem128 == 2 else bn*vkext <= 8 + isvkext = bn * vkext <= 16 if elem128 == 2 else bn * vkext <= 8 vm = -(bm // -v_size) vk = vkext if isvkext else bk - return (bn + bk) * vm + bn*vk <= 32 + return (bn + bk) * vm + bn * vk <= 32 @classmethod def tileable(cls, m, bm): return m % bm == 0 + class Cube: @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): @@ -97,7 +103,9 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bk = k if maxval == 0: - raise RuntimeError("Could not find an appropriate block size. We suggest padding the matrix dimensions") + raise RuntimeError( + "Could not find an appropriate block size. We suggest padding the matrix dimensions" + ) return (bm, bn, bk) @@ -105,13 +113,14 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def ARM_condition(cls, bm, bn, bk, v_size, elem128): # ceiling division vkext = -(bk // -elem128) - isvkext = bn*vkext <= 16 if elem128 == 2 else bn*vkext <= 8 + isvkext = bn * vkext <= 16 if elem128 == 2 else bn * vkext <= 8 vm = -(bm // -v_size) vk = vkext if isvkext else bk - return (bn + bk) * vm + bn*vk <= 32 + return (bn + bk) * vm + bn * vk <= 32 @classmethod def tileable(cls, m, bm): return m % bm == 0 + Default = MaxK diff --git a/pypspamm/codegen/architectures/arm_sve/generator.py b/pypspamm/codegen/architectures/arm_sve/generator.py new file mode 100644 index 0000000..eba412e --- /dev/null +++ b/pypspamm/codegen/architectures/arm_sve/generator.py @@ -0,0 +1,658 @@ +from pypspamm.codegen.architectures.arm_sve.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * + + +class Generator(AbstractGenerator): + template = """ +void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, const {real_type} alpha, const {real_type} beta, const {real_type}* prefetch) {{{{ + __asm__ __volatile__( +{init_registers} +{body_text} + : : {args} : {clobbered}); + + #ifndef NDEBUG + #ifdef _OPENMP + #pragma omp atomic + #endif + pspamm_num_total_flops += {flop}; + #endif + +}}}} +""" + + prefetch_count = 0 + is_sparse = False + v_len = 4 # vector register length: v_len * 128 bit + predicates = {} + + def get_v_size(self): + return (16 // self.precision.size()) * self.v_len + + def get_precision(self): + return self.precision + + def get_template(self): + return self.template + + def use_broadcast(self): + return True + + def has_masks(self): + return True + + def make_argument_load(self, starting_regs, prefetch): + asm = block("Load arguments") + asm.add(ld(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(ld(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(ld(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(ld(InputOperand(f"3", "m", "alpha"), starting_regs[3], False)) + asm.add(ld(InputOperand(f"4", "m", "beta"), starting_regs[4], False)) + if prefetch: + asm.add(ld(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) + return asm + + def pred_n_trues( + self, num_trues: int, v_size: int, suffix: str = None + ) -> Register_ARM: + """pred takes 
num_trues=num of true elements and suffix=type of predicate (m or z) for merging or zeroing + we only use p7 as all-true predicate and p0 as overhead predicate + e.g. pred_n_trues(n=4, v_size=8, suffix="m") returns the predicate p0/m with the first 4 elements + set to true""" + assert num_trues > 0 + assert suffix == "m" or suffix == "z" or suffix is None + + # we only use p7 or p0 as predicates (1 == p0, 8 == p7) + index = 7 if num_trues >= v_size else self.predicates[num_trues] + + if suffix is None: + s = f"p{index}" + else: + s = f"p{index}/{suffix}" + return Register_ARM(AsmType.p64x8, s) + + # is called at most one time in matmul.py + def set_sparse(self): + self.is_sparse = True + + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): + vm = self.ceil_div( + bm, v_size + ) # vm can be 0 if bm < v_size -> makes ceil_div necessary + + # k-broadcasting only works in 128-bit lanes + elem128 = 16 // self.get_precision().size() + vkext = -(bk // -elem128) + + # inline broadcasting is only allowed for the lower-numbered registers + self.inline_broadcast = False + if bn * vkext <= 16 if self.get_precision().size() == 8 else bn * vkext <= 8: + self.inline_broadcast = True + if bk == 1: + self.inline_broadcast = False + + if self.inline_broadcast: + vk = vkext + else: + vk = bk + + assert (bn + bk) * vm + bn * vk <= 32 # Needs to fit in SVE z registers + + prec = { + Precision.DOUBLE: "d", + Precision.SINGLE: "s", + Precision.HALF: "h", + Precision.BFLOAT16: "h", + }[self.get_precision()] + + # make place for the two broadcasting registers + a_offset = 1 if bn * vk == 1 else 0 + assert (bn + bk) * vm + bn * vk + a_offset <= 32 + + A_regs = Matrix( + [ + [z(vm * c + r + bn * vk + a_offset, prec) for c in range(bk)] + for r in range(vm) + ] + ) + B_regs = Matrix([[z(bn * r + c, prec) for c in range(bn)] for r in range(vk)]) + C_regs = Matrix( + [[z(32 - vm * bn + vm * c + r, prec) for c in range(bn)] for r in range(vm)] + ) + + b_reg = 0 + alpha_reg = [z(b_reg, prec), z(b_reg, prec)] + beta_reg = [z(b_reg + 1, prec), z(b_reg + 1, prec)] + + starting_regs = [ + r(0), + r(1), + r(2), + r(3), + r(4), + r(5), + r(6), + r(11), + ] # r6 is needed for predicate creation, r5 is added in init_prefetching() + + additional_regs = [r(8), l("0.0"), r(10), r(6)] # r10 used for scaling offsets + + loop_regs = [r(12), r(13), r(14)] + + mask_regs = [p(0), p(7)] + + self.init_registers(m, bm, k, bk, v_size, nnz) + + prefetch_reg = prefetch is not None + + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + mask_regs, + prefetch_reg, + ) + + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: + + asm = block("No register based scaling") + return asm + + def init_block(self, size): + return block("") + + def init_mask(self, m: int, bm: int, v_size: int, tempreg, maskreg) -> Block: + + asm = block("No register based scaling") + return asm + + def init_registers( + self, m: int, bm: int, k: int, bk: int, v_size: int, nnz: int + ) -> None: + + bmmod = bm % v_size + elem128 = 16 // self.get_precision().size() + bkmod = bk % elem128 if self.inline_broadcast else 0 + kmod = (k % bk) % elem128 if self.inline_broadcast else 0 + mmod = (m % bm) % v_size + + eol = "\\n\\t" # define the "end of line" sequence for easy assembly + # determine the predicate suffix + p_suffix = { + Precision.DOUBLE: "d", + Precision.SINGLE: "s", + Precision.HALF: 
"h", + Precision.BFLOAT16: "h", + }[self.get_precision()] + # determine length of 'dup' registers + gen_reg = "w" if self.get_precision().size() <= 4 else "x" + overhead_counter = 6 + + comment = "// p7 denotes the 'all-true' predicate\n\t" + comment += "// if given, p0 denotes the 'bm % v_size' predicate\n\t" + comment += "// if given, p1 denotes the 'bk % elem128' predicate\n\t" + comment += "// if given, p2 denotes the 'k % elem128' predicate\n\t" + comment += "// if given, p4 denotes the 'k % v_size' predicate\n\t" + + self.has_k_overhead = kmod != 0 + self.has_bk_overhead = bkmod != 0 + self.has_nnz_overhead = nnz % elem128 != 0 + + # specification for ptrue: https://developer.arm.com/documentation/ddi0596/2021-12/SVE-Instructions/PTRUE--Initialise-predicate-from-named-constraint- + # search for 'DecodePredCount' for the explanation of how the pattern in 'ptrue p{d}.{suffix}, #pattern' is decoded: + # https://developer.arm.com/documentation/ddi0596/2020-12/Shared-Pseudocode/AArch64-Functions?lang=en#impl-aarch64.DecodePredCount.2 + # 'ptrue' doesnt work for initialising overhead predicate when using single precision -> see valid patterns from above + # overhead = "\"ptrue p0.{suffix}, #{overhead}{eol}\"\n\t" if bm != 0 else "" # define overhead predicate + overhead_bm = ( + '"mov {gen_reg}{overhead_counter}, #{overhead_bm}{eol}"\n\t"whilelo p0.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}"\n\t' + if bmmod != 0 + else "" + ) + overhead_bk = ( + '"mov {gen_reg}{overhead_counter}, #{overhead_bk}{eol}"\n\t"whilelo p1.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}"\n\t' + if self.has_bk_overhead + else "" + ) + overhead_k = ( + '"mov {gen_reg}{overhead_counter}, #{overhead_k}{eol}"\n\t"whilelo p2.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}"\n\t' + if self.has_k_overhead + else "" + ) + overhead_nnz = ( + '"mov {gen_reg}{overhead_counter}, #{overhead_nnz}{eol}"\n\t"whilelo p3.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}"\n\t' + if self.has_nnz_overhead + else "" + ) + overhead_m = ( + '"mov {gen_reg}{overhead_counter}, #{overhead_m}{eol}"\n\t"whilelo p4.{suffix}, {gen_reg}zr, {gen_reg}{overhead_counter}{eol}"\n\t' + if mmod != 0 + else "" + ) + all_true = '"ptrue p7.{suffix}, #31{eol}"' # define all true predicate + init_registers = ( + comment + + overhead_bm + + overhead_bk + + overhead_k + + overhead_nnz + + overhead_m + + all_true + ).format( + suffix=p_suffix, + gen_reg=gen_reg, + overhead_counter=overhead_counter, + v_size=v_size, + overhead_bm=bmmod, + overhead_bk=bkmod, + overhead_k=kmod, + overhead_m=mmod, + overhead_nnz=nnz % elem128, + eol=eol, + ) + + self.predicates[v_size] = 7 + if bmmod != 0: + self.predicates[bmmod] = 0 + if bkmod != 0: + self.predicates[bkmod] = 1 + if kmod != 0: + self.predicates[kmod] = 2 + if mmod != 0: + self.predicates[mmod] = 4 + + # since .format() doesn't allow partial formatting, we need to re-include the + # placeholders that are replaced at the end of generating a kernel + self.template = self.get_template().format( + init_registers=init_registers, + funcName="{funcName}", + body_text="{body_text}", + clobbered="{clobbered}", + flop="{flop}", + real_type="{real_type}", + args="{args}", + ) + + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: 
CursorLocation = None, + is_B: bool = False, + ) -> Block: + + rows, cols = registers.shape + action = "Store" if store else "Load" + asm = block(f"{action} {cursor.name} register block @ {block_offset}") + prec = self.get_precision() + + b_row, b_col, i, _ = cursor.get_block(cursor_ptr, block_offset) + + cur11 = 0 + # TODO: figure out appropriate threshold (the 16 // self.v_len may still not be optimal; especially if 16 % self.v_len != 0, e.g. 384 bit) + threshold = ( + 1 if self.is_sparse else (16 // self.v_len) + ) # uses whole 256 byte cache line, as one SVE-512 vector = 64 bytes + + # DONE if another CPU implements SVE at VL != 64 bytes, rewrite mul_vl (maybe do this dynamically) + mul_vl = ( + 16 * self.v_len + ) # e.g. A64FX has VL of 64 bytes in memory (thus, use v_len==4) + max_mem_ins_mult = 7 # A64FX allows a maximum positive offset of 7 in memory instructions, e.g. ld1d z1.d, p0/z, [x0, 7, MUL VL] (TODO: tune, if ever different) + max_offset = ( + mul_vl * max_mem_ins_mult + ) # ld1d/st1d instruction encodes the immediate offset using 4 bits, multiplies it with MUL VL + + prev_disp = 0 + prev_overhead = True + prev_base = None + + process_size = min(v_size, cursor.br) + + for ic in range(cols): + for ir in range(rows): + if (mask is None) or (mask[ir, ic]): + processed = ir * process_size + size = min(process_size, b_row - processed) + all_coords = [ + Coords(down=ir * process_size + i, right=ic) + for i in range(size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] + if not any(has_nonzero): + continue + elif any(has_nonzero) and not all(has_nonzero) and not is_B: + raise NotImplementedError( + "Element-wise sparsity in A is not yet implemented." + ) + + p = ( + self.pred_n_trues(size, v_size) + if not is_B + else self.pred_n_trues(process_size, v_size) + ) + p_zeroing = ( + self.pred_n_trues(size, v_size, "z") + if not is_B + else self.pred_n_trues(process_size, v_size, "z") + ) + cell_offset = Coords(down=ir * process_size, right=ic) + + # addr = base "pointer" + relative offset in bytes + addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) + addr.disp += self.precision.size() * load_offset + + offset = addr.disp - prev_disp + + # count how many elements we have processed between last step and this step + cont_counter = offset // mul_vl + larger_max_offset = cont_counter > max_mem_ins_mult + non_dividing_offset = offset % mul_vl != 0 + + if ( + larger_max_offset + or (prev_overhead and addr.disp > 0) + or non_dividing_offset + ): + offset_comment = ( + f"disp > {max_offset}" + if larger_max_offset + else ( + "disp % VL != 0" + if non_dividing_offset + else "previous mem. instr. 
used p0" + ) + ) + asm.add( + add( + addr.disp, additional_regs[0], offset_comment, addr.base + ) + ) + prev_disp = addr.disp + addr.base = additional_regs[0] + prev_base = addr.base + + # adjust addr.disp to a multiple of a SVE vector's length + if prev_base is None: + prev_base = addr.base + + addr.base = prev_base + addr.disp = (addr.disp - prev_disp) // mul_vl + + if store: + asm.add( + st( + registers[ir, ic], + addr, + True, + comment, + pred=p, + scalar_offs=False, + add_reg=additional_regs[2], + ) + ) + # perform prefetching after a store instruction, similar to KNL case + if prefetching: + addr, comment = pf_cursor.look( + pf_cursor_ptr, block_offset, cell_offset + ) + addr.disp += self.precision.size() * load_offset + if prev_disp > 0: + asm.add( + add( + prev_disp, + additional_regs[3], + "increment the prefetch register", + addr.base, + ) + ) + asm.add( + prefetch( + mem( + ( + additional_regs[3] + if prev_disp > 0 + else addr.base + ), + (addr.disp - prev_disp) // mul_vl, + ), + "", + p, + prec, + access_type="LD", + closeness="L2", + temporality="KEEP", + ) + ) + else: + asm.add( + ld( + addr, + registers[ir, ic], + True, + comment, + pred=p_zeroing, + is_B=is_B, + scalar_offs=False, + add_reg=additional_regs[2], + ) + ) + + prev_overhead = ( + p is None or int(p.ugly[1]) == 0 + ) # determine if we previously used p0 (overhead predicate) + + return asm + + def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: + + rows, cols = registers.shape + asm = block("zero registers") + + for ic in range(cols): + for ir in range(rows): + asm.add(mov(additional_regs[1], registers[ir, ic], True)) + + return asm + + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. 
+ """ + + asm = block("Block GEMM microkernel") + """block_row, block_col, (start)index, pattern_matrix (true/false)""" + bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) + bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) + + # tell sparse_mask() that we use sve + mask = sparse_mask( + A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True + ) + asm.add( + self.move_register_block( + A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False + ) + ) + + # x = 0; + bs = [] + cur11 = -1000 + Vm = max(self.ceil_div(bm, v_size), 1) + + multiple = self.precision.size() + # for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252 + # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 + # in both cases: instruction encodes the immediate offset within 6 bits + if not self.inline_broadcast: + max_offs = (2**6 - 1) * multiple + divider = 1 + elem128 = 1 + vk = bk + else: + max_offs = 127 + divider = 16 + elem128 = 16 // self.get_precision().size() + vk = -(bk // -elem128) + + preg = self.pred_n_trues(elem128, elem128, "z") + preg_last = ( + preg if bk % elem128 == 0 else self.pred_n_trues(bk % elem128, elem128, "z") + ) + firstloc = {} + for Vmi in range(Vm): + # set to all v_size predicates to true, we want to replicate a B element into a whole vector + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block + bki_reg = bki // elem128 + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): + if (bki_reg, bni) not in firstloc: + B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) + firstloc[(bki_reg, bni)] = (B_cell_addr, B_comment) + if ( + A.has_nonzero_cell(A_ptr, to_A_block, to_acell) + and B_regs[bki_reg, bni] not in bs + ): + p_zeroing = preg_last if bki_reg + 1 == vk else preg + + B_cell_addr = firstloc[(bki_reg, bni)][0] + B_comment = firstloc[(bki_reg, bni)][1] + + # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value + if ( + B_cell_addr.disp > max_offs + or B_cell_addr.disp % divider != 0 + ): + moved = B_cell_addr.disp - cur11 + if ( + moved > 0 + and moved <= max_offs + and moved % divider == 0 + ): + B_cell_addr.disp = moved + else: + asm.add( + add( + B_cell_addr.disp, + additional_regs[0], + "", + B_cell_addr.base, + ) + ) + cur11 = B_cell_addr.disp + B_cell_addr.disp = 0 + + B_cell_addr.base = additional_regs[0] + + if not self.inline_broadcast: + asm.add( + ld( + B_cell_addr, + B_regs[bki_reg, bni], + True, + B_comment, + pred=p_zeroing, + is_B=True, + ) + ) + else: + asm.add( + ld( + B_cell_addr, + B_regs[bki_reg, bni], + True, + B_comment, + pred=p_zeroing, + sub128=True, + ) + ) + bs.append(B_regs[bki_reg, bni]) + + # TODO: refactor cell_indices into the cursors/blocks + cell_indices = {} + for bki in range(bk): # inside this k-block + for Vmi in range(Vm): + p_merging = self.pred_n_trues(bm - Vmi * v_size, v_size, "m") + end_index = ( + bm if Vmi + 1 == Vm else Vmi * v_size + v_size + ) # end_index helps us print the right index ranges + for bni in range(bn): # inside this n-block + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi * v_size, right=bki) + bki_reg = bki // elem128 + if (Vmi, bki_reg, bni) not in cell_indices: + cell_indices[(Vmi, bki_reg, bni)] = 0 + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + _, B_comment = 
B.look(B_ptr, to_B_block, to_bcell) + comment = f"C[{Vmi * v_size}:{end_index},{bni}] += A[{Vmi * v_size}:{end_index},{bki}]*{B_comment}" + + if not self.inline_broadcast: + bcast = None + else: + bcast = cell_indices[(Vmi, bki_reg, bni)] + asm.add( + fma( + B_regs[bki_reg, bni], + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + pred=p_merging, + bcast=bcast, + sub=sub, + ) + ) + + if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell): + cell_indices[(Vmi, bki_reg, bni)] += 1 + return asm diff --git a/pspamm/codegen/architectures/arm_sve/inlineprinter.py b/pypspamm/codegen/architectures/arm_sve/inlineprinter.py similarity index 82% rename from pspamm/codegen/architectures/arm_sve/inlineprinter.py rename to pypspamm/codegen/architectures/arm_sve/inlineprinter.py index 27d3dc8..ad1c9ca 100644 --- a/pspamm/codegen/architectures/arm_sve/inlineprinter.py +++ b/pypspamm/codegen/architectures/arm_sve/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * + +from pypspamm.codegen.ast import * +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -19,14 +20,19 @@ def __init__(self, precision: Precision): self.output = [] self.stack = [] self.precision = precision - self.ugly_precision ={ + self.ugly_precision = { Precision.DOUBLE: "d", Precision.SINGLE: "w", Precision.HALF: "h", Precision.BFLOAT16: "h", }[self.precision] - assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) + assert precision in ( + Precision.BFLOAT16, + Precision.HALF, + Precision.SINGLE, + Precision.DOUBLE, + ) def show(self): print("\n".join(self.output)) @@ -89,7 +95,9 @@ def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: # avoid 0 instructions return - if isinstance(stmt.src, Constant) and (stmt.src.value > 4095 or stmt.src.value < -4095): + if isinstance(stmt.src, Constant) and ( + stmt.src.value > 4095 or stmt.src.value < -4095 + ): # This condition is probably related to immediate values being restricted to 12 bits for add instructions # https://developer.arm.com/documentation/dui0802/a/A64-General-Instructions/ADD--immediate- # https://developer.arm.com/documentation/ddi0596/2020-12/Base-Instructions/ADD--immediate---Add--immediate-- @@ -97,19 +105,27 @@ def visitAdd(self, stmt: AddStmt): s = "mov x11, #-1" val1 = (stmt.src.value) & 0xFFFF s1 = f"movk x11, #{val1}" - val2 = ((stmt.src.value >> 16) & 0xFFFF) + val2 = (stmt.src.value >> 16) & 0xFFFF s2 = f"movk x11, #{val2}, lsl #16" self.addLine(s, "") - self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") - self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") + self.addLine( + s1, "load lower 16 bit of immediate that requires more than 16 bit" + ) + self.addLine( + s2, "load upper 16 bit of immediate that requires more than 16 bit" + ) elif (stmt.src.value >> 16) != 0: val1 = (stmt.src.value) & 0xFFFF s1 = "mov x11, #{val1}" - val2 = ((stmt.src.value >> 16) & 0xFFFF) + val2 = (stmt.src.value >> 16) & 0xFFFF s2 = "movk x11, #{val2}, lsl #16" - self.addLine(s1, "load lower 16 bit of immediate that requires more than 16 bit") - self.addLine(s2, "load upper 16 bit of immediate that requires more than 16 bit") + self.addLine( + s1, "load lower 16 bit of immediate that 
requires more than 16 bit" + ) + self.addLine( + s2, "load upper 16 bit of immediate that requires more than 16 bit" + ) else: s = f"mov x11, {stmt.src.ugly}" self.addLine(s, "load lower 16 bit of immediate ") @@ -152,13 +168,18 @@ def visitMov(self, stmt: MovStmt): def visitLoad(self, stmt: LoadStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly - elif isinstance(stmt.dest, MemoryAddress) and (stmt.src.ugly_offset != "0" and stmt.scalar_offs): - self.addLine(f"mov {stmt.add_reg.ugly}, #{stmt.src.ugly_offset}", f"move immediate offset into {stmt.add_reg.ugly}") + elif isinstance(stmt.dest, MemoryAddress) and ( + stmt.src.ugly_offset != "0" and stmt.scalar_offs + ): + self.addLine( + f"mov {stmt.add_reg.ugly}, #{stmt.src.ugly_offset}", + f"move immediate offset into {stmt.add_reg.ugly}", + ) # TODO: adapt ugly_lsl_shift to account for possible single precision instead of double precision src_str = f"[{stmt.src.ugly_base}, {stmt.add_reg.ugly}, LSL #{stmt.dest.ugly_lsl_shift}]" elif stmt.typ == AsmType.f64x4 or stmt.typ == AsmType.f64x2: # (note: the 128-bit and 256-bit broadcasts need the following more rudimentary format here) - if stmt.src.ugly_offset == '0': + if stmt.src.ugly_offset == "0": src_str = f"[{stmt.src.ugly_base}]" else: src_str = f"[{stmt.src.ugly_base}, #{stmt.src.ugly_offset}]" @@ -186,9 +207,15 @@ def visitLoad(self, stmt: LoadStmt): def visitStore(self, stmt: StoreStmt): if isinstance(stmt.src, Label): src_str = "#" + stmt.src.ugly - elif isinstance(stmt.dest, MemoryAddress) and stmt.dest.ugly_offset != "0" and stmt.scalar_offs: - self.addLine(f"mov {stmt.add_reg.ugly}, #{stmt.dest.ugly_offset}", - f"move immediate offset into {stmt.add_reg.ugly}") + elif ( + isinstance(stmt.dest, MemoryAddress) + and stmt.dest.ugly_offset != "0" + and stmt.scalar_offs + ): + self.addLine( + f"mov {stmt.add_reg.ugly}, #{stmt.dest.ugly_offset}", + f"move immediate offset into {stmt.add_reg.ugly}", + ) # TODO: adapt ugly_lsl_shift to account for possible single precision instead of double precision regsize = stmt.add_dest.size() // 16 dest_str = f"[{stmt.dest.ugly_base}, {stmt.add_reg.ugly}, LSL #{stmt.src.ugly_lsl_shift}]" diff --git a/pspamm/codegen/architectures/arm_sve/operands.py b/pypspamm/codegen/architectures/arm_sve/operands.py similarity index 92% rename from pspamm/codegen/architectures/arm_sve/operands.py rename to pypspamm/codegen/architectures/arm_sve/operands.py index 90328ff..c56cff0 100644 --- a/pspamm/codegen/architectures/arm_sve/operands.py +++ b/pypspamm/codegen/architectures/arm_sve/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_ARM: @@ -52,18 +52,14 @@ def ugly_precision(self): @property def ugly_lsl_shift(self): - return { - "d": 3, - "s": 2, - "h": 1 - }[self.ugly_precision] + return {"d": 3, "s": 2, "h": 1}[self.ugly_precision] @property def clobbered(self): if self.value == "xzr": return None # removed [this comment should stay here for now---in case there's some compiler expecting it]: .replace("x", "r") - return (self.value.split(".")[0].split("/")[0]) + return self.value.split(".")[0].split("/")[0] @property def ugly_scalar(self): @@ -71,7 +67,7 @@ def ugly_scalar(self): @property def ugly_scalar_1d(self): - #turns "Vn.2d" into "Dn" + # turns "Vn.2d" into "Dn" return (self.value.split(".")[0]).replace("v", "d") @@ -80,6 +76,7 @@ def ugly_scalar_1d(self): z = lambda n, prec: Register_ARM(AsmType.f64x8, "z" + str(n) + "." 
+ prec) p = lambda n: Register_ARM(AsmType.i64, "p" + str(n)) + class MemoryAddress_ARM(MemoryAddress): @property def ugly(self): diff --git a/pypspamm/codegen/architectures/hsw/__init__.py b/pypspamm/codegen/architectures/hsw/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pspamm/codegen/architectures/hsw/blocksize.py b/pypspamm/codegen/architectures/hsw/blocksize.py similarity index 76% rename from pspamm/codegen/architectures/hsw/blocksize.py rename to pypspamm/codegen/architectures/hsw/blocksize.py index 0a38028..9ad370b 100644 --- a/pspamm/codegen/architectures/hsw/blocksize.py +++ b/pypspamm/codegen/architectures/hsw/blocksize.py @@ -4,16 +4,16 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bm = m bn = n - + if cls.HSW_condition(bm, bn, bk, v_size): - while cls.HSW_condition(bm, bn, bk+1, v_size): + while cls.HSW_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn) while not cls.HSW_condition(bm, bn, bk, v_size): bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) - while cls.HSW_condition(bm, bn, bk+1, v_size): + while cls.HSW_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn) @@ -37,6 +37,7 @@ def HSW_condition(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return (bn + bk) * vm + bn * bk <= 16 + class Max: @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): @@ -44,17 +45,19 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = 1 maxval = 0 - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): # can be replaced by cls.HSW_condition_extended here # (but that seemed to be slower in the end) if cls.HSW_condition(i, j, bk, v_size): - if i*j > maxval and (cls.HSW_condition(i, j, bk, v_size) or j > 1): - maxval = i*j + if i * j > maxval and ( + cls.HSW_condition(i, j, bk, v_size) or j > 1 + ): + maxval = i * j bm = i - bn = j + bn = j - while cls.HSW_condition(bm, bn, bk+1, v_size): + while cls.HSW_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -71,6 +74,7 @@ def HSW_condition_extended(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return bn * vm + bn * bk + 1 <= 16 + class Cube: @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): @@ -78,14 +82,16 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = 1 maxval = 0 - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): for k in range(1, 200): # can be replaced by cls.HSW_condition_extended here # (but that seemed to be slower in the end) if cls.HSW_condition(i, j, bk, v_size): - if i*j*k >= maxval and (cls.HSW_condition(i, j, k, v_size) or j > 1): - maxval = i*j*k + if i * j * k >= maxval and ( + cls.HSW_condition(i, j, k, v_size) or j > 1 + ): + maxval = i * j * k bm = i bn = j bk = k @@ -104,4 +110,5 @@ def HSW_condition_extended(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return bn * vm + bn * bk + 1 <= 16 + Default = Max diff --git a/pypspamm/codegen/architectures/hsw/generator.py b/pypspamm/codegen/architectures/hsw/generator.py new file mode 100644 index 0000000..c468616 --- /dev/null +++ b/pypspamm/codegen/architectures/hsw/generator.py @@ -0,0 +1,393 @@ +from pypspamm.codegen.architectures.hsw.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.regcache import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * + + +class Generator(AbstractGenerator): + template = """ 
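+// Template fields funcName, real_type, body_text, args, clobbered and flop are substituted by the generator before this kernel is emitted.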
+void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, {real_type} alpha, {real_type} beta, {real_type} const* prefetch) {{ + {real_type}* alpha_p = α + {real_type}* beta_p = β + __asm__ __volatile__( +{body_text} + : : {args} : {clobbered}); + + #ifndef NDEBUG + #ifdef _OPENMP + #pragma omp atomic + #endif + pspamm_num_total_flops += {flop}; + #endif +}} +""" + v_len = 2 + + def get_v_size(self): + return (16 // self.precision.size()) * self.v_len + + def get_template(self): + return Generator.template + + def use_broadcast(self): + return True + + def has_masks(self): + return False + + def init_mask(self, m, bm, v_size, tempreg, maskregs): + return block("") + + def scale_base(self): + return 256 + + def pred_n_trues(self, count, v_size, mode): + # hacked in right now: we set a number as predicate if we need it + if count < v_size: + return (1 << count) - 1 + else: + return None + + def make_argument_load(self, starting_regs, prefetch): + asm = block("Load arguments") + asm.add(mov(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(mov(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(mov(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(mov(InputOperand(f"3", "m", "alpha_p"), starting_regs[3], False)) + asm.add(mov(InputOperand(f"4", "m", "beta_p"), starting_regs[4], False)) + if prefetch: + asm.add(mov(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) + return asm + + def make_expand_predicate(self, mask): + combined = 0 + offset = 0 + for i, value in enumerate(mask): + if value: + combined |= offset << (8 * i) + offset += 1 + else: + combined |= 255 << (8 * i) + return combined + + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): + assert bm % v_size == 0 + vm = self.ceil_div(bm, v_size) + + # Needs to fit in AVX/AVX2 ymm registers + if (bn + bk) * vm + bn * bk <= 16: + self.preloadA = True + else: + self.preloadA = False + assert bn * vm + bn * bk + 1 <= 16 + + vmm = {1: xmm, 2: ymm}[self.v_len] + + if self.preloadA: + A_regs = Matrix([[vmm(vm * c + r) for c in range(bk)] for r in range(vm)]) + Aoffset = vm * bk + else: + A_regs = Matrix([[vmm(0) for c in range(bk)] for r in range(vm)]) + Aoffset = 1 + + B_regs = Matrix( + [[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)] + ) + C_regs = Matrix( + [[vmm(16 - vm * bn + vm * c + r) for c in range(bn)] for r in range(vm)] + ) + starting_regs = [rdi, rsi, rdx, rbx, rcx] + + b_reg = Aoffset + alpha_reg = [xmm(b_reg), vmm(b_reg)] + beta_reg = [xmm(b_reg + 1), vmm(b_reg + 1)] + + additional_regs = [r(9), r(10), r(11), r(15), rax] # ,r(13),r(14) + + prefetch_reg = prefetch == "BL2viaC" + if prefetch_reg: + starting_regs += [r(8)] + else: + additional_regs += [r(8)] + + loop_regs = [r(12), r(13), r(14)] + + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + [], + prefetch_reg, + ) + + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: + + asm = block("Optimize usage of offsets when accessing B Matrix") + + for i in range(1, len(additional_regs)): + asm.add(mov(c(self.scale_base() * (2 * i - 1)), additional_regs[i], False)) + + return asm + + def init_block(self, size): + return block("") + + def reg_based_scaling( + self, asm, addr: MemoryAddress, additional_regs: List[Register] + ): + halfscale = self.scale_base() // 2 + if addr.disp >= halfscale: + base = 
(addr.disp + halfscale) // self.scale_base() + scaling = 1 + while base % 2 == 0: + base //= 2 + scaling *= 2 + register = base // 2 + 1 + + if register < len(additional_regs) and scaling <= 8: + addr.index = additional_regs[register] + addr.scaling = scaling + addr.disp = ((addr.disp + halfscale) % self.scale_base()) - halfscale + + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + temp=None, + ) -> Block: + + rows, cols = registers.shape + action = "Store" if store else "Load" + asm = block(f"{action} {cursor.name} register block @ {block_offset}") + + for ic in range(cols): + for ir in range(rows): + if (mask is None) or (mask[ir, ic]): + all_coords = [ + Coords(down=ir * v_size + i, right=ic) for i in range(v_size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] + if all(has_nonzero): + cell_offset = all_coords[0] + addr, comment = cursor.look( + cursor_ptr, block_offset, cell_offset + ) + addr.disp += self.precision.size() * load_offset + self.reg_based_scaling(asm, addr, additional_regs) + if store: + asm.add(mov(registers[ir, ic], addr, True, comment)) + if prefetching == "BL2viaC" and pf_cursor is not None: + addr, comment = pf_cursor.look( + pf_cursor_ptr, block_offset, cell_offset + ) + addr.disp += self.precision.size() * load_offset + self.reg_based_scaling(asm, addr, additional_regs) + asm.add(prefetch(addr, closeness="L2")) + else: + asm.add(mov(addr, registers[ir, ic], True, comment)) + elif any(has_nonzero): + raise NotImplementedError( + "Element-wise sparsity in A is not yet fully implemented." + ) + firsti = 0 + for i in range(v_size): + if has_nonzero[i]: + firsti = i + break + addr, comment = cursor.look( + cursor_ptr, block_offset, all_coords[firsti] + ) + # assume contiguous memory here + + asm.add( + mov( + self.make_expand_predicate(all_coords), + additional_regs[0], + False, + ) + ) + return asm + + def move_register_single( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + ir, + ic, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + ) -> Block: + + asm = block("") + + if (mask is None) or (mask[ir, ic]): + cell_offset = Coords(down=ir * v_size, right=ic) + addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) + addr.disp += self.precision.size() * load_offset + asm.add(mov(addr, registers[ir, ic], True, comment)) + return asm + + def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: + + rows, cols = registers.shape + asm = block("zero registers") + + for ic in range(cols): + for ir in range(rows): + asm.add(mov(0, registers[ir, ic], True)) + + return asm + + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. 
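+ Unlike the SVE generator, there are no predicate masks here (has_masks() returns False), so bm must be a multiple of v_size; depending on preloadA, the A block is either preloaded into registers up front or streamed one register at a time.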
+ It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. + """ + asm = block("Block GEMM microkernel") + bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) + bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) + assert bm % v_size == 0 + + mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) + if self.preloadA: + asm.add( + self.move_register_block( + A, + A_ptr, + to_A_block, + A_regs, + v_size, + additional_regs, + mask, + store=False, + temp=B_regs[0, 0], + ) + ) + else: + asm.add( + self.move_register_single( + A, + A_ptr, + to_A_block, + A_regs, + v_size, + additional_regs, + 0, + 0, + mask, + store=False, + ) + ) + + Vm = self.ceil_div(bm, v_size) + + bs = [] + bsv = [] + for Vmi in range(Vm): + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + B_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) + self.reg_based_scaling(asm, B_addr, additional_regs) + if B_regs[bki, bni] not in bs: + asm.add(bcst(B_addr, B_regs[bki, bni], comment=B_comment)) + bs.append(B_regs[bki, bni]) + bsv.append(B_addr) + else: + # just to make sure we do not use registers differently in a block + assert bsv[bs.index(B_regs[bki, bni])].ugly == B_addr.ugly + + for bki in range(bk): # inside this k-block + for Vmi in range(Vm): + if not self.preloadA and not (Vmi, bki) == (0, 0): + asm.add( + self.move_register_single( + A, + A_ptr, + to_A_block, + A_regs, + v_size, + additional_regs, + Vmi, + bki, + mask, + store=False, + ) + ) + for bni in range(bn): # inside this n-block + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + _, B_comment = B.look(B_ptr, to_B_block, to_bcell) + comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" + asm.add( + fma( + B_regs[bki, bni], + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + bcast=None, + sub=sub, + ) + ) + return asm diff --git a/pspamm/codegen/architectures/hsw/inlineprinter.py b/pypspamm/codegen/architectures/hsw/inlineprinter.py similarity index 70% rename from pspamm/codegen/architectures/hsw/inlineprinter.py rename to pypspamm/codegen/architectures/hsw/inlineprinter.py index a1d6fc2..c5f9388 100644 --- a/pspamm/codegen/architectures/hsw/inlineprinter.py +++ b/pypspamm/codegen/architectures/hsw/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * + +from pypspamm.codegen.ast import * +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -16,27 +17,20 @@ class InlinePrinter(Visitor): output = None stack = None - def __init__(self, precision: Precision): self.output = [] self.stack = [] assert precision in (Precision.SINGLE, Precision.DOUBLE) self.precision = precision - self.psuffix = { - Precision.DOUBLE: "d", - 
Precision.SINGLE: "s" - }[precision] - self.bpsuffix = { - Precision.DOUBLE: "q", - Precision.SINGLE: "d" - }[precision] + self.psuffix = {Precision.DOUBLE: "d", Precision.SINGLE: "s"}[precision] + self.bpsuffix = {Precision.DOUBLE: "q", Precision.SINGLE: "d"}[precision] def show(self): print("\n".join(self.output)) def addLine(self, stmt: str, comment: str): - line = " "*self.lmargin + self.indent*self.depth + line = " " * self.lmargin + self.indent * self.depth if stmt is not None and comment is not None and self.show_comments: stmt = '"' + stmt + '\\r\\n"' @@ -76,7 +70,11 @@ def visitBcst(self, stmt: BcstStmt): # reformat bcast_src to be a memory address b = f"0({b})" regsize = stmt.dest.size() - instruction = "vmovddup" if self.precision == Precision.DOUBLE and regsize == 16 else f"vbroadcasts{self.psuffix}" + instruction = ( + "vmovddup" + if self.precision == Precision.DOUBLE and regsize == 16 + else f"vbroadcasts{self.psuffix}" + ) s = f"{instruction} {b}, {a}" self.addLine(s, stmt.comment) @@ -88,7 +86,7 @@ def visitAdd(self, stmt: AddStmt): self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - self.addLine('.align 16', 'Align label') + self.addLine(".align 16", "Align label") s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) @@ -114,21 +112,40 @@ def visitMov(self, stmt: MovStmt): s = f"vxorps {stmt.dest.ugly_xmm}, {stmt.dest.ugly_xmm}, {stmt.dest.ugly_xmm}" self.addLine(s, stmt.comment) elif stmt.pred is not None: - self.addLine(f"vpxor {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.dest.ugly}", "") - self.addLine(f"vpblendd {src_str}, {stmt.dest.ugly}, {stmt.pred}, {stmt.dest.ugly}", "") + self.addLine( + f"vpxor {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.dest.ugly}", "" + ) + self.addLine( + f"vpblendd {src_str}, {stmt.dest.ugly}, {stmt.pred}, {stmt.dest.ugly}", + "", + ) elif stmt.expand: # TODO: unfinished - self.addLine(f"vpxor {stmt.temp.ugly}, {stmt.temp.ugly}, {stmt.temp.ugly}") + self.addLine( + f"vpxor {stmt.temp.ugly}, {stmt.temp.ugly}, {stmt.temp.ugly}" + ) regsize = stmt.dest.size() if self.precision == Precision.SINGLE and regsize == 32: self.addLine(f"vmovq {stmt.pred.ugly}, {stmt.dest.ugly_xmm}", "") - self.addLine(f"vpmovzxb{self.bpsuffix} {stmt.dest.ugly_xmm}, {stmt.dest.ugly}", "") - self.addLine(f"vpermd {src_str}, {stmt.dest.ugly}, {stmt.dest.ugly}", "") + self.addLine( + f"vpmovzxb{self.bpsuffix} {stmt.dest.ugly_xmm}, {stmt.dest.ugly}", + "", + ) + self.addLine( + f"vpermd {src_str}, {stmt.dest.ugly}, {stmt.dest.ugly}", "" + ) elif regsize == 16: - self.addLine(f"vpermilps {src_str}, MISSING_PREDICATE, {stmt.dest.ugly}", "") + self.addLine( + f"vpermilps {src_str}, MISSING_PREDICATE, {stmt.dest.ugly}", "" + ) elif self.precision == Precision.DOUBLE: - self.addLine(f"vpermpd {src_str}, MISSING_PREDICATE, {stmt.dest.ugly}", "") - self.addLine(f"vpblendd {stmt.temp.ugly}, {stmt.dest.ugly}, MISSING_PREDICATE, {stmt.dest.ugly}", "") + self.addLine( + f"vpermpd {src_str}, MISSING_PREDICATE, {stmt.dest.ugly}", "" + ) + self.addLine( + f"vpblendd {stmt.temp.ugly}, {stmt.dest.ugly}, MISSING_PREDICATE, {stmt.dest.ugly}", + "", + ) else: s = f"vmovup{self.psuffix} {src_str}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) @@ -152,7 +169,7 @@ def visitPrefetch(self, stmt: PrefetchStmt): def visitBlock(self, block: Block): self.stack.append(block) self.depth += 1 - if self.show_comments and block.comment != '': + if self.show_comments and block.comment != "": self.addLine(None, block.comment) for stmt in block.contents: stmt.accept(self) diff --git 
a/pspamm/codegen/architectures/hsw/operands.py b/pypspamm/codegen/architectures/hsw/operands.py similarity index 76% rename from pspamm/codegen/architectures/hsw/operands.py rename to pypspamm/codegen/architectures/hsw/operands.py index d26ba47..72e33b5 100644 --- a/pspamm/codegen/architectures/hsw/operands.py +++ b/pypspamm/codegen/architectures/hsw/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_HSW: @@ -20,14 +20,14 @@ def c(n): return Constant_HSW(value=int(n)) - class Label_HSW(Label): @property def ugly(self): - #return self.ordinal + # return self.ordinal return self.value.upper() + "_%=" + def l(label: str): return Label_HSW(label) @@ -37,7 +37,7 @@ class Register_HSW(Register): @property def ugly(self): return "%%" + self.value - + @property def ugly_xmm(self): return "%%x" + self.value[1:] @@ -50,20 +50,16 @@ def ugly_xmm(self): rdi = Register_HSW(AsmType.i64, "rdi") rsi = Register_HSW(AsmType.i64, "rsi") -r = lambda n: Register_HSW(AsmType.i64, "r"+str(n)) if n > 7 else gen_regs[n] -xmm = lambda n: Register_HSW(AsmType.f64x2, "xmm"+str(n)) -ymm = lambda n: Register_HSW(AsmType.f64x4, "ymm"+str(n)) - - +r = lambda n: Register_HSW(AsmType.i64, "r" + str(n)) if n > 7 else gen_regs[n] +xmm = lambda n: Register_HSW(AsmType.f64x2, "xmm" + str(n)) +ymm = lambda n: Register_HSW(AsmType.f64x4, "ymm" + str(n)) class MemoryAddress_HSW(MemoryAddress): - - def __init__(self, - base: Register, - disp: int, - index: Register = None, - scaling: int = None) -> None: + + def __init__( + self, base: Register, disp: int, index: Register = None, scaling: int = None + ) -> None: self.base = base self.disp = disp self.index = index @@ -74,15 +70,10 @@ def ugly(self): if self.index is None: return f"{self.disp}({self.base.ugly})" return f"{self.disp}({self.base.ugly},{self.index.ugly},{self.scaling})" - + def registers(self): return [self.base, self.index] + def mem(base, offset, index=None, scaling=None): return MemoryAddress_HSW(base, offset, index, scaling) - - - - - - diff --git a/pypspamm/codegen/architectures/knl/__init__.py b/pypspamm/codegen/architectures/knl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pspamm/codegen/architectures/knl/blocksize.py b/pypspamm/codegen/architectures/knl/blocksize.py similarity index 75% rename from pspamm/codegen/architectures/knl/blocksize.py rename to pypspamm/codegen/architectures/knl/blocksize.py index b9cd420..8c3afee 100644 --- a/pspamm/codegen/architectures/knl/blocksize.py +++ b/pypspamm/codegen/architectures/knl/blocksize.py @@ -4,16 +4,16 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bm = m bn = n - + if cls.KNL_condition(bm, bn, bk, v_size): - while cls.KNL_condition(bm, bn, bk+1, v_size): + while cls.KNL_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn) while not cls.KNL_condition(bm, bn, bk, v_size): bm, bn = cls.lowerToNextDiv(m, n, bm, bn, v_size) - while cls.KNL_condition(bm, bn, bk+1, v_size): + while cls.KNL_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn) @@ -35,7 +35,8 @@ def lowerToNextDiv(cls, m, n, bm, bn, v_size): def KNL_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 + return (bn + bk) * vm <= 32 + class Max: @classmethod @@ -45,16 +46,16 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = 1 maxval = 0 - for i in range(1, m+1): + for i in range(1, m + 1): next_multiple = -(i // -v_size) - for j in range(1, n+1): + for j in range(1, n + 1): if 
cls.KNL_condition(next_multiple, j, bk, v_size): - if i*j >= maxval: - maxval = i*j + if i * j >= maxval: + maxval = i * j bm = i - bn = j - - while cls.KNL_condition(bm, bn, bk+1, v_size): + bn = j + + while cls.KNL_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -63,12 +64,13 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def KNL_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 + return (bn + bk) * vm <= 32 @classmethod def tileable(cls, m, bm): return m % bm == 0 + class MaxBn: @classmethod def getBlocksize(cls, m, n, bk, v_size, prec): @@ -76,11 +78,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bm = v_size bn = 1 - for j in range(1, n+1): + for j in range(1, n + 1): if cls.KNL_condition(bm, j, bk, v_size): bn = j - while cls.KNL_condition(bm, bn, bk+1, v_size): + while cls.KNL_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -89,7 +91,8 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def KNL_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 + return (bn + bk) * vm <= 32 + class CubeBn: @classmethod @@ -100,11 +103,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): maxval = 0 - for j in range(1, n+1): + for j in range(1, n + 1): for k in range(1, 200): if cls.KNL_condition(bm, j, k, v_size): - if j*k >= maxval: - maxval = j*k + if j * k >= maxval: + maxval = j * k bn = j bk = k @@ -114,6 +117,7 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def KNL_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 + return (bn + bk) * vm <= 32 + Default = MaxBn diff --git a/pspamm/codegen/architectures/knl/generator.py b/pypspamm/codegen/architectures/knl/generator.py similarity index 51% rename from pspamm/codegen/architectures/knl/generator.py rename to pypspamm/codegen/architectures/knl/generator.py index f039f80..8da717b 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ b/pypspamm/codegen/architectures/knl/generator.py @@ -1,11 +1,11 @@ -from pspamm.cursors import * +from pypspamm.codegen.architectures.knl.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.regcache import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * -from pspamm.codegen.architectures.knl.operands import * -from pspamm.codegen.ast import * -from pspamm.codegen.sugar import * -from pspamm.codegen.generator import * -from pspamm.codegen.precision import * -from pspamm.codegen.regcache import * class Generator(AbstractGenerator): template = """ @@ -26,7 +26,7 @@ class Generator(AbstractGenerator): }} """ v_len = 4 - predicates = {0:kmask(0)} + predicates = {0: kmask(0)} def get_v_size(self): return (16 // self.precision.size()) * self.v_len @@ -39,7 +39,7 @@ def use_broadcast(self): def has_masks(self): return True - + def scale_base(self): # larger scaling range for B inline broadcasts return self.precision.size() * 256 @@ -47,44 +47,52 @@ def scale_base(self): def pred_n_trues(self, count, v_size, mode): # a bit hacky at the moment (won't work for all masks) if count < v_size: - return Predicate(self.predicates[count], mode=='z') + return Predicate(self.predicates[count], mode == "z") else: return None - + def make_argument_load(self, starting_regs, prefetch): asm = block("Load arguments") - asm.add(mov(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) - 
asm.add(mov(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) - asm.add(mov(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) - asm.add(mov(InputOperand(f'3', 'm', 'alpha_p'), starting_regs[3], False)) - asm.add(mov(InputOperand(f'4', 'm', 'beta_p'), starting_regs[4], False)) + asm.add(mov(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(mov(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(mov(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(mov(InputOperand(f"3", "m", "alpha_p"), starting_regs[3], False)) + asm.add(mov(InputOperand(f"4", "m", "beta_p"), starting_regs[4], False)) if prefetch: - asm.add(mov(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) + asm.add(mov(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) return asm - def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int, prefetch: str): + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): vm = self.ceil_div(bm, v_size) - assert((bn+bk) * vm <= 32) # Needs to fit in AVX512 xmm/ymm/zmm registers + assert (bn + bk) * vm <= 32 # Needs to fit in AVX512 xmm/ymm/zmm registers - vmm = { - 1: xmm, - 2: ymm, - 4: zmm - }[self.v_len] + vmm = {1: xmm, 2: ymm, 4: zmm}[self.v_len] - A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) + A_regs = Matrix([[vmm(vm * c + r) for c in range(bk)] for r in range(vm)]) B_regs = Matrix([[]]) - C_regs = Matrix([[vmm(32 - vm*bn + vm*c + r) for c in range(bn)] - for r in range(vm)]) + C_regs = Matrix( + [[vmm(32 - vm * bn + vm * c + r) for c in range(bn)] for r in range(vm)] + ) starting_regs = [rdi, rsi, rdx, rbx, rcx] alpha_reg = [rbx, rbx] beta_reg = [rcx, rcx] - additional_regs = [r(9),r(10),r(11),r(15),rax] # ,r(13),r(14) + additional_regs = [r(9), r(10), r(11), r(15), rax] # ,r(13),r(14) - prefetch_reg = prefetch == 'BL2viaC' + prefetch_reg = prefetch == "BL2viaC" if prefetch_reg: starting_regs += [r(8)] else: @@ -101,7 +109,18 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: self.predicates[rest2] = kmask(2) self.predicates[0] = kmask(0) - return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs, prefetch_reg + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + mask_regs, + prefetch_reg, + ) def init_mask(self, m, bm, v_size, tempreg, maskregs): rest = bm % v_size @@ -117,23 +136,22 @@ def init_mask(self, m, bm, v_size, tempreg, maskregs): asm.add(mov(tempreg, maskregs[1], False)) return asm - def make_scaling_offsets(self, - additional_regs: List[Register], - nnz: int - ) -> Block: + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: asm = block("Optimize usage of offsets when accessing B Matrix") scale = self.scale_base() for i in range(1, len(additional_regs)): - asm.add(mov(c((2*i-1) * scale), additional_regs[i], False)) - + asm.add(mov(c((2 * i - 1) * scale), additional_regs[i], False)) + return asm def init_block(self, size): return block("") - def reg_based_scaling(self, asm, addr: MemoryAddress, additional_regs: List[Register]): + def reg_based_scaling( + self, asm, addr: MemoryAddress, additional_regs: List[Register] + ): halfscale = self.scale_base() // 2 if addr.disp >= halfscale: base = (addr.disp + halfscale) // self.scale_base() @@ -148,20 +166,21 @@ def 
reg_based_scaling(self, asm, addr: MemoryAddress, additional_regs: List[Regi addr.scaling = scaling addr.disp = ((addr.disp + halfscale) % self.scale_base()) - halfscale - def move_register_block(self, - cursor: Cursor, - cursor_ptr: CursorLocation, - block_offset: Coords, - registers: Matrix[Register], - v_size: int, - additional_regs, - mask: Matrix[bool] = None, - store: bool = False, - prefetching: str = None, - load_offset: int = 0, - pf_cursor: Cursor = None, - pf_cursor_ptr: CursorLocation = None - ) -> Block: + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + ) -> Block: rows, cols = registers.shape action = "Store" if store else "Load" @@ -175,15 +194,21 @@ def move_register_block(self, for ic in range(cols): for ir in range(rows): - if (mask is None) or (mask[ir,ic]): + if (mask is None) or (mask[ir, ic]): # no register-based scaling here (for now) processed = ir * process_size size = min(process_size, b_row - processed) - all_coords = [Coords(down=ir*process_size+i,right=ic) for i in range(size)] - has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] + all_coords = [ + Coords(down=ir * process_size + i, right=ic) + for i in range(size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] if any(has_nonzero): contiguous = True firsti = 0 @@ -203,7 +228,9 @@ def move_register_block(self, lasti = i if lasti is None: lasti = size - addr, comment = cursor.look(cursor_ptr, block_offset, all_coords[firsti]) + addr, comment = cursor.look( + cursor_ptr, block_offset, all_coords[firsti] + ) addr.disp += self.precision.size() * load_offset # assume contiguous memory here @@ -219,10 +246,14 @@ def move_register_block(self, maskFound = True else: # mostly implemented, but there are still bugs - raise NotImplementedError("Element-wise sparsity in A is not yet implemented") + raise NotImplementedError( + "Element-wise sparsity in A is not yet implemented" + ) else: - raise NotImplementedError("Element-wise sparsity in A is not yet implemented") - + raise NotImplementedError( + "Element-wise sparsity in A is not yet implemented" + ) + if not maskFound: maskreg, needsAssign = maskcache.get(bitmask) if needsAssign: @@ -231,13 +262,33 @@ def move_register_block(self, pred = Predicate(maskreg, True) if store: - asm.add(mov(registers[ir,ic], addr, True, comment, pred=pred, expand=needsExpand)) - if prefetching == 'BL2viaC' and pf_cursor is not None: - addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, all_coords[firsti]) + asm.add( + mov( + registers[ir, ic], + addr, + True, + comment, + pred=pred, + expand=needsExpand, + ) + ) + if prefetching == "BL2viaC" and pf_cursor is not None: + addr, comment = pf_cursor.look( + pf_cursor_ptr, block_offset, all_coords[firsti] + ) addr.disp += self.precision.size() * load_offset asm.add(prefetch(addr, closeness="L2")) else: - asm.add(mov(addr, registers[ir,ic], True, comment, pred=pred, expand=needsExpand)) + asm.add( + mov( + addr, + registers[ir, ic], + True, + comment, + pred=pred, + expand=needsExpand, + ) + ) return asm def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: @@ -247,49 +298,65 @@ def make_zero_block(self, 
registers: Matrix[Register], additional_regs) -> Block for ic in range(cols): for ir in range(rows): - asm.add(mov(0, registers[ir,ic], True)) + asm.add(mov(0, registers[ir, ic], True)) return asm - - def make_microkernel(self, - A: Cursor, - B: Cursor, - A_ptr: CursorLocation, - B_ptr: CursorLocation, - A_regs: Matrix[Register], - B_regs, - C_regs: Matrix[Register], - v_size:int, - additional_regs, - to_A_block: Coords = Coords(), - to_B_block: Coords = Coords(), - sub: bool = False - ) -> Block: - - """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. - It is responsible for loading and unloading the A block, - It does not assume that the A or B cursors point to the start of the block. - Instead, the coordinates to the start of the block are passed separately. - It does not modify any cursor pointers. + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. """ asm = block("Block GEMM microkernel") - bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) - bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) - - mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True) - asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False)) + bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) + bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) + + mask = sparse_mask( + A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True + ) + asm.add( + self.move_register_block( + A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False + ) + ) Vm = max(self.ceil_div(bm, v_size), 1) - for bki in range(bk): # inside this k-block + for bki in range(bk): # inside this k-block for Vmi in range(Vm): - for bni in range(bn): # inside this n-block + for bni in range(bn): # inside this n-block to_bcell = Coords(down=bki, right=bni) - to_acell = Coords(down=Vmi*v_size, right=bki) - if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): B_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) self.reg_based_scaling(asm, B_addr, additional_regs) comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" - asm.add(fma(B_addr, A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=0, sub=sub)) + asm.add( + fma( + B_addr, + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + bcast=0, + sub=sub, + ) + ) return asm diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pypspamm/codegen/architectures/knl/inlineprinter.py similarity index 82% rename from pspamm/codegen/architectures/knl/inlineprinter.py rename to pypspamm/codegen/architectures/knl/inlineprinter.py index 3de0659..185f088 100644 --- 
a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pypspamm/codegen/architectures/knl/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * + +from pypspamm.codegen.ast import * +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -16,23 +17,27 @@ class InlinePrinter(Visitor): output = None stack = None - def __init__(self, precision: Precision): self.output = [] self.stack = [] - assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) + assert precision in ( + Precision.BFLOAT16, + Precision.HALF, + Precision.SINGLE, + Precision.DOUBLE, + ) self.precision = precision self.psuffix = { - Precision.DOUBLE: 'd', - Precision.SINGLE: 's', - Precision.HALF: 'h', - Precision.BFLOAT16: 'h' + Precision.DOUBLE: "d", + Precision.SINGLE: "s", + Precision.HALF: "h", + Precision.BFLOAT16: "h", }[precision] self.alupsuffix = { - Precision.DOUBLE: 'pd', - Precision.SINGLE: 'ps', - Precision.HALF: 'ph', - Precision.BFLOAT16: 'nepbf16' + Precision.DOUBLE: "pd", + Precision.SINGLE: "ps", + Precision.HALF: "ph", + Precision.BFLOAT16: "nepbf16", }[precision] self.bpsuffix = { Precision.DOUBLE: "q", @@ -44,16 +49,15 @@ def __init__(self, precision: Precision): Precision.DOUBLE: 2, Precision.SINGLE: 4, Precision.HALF: 8, - Precision.BFLOAT16: 8 + Precision.BFLOAT16: 8, }[precision] def show(self): print("\n".join(self.output)) - def addLine(self, stmt: str, comment: str): - line = " "*self.lmargin + self.indent*self.depth + line = " " * self.lmargin + self.indent * self.depth if stmt is not None and comment is not None and self.show_comments: stmt = '"' + stmt + '\\r\\n"' @@ -67,13 +71,13 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) - def maskformat(self, pred, ignoreZero = False): + def maskformat(self, pred, ignoreZero=False): if pred is None: - return '' + return "" elif pred.zero and not ignoreZero: - return f'%{{{pred.register.ugly}%}}%{{z%}}' + return f"%{{{pred.register.ugly}%}}%{{z%}}" else: - return f'%{{{pred.register.ugly}%}}' + return f"%{{{pred.register.ugly}%}}" def visitFma(self, stmt: FmaStmt): mask = self.maskformat(stmt.pred) @@ -113,9 +117,9 @@ def visitBcst(self, stmt: BcstStmt): a = stmt.dest.ugly regsize = stmt.dest.size() if self.precision == Precision.HALF or self.precision == Precision.BFLOAT16: - instruction = 'vpbroadcastw' + instruction = "vpbroadcastw" elif self.precision == Precision.DOUBLE and regsize == 16: - instruction = 'vmovddup' + instruction = "vmovddup" else: instruction = f"vbroadcasts{self.psuffix}" s = f"{instruction} {b}, {a} {mask}" @@ -125,13 +129,13 @@ def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: # avoid 0 instructions return - + # only used for scalar addition right now s = f"addq {stmt.src.ugly}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) def visitLabel(self, stmt: LabelStmt): - self.addLine('.align 16', 'Align label') + self.addLine(".align 16", "Align label") s = f"{stmt.label.ugly}:" self.addLine(s, stmt.comment) @@ -153,15 +157,15 @@ def visitMov(self, stmt: MovStmt): src_str = stmt.src.ugly if stmt.typ == AsmType.i64: - assert(stmt.pred == None) + assert stmt.pred == None # FIXME: no hack - if stmt.dest.ugly[2] == 'k': + if stmt.dest.ugly[2] == "k": s = f"kmovq {src_str}, 
{stmt.dest.ugly}" else: s = f"movq {src_str}, {stmt.dest.ugly}" elif stmt.typ == AsmType.f64x8 and stmt.aligned: if isinstance(stmt.src, Constant) and stmt.src.value == 0: - suffix = 'd' if self.bpsuffix == 'w' else self.bpsuffix + suffix = "d" if self.bpsuffix == "w" else self.bpsuffix s = f"vpxor{suffix} {stmt.dest.ugly}, {stmt.dest.ugly}, {stmt.dest.ugly} {mask}" elif stmt.expand: if isinstance(stmt.src, MemoryAddress): @@ -169,7 +173,7 @@ def visitMov(self, stmt: MovStmt): else: s = f"vpcompress{self.bpsuffix} {src_str}, {stmt.dest.ugly} {mask}" else: - if self.bpsuffix == 'w' and stmt.pred is not None: + if self.bpsuffix == "w" and stmt.pred is not None: instr = "vmovsh" else: instr = f"vmovup{self.psuffix}" @@ -198,7 +202,7 @@ def visitPrefetch(self, stmt: PrefetchStmt): def visitBlock(self, block: Block): self.stack.append(block) self.depth += 1 - if self.show_comments and block.comment != '': + if self.show_comments and block.comment != "": self.addLine(None, block.comment) for stmt in block.contents: stmt.accept(self) diff --git a/pspamm/codegen/architectures/knl/operands.py b/pypspamm/codegen/architectures/knl/operands.py similarity index 75% rename from pspamm/codegen/architectures/knl/operands.py rename to pypspamm/codegen/architectures/knl/operands.py index c9ed45e..572f893 100644 --- a/pspamm/codegen/architectures/knl/operands.py +++ b/pypspamm/codegen/architectures/knl/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_KNL: @@ -20,14 +20,14 @@ def c(n): return Constant_KNL(value=int(n)) - class Label_KNL(Label): @property def ugly(self): - #return self.ordinal + # return self.ordinal return self.value.upper() + "_%=" + def l(label: str): return Label_KNL(label) @@ -38,13 +38,12 @@ class Register_KNL(Register): def ugly(self): return "%%" + self.value + class MemoryAddress_KNL(MemoryAddress): - - def __init__(self, - base: Register, - disp: int, - index: Register = None, - scaling: int = None) -> None: + + def __init__( + self, base: Register, disp: int, index: Register = None, scaling: int = None + ) -> None: self.base = base self.disp = disp self.index = index @@ -59,10 +58,11 @@ def ugly(self): @property def clobbered(self): return self.base.clobbered - + def registers(self): return [self.base, self.index] + def mem(base, offset, index=None, scaling=None): return MemoryAddress_KNL(base, offset, index, scaling) @@ -74,25 +74,26 @@ def mem(base, offset, index=None, scaling=None): rdi = Register_KNL(AsmType.i64, "rdi") rsi = Register_KNL(AsmType.i64, "rsi") -r = lambda n: Register_KNL(AsmType.i64, "r"+str(n)) if n > 7 else gen_regs[n] -xmm = lambda n: Register_KNL(AsmType.f64x2, "xmm"+str(n)) -ymm = lambda n: Register_KNL(AsmType.f64x4, "ymm"+str(n)) -zmm = lambda n: Register_KNL(AsmType.f64x8, "zmm"+str(n)) -kmask= lambda n: Register_KNL(AsmType.i64, "k"+str(n)) +r = lambda n: Register_KNL(AsmType.i64, "r" + str(n)) if n > 7 else gen_regs[n] +xmm = lambda n: Register_KNL(AsmType.f64x2, "xmm" + str(n)) +ymm = lambda n: Register_KNL(AsmType.f64x4, "ymm" + str(n)) +zmm = lambda n: Register_KNL(AsmType.f64x8, "zmm" + str(n)) +kmask = lambda n: Register_KNL(AsmType.i64, "k" + str(n)) + class Predicate: def __init__(self, register: Register_KNL, zero: bool): self.register = register self.zero = zero - + @property def ugly(self): # TODO? 
return self.register.ugly - + @property def clobbered(self): return self.register.clobbered - + def registers(self): return [self.register] diff --git a/pypspamm/codegen/architectures/lsx/__init__.py b/pypspamm/codegen/architectures/lsx/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pspamm/codegen/architectures/lsx/blocksize.py b/pypspamm/codegen/architectures/lsx/blocksize.py similarity index 62% rename from pspamm/codegen/architectures/lsx/blocksize.py rename to pypspamm/codegen/architectures/lsx/blocksize.py index db4fa3c..1df01e6 100644 --- a/pspamm/codegen/architectures/lsx/blocksize.py +++ b/pypspamm/codegen/architectures/lsx/blocksize.py @@ -5,17 +5,19 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bn = 1 maxval = 0 - for i in range(v_size, m+1, v_size): - for j in range(1, n+1): + for i in range(v_size, m + 1, v_size): + for j in range(1, n + 1): # can be replaced by cls.LSX_condition_extended here # (but that seemed to be slower in the end) if cls.LSX_condition(i, j, bk, v_size): - if i*j > maxval and (cls.LSX_condition(i, j, bk, v_size) or j > 1): - maxval = i*j + if i * j > maxval and ( + cls.LSX_condition(i, j, bk, v_size) or j > 1 + ): + maxval = i * j bm = i - bn = j + bn = j - while cls.LSX_condition(bm, bn, bk+1, v_size): + while cls.LSX_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -26,4 +28,5 @@ def LSX_condition(cls, bm, bn, bk, v_size): vm = -(bm // -v_size) return (bn + bk) * vm + bn * bk <= 32 + Default = Max diff --git a/pypspamm/codegen/architectures/lsx/generator.py b/pypspamm/codegen/architectures/lsx/generator.py new file mode 100644 index 0000000..3a576b7 --- /dev/null +++ b/pypspamm/codegen/architectures/lsx/generator.py @@ -0,0 +1,307 @@ +from pypspamm.codegen.architectures.lsx.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.regcache import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * + + +class Generator(AbstractGenerator): + template = """ +void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, {real_type} alpha, {real_type} beta, {real_type} const* prefetch) {{ + __asm__ __volatile__( +{body_text} + : : {args} : {clobbered}); + + #ifndef NDEBUG + #ifdef _OPENMP + #pragma omp atomic + #endif + pspamm_num_total_flops += {flop}; + #endif +}} +""" + v_len = 2 + + def get_v_size(self): + return (16 // self.precision.size()) * self.v_len + + def get_template(self): + return Generator.template + + def use_broadcast(self): + return True + + def has_masks(self): + return False + + def init_mask(self, m, bm, v_size, tempreg, maskregs): + return block("") + + def make_argument_load(self, starting_regs, prefetch): + asm = block("Load arguments") + asm.add(ld(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(ld(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(ld(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(ld(InputOperand(f"3", "m", "alpha"), starting_regs[3], False)) + asm.add(ld(InputOperand(f"4", "m", "beta"), starting_regs[4], False)) + if prefetch: + asm.add(ld(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) + return asm + + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): + assert bm % v_size == 0 + vm = self.ceil_div(bm, v_size) + + assert (bn + bk) * vm + bn * bk <= 32 + + vmm = {1: vr, 2: 
xr}[self.v_len] + + A_regs = Matrix([[vmm(vm * c + r) for c in range(bk)] for r in range(vm)]) + Aoffset = vm * bk + + B_regs = Matrix( + [[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)] + ) + C_regs = Matrix( + [[vmm(32 - vm * bn + vm * c + r) for c in range(bn)] for r in range(vm)] + ) + + b_reg = Aoffset + alpha_reg = [vmm(b_reg)] * 2 + beta_reg = [vmm(b_reg + 1)] * 2 + + starting_regs = [r(10), r(11), r(12), r(13), r(14), r(6), r(5)] + + additional_regs = [r(15), r(16), r(17), r(31), r(7)] + + loop_regs = [r(28), r(29), r(30)] + + prefetch_reg = prefetch == "BL2viaC" + + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + [], + prefetch_reg, + ) + + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: + return block("") + + def init_block(self, size): + return block("") + + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + temp=None, + ) -> Block: + + rows, cols = registers.shape + action = "Store" if store else "Load" + asm = block(f"{action} {cursor.name} register block @ {block_offset}") + + max_offs = 2047 + cur11 = 0 + + for ic in range(cols): + for ir in range(rows): + if (mask is None) or (mask[ir, ic]): + all_coords = [ + Coords(down=ir * v_size + i, right=ic) for i in range(v_size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] + if all(has_nonzero): + cell_offset = all_coords[0] + addr, comment = cursor.look( + cursor_ptr, block_offset, cell_offset + ) + addr.disp += self.precision.size() * load_offset + needsmove = False + if addr.disp > max_offs: + moved = addr.disp - cur11 + if moved > 0 and moved <= max_offs: + addr.disp = moved + else: + asm.add( + add(addr.disp, additional_regs[0], "", addr.base) + ) + cur11 = addr.disp + addr.disp = 0 + needsmove = True + + addr.base = additional_regs[0] + if store: + asm.add(st(registers[ir, ic], addr, True, comment)) + if prefetching == "BL2viaC" and pf_cursor is not None: + addr, comment = pf_cursor.look( + pf_cursor_ptr, block_offset, cell_offset + ) + addr.disp += self.precision.size() * load_offset + if addr.disp > max_offs: + moved = addr.disp - cur11 + if needsmove: + asm.add( + add( + addr.disp, + additional_regs[3], + "", + addr.base, + ) + ) + addr.disp = 0 + else: + addr.disp = moved + addr.base = additional_regs[3] + asm.add(prefetch(addr, closeness="L2")) + else: + asm.add(ld(addr, registers[ir, ic], True, comment)) + elif any(has_nonzero): + raise NotImplementedError( + "Element-wise sparsity in A is not yet fully implemented." 
+ ) + return asm + + def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: + + rows, cols = registers.shape + asm = block("zero registers") + + for ic in range(cols): + for ir in range(rows): + asm.add(mov(0, registers[ir, ic], True)) + + return asm + + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. + """ + asm = block("Block GEMM microkernel") + bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) + bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) + assert bm % v_size == 0 + + mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) + asm.add( + self.move_register_block( + A, + A_ptr, + to_A_block, + A_regs, + v_size, + additional_regs, + mask, + store=False, + temp=B_regs[0, 0], + ) + ) + + Vm = self.ceil_div(bm, v_size) + cur11 = 0 + max_offs = 2047 + + bs = [] + for Vmi in range(Vm): + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) + if B_regs[bki, bni] not in bs: + # max_offs is the maximum allowed immediate offset when using ldrepl to broadcast a scalar value + if B_cell_addr.disp > max_offs: + moved = B_cell_addr.disp - cur11 + if moved > 0 and moved <= max_offs: + B_cell_addr.disp = moved + else: + asm.add( + add( + B_cell_addr.disp, + additional_regs[0], + "", + B_cell_addr.base, + ) + ) + cur11 = B_cell_addr.disp + B_cell_addr.disp = 0 + + B_cell_addr.base = additional_regs[0] + + asm.add(bcst(B_cell_addr, B_regs[bki, bni], B_comment)) + bs.append(B_regs[bki, bni]) + + for bki in range(bk): # inside this k-block + for Vmi in range(Vm): + for bni in range(bn): # inside this n-block + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + _, B_comment = B.look(B_ptr, to_B_block, to_bcell) + comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}" + asm.add( + fma( + B_regs[bki, bni], + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + bcast=None, + sub=sub, + ) + ) + return asm diff --git a/pspamm/codegen/architectures/lsx/inlineprinter.py b/pypspamm/codegen/architectures/lsx/inlineprinter.py similarity index 69% rename from pspamm/codegen/architectures/lsx/inlineprinter.py rename to pypspamm/codegen/architectures/lsx/inlineprinter.py index fe13715..199d9d3 100644 --- a/pspamm/codegen/architectures/lsx/inlineprinter.py +++ b/pypspamm/codegen/architectures/lsx/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from 
pspamm.codegen.operands import * -from pspamm.codegen.precision import * + +from pypspamm.codegen.ast import * +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class InlinePrinter(Visitor): @@ -16,27 +17,20 @@ class InlinePrinter(Visitor): output = None stack = None - def __init__(self, precision: Precision): self.output = [] self.stack = [] assert precision in (Precision.SINGLE, Precision.DOUBLE) self.precision = precision - self.psuffix = { - Precision.DOUBLE: "d", - Precision.SINGLE: "s" - }[precision] - self.bpsuffix = { - Precision.DOUBLE: "d", - Precision.SINGLE: "w" - }[precision] + self.psuffix = {Precision.DOUBLE: "d", Precision.SINGLE: "s"}[precision] + self.bpsuffix = {Precision.DOUBLE: "d", Precision.SINGLE: "w"}[precision] def show(self): print("\n".join(self.output)) def addLine(self, stmt: str, comment: str): - line = " "*self.lmargin + self.indent*self.depth + line = " " * self.lmargin + self.indent * self.depth if stmt is not None and comment is not None and self.show_comments: stmt = '"' + stmt + '\\r\\n"' @@ -51,16 +45,13 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) def prefix(self, register): - return { - 16: "v", - 32: "xv" - }[register.size()] - + return {16: "v", 32: "xv"}[register.size()] + def iname(self, root, refreg, bp): prefix = self.prefix(refreg) suffix = self.bpsuffix if bp else self.psuffix return f"{prefix}{root}.{suffix}" - + def to_addi(self, value): ADDILENGTH = 12 ADDIBLOCK = (1 << ADDILENGTH) - 1 @@ -98,9 +89,9 @@ def visitBcst(self, stmt: BcstStmt): a = stmt.dest.ugly # check if we broadcast a general register if isinstance(stmt.bcast_src, Register): - instruction = self.iname('replgr2vr', stmt.dest, True) + instruction = self.iname("replgr2vr", stmt.dest, True) else: - instruction = self.iname('ldrepl', stmt.dest, True) + instruction = self.iname("ldrepl", stmt.dest, True) s = f"{instruction} {a}, {b}" self.addLine(s, stmt.comment) @@ -108,7 +99,9 @@ def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: # avoid 0 instructions return - if isinstance(stmt.src, Constant) and (stmt.src.value > 2047 or stmt.src.value < -2048): + if isinstance(stmt.src, Constant) and ( + stmt.src.value > 2047 or stmt.src.value < -2048 + ): # we need an intermediate register here # TODO: do not hard-code x5 here, make well-defined @@ -118,18 +111,32 @@ def visitAdd(self, stmt: AddStmt): addival, luival = self.to_addi(-stmt.src.value) else: addival, luival = self.to_addi(stmt.src.value) - self.addLine(f"lu12i.w {itmp}, {luival}", f"Intermediate add: place upper 12 bits of {stmt.src.value}") + self.addLine( + f"lu12i.w {itmp}, {luival}", + f"Intermediate add: place upper 20 bits of {stmt.src.value}", + ) if addival != 0: - self.addLine(f"addi.d {itmp}, {itmp}, {addival}", f"Intermediate add: place lower 12 bits of {stmt.src.value}") + self.addLine( + f"addi.d {itmp}, {itmp}, {addival}", + f"Intermediate add: place lower 12 bits of {stmt.src.value}", + ) if stmt.src.value < 0: - self.addLine(f"sub.d {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment) + self.addLine( + f"sub.d {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment + ) else: - self.addLine(f"add.d {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment) + self.addLine( + f"add.d {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment + ) else: # if stmt.src is a Constant not handled above (i.e. -2048 <= value <= 2047), # we can simply add the 
Constant to a register - accumulate = stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly - self.addLine(f"addi.d {stmt.dest.ugly}, {accumulate}, {stmt.src.ugly}", stmt.comment) + accumulate = ( + stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly + ) + self.addLine( + f"addi.d {stmt.dest.ugly}, {accumulate}, {stmt.src.ugly}", stmt.comment + ) def visitLabel(self, stmt: LabelStmt): s = f"{stmt.label.ugly}:" @@ -145,24 +152,37 @@ def visitMov(self, stmt: MovStmt): if isinstance(stmt.src, Constant): if stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4]: - assert stmt.src.ugly == '0' - self.addLine(f"{self.prefix(stmt.dest)}ldi {stmt.dest.ugly}, {stmt.src.ugly}", stmt.comment) + assert stmt.src.ugly == "0" + self.addLine( + f"{self.prefix(stmt.dest)}ldi {stmt.dest.ugly}, {stmt.src.ugly}", + stmt.comment, + ) else: if stmt.src.value < 2**12: - self.addLine(f"addi.w {stmt.dest.ugly}, $r0, {stmt.src.value}", stmt.comment) + self.addLine( + f"addi.w {stmt.dest.ugly}, $r0, {stmt.src.value}", stmt.comment + ) elif stmt.src.value < 2**32: addival, luival = self.to_addi(stmt.src.value) - self.addLine(f"lu12i.w {stmt.dest.ugly}, {luival}", "Intermediate mov: place upper 12 bits") + self.addLine( + f"lu12i.w {stmt.dest.ugly}, {luival}", + "Intermediate mov: place upper 20 bits", + ) if addival != 0: - self.addLine(f"addi.w {stmt.dest.ugly}, {stmt.dest.ugly}, {addival}", stmt.comment) + self.addLine( + f"addi.w {stmt.dest.ugly}, {stmt.dest.ugly}, {addival}", + stmt.comment, + ) else: raise NotImplementedError() elif isinstance(stmt.src, Register): if stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4]: - iname = self.iname('replgr2vr', stmt.dest, True) + iname = self.iname("replgr2vr", stmt.dest, True) self.addLine(f"{iname} {stmt.dest.ugly}, {stmt.src.ugly}", stmt.comment) else: - self.addLine(f"addi.w {stmt.dest.ugly}, {stmt.src.ugly}, 0", stmt.comment) + self.addLine( + f"addi.w {stmt.dest.ugly}, {stmt.src.ugly}, 0", stmt.comment + ) else: raise NotImplementedError() @@ -176,14 +196,14 @@ def visitPrefetch(self, stmt: PrefetchStmt): # TODO: maybe preldx here? 
s = f"preld {hint}, {stmt.dest.ugly}" self.addLine(s, stmt.comment) - + def visitLoad(self, stmt: LoadStmt): if stmt.dest.typeinfo == AsmType.f64: s = f"fl{self.ugly_precision} {stmt.dest.ugly}, {stmt.src.ugly}" elif stmt.dest.typeinfo == AsmType.i64: s = f"ld.d {stmt.dest.ugly}, {stmt.src.ugly}" elif stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4] and stmt.aligned: - instr = f'{self.prefix(stmt.dest)}ld' + instr = f"{self.prefix(stmt.dest)}ld" s = f"{instr} {stmt.dest.ugly}, {stmt.src.ugly}" else: raise NotImplementedError() @@ -195,7 +215,7 @@ def visitStore(self, stmt: StoreStmt): elif stmt.src.typeinfo == AsmType.i64: s = f"st.d {stmt.src.ugly}, {stmt.dest.ugly}" elif stmt.src.typeinfo in [AsmType.f64x2, AsmType.f64x4] and stmt.aligned: - instr = f'{self.prefix(stmt.src)}st' + instr = f"{self.prefix(stmt.src)}st" s = f"{instr} {stmt.src.ugly}, {stmt.dest.ugly}" else: raise NotImplementedError() @@ -204,7 +224,7 @@ def visitStore(self, stmt: StoreStmt): def visitBlock(self, block: Block): self.stack.append(block) self.depth += 1 - if self.show_comments and block.comment != '': + if self.show_comments and block.comment != "": self.addLine(None, block.comment) for stmt in block.contents: stmt.accept(self) diff --git a/pspamm/codegen/architectures/lsx/operands.py b/pypspamm/codegen/architectures/lsx/operands.py similarity index 66% rename from pspamm/codegen/architectures/lsx/operands.py rename to pypspamm/codegen/architectures/lsx/operands.py index 5267726..5d9d5bd 100644 --- a/pspamm/codegen/architectures/lsx/operands.py +++ b/pypspamm/codegen/architectures/lsx/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_LSX: @@ -20,14 +20,14 @@ def c(n): return Constant_LSX(value=int(n)) - class Label_LSX(Label): @property def ugly(self): - #return self.ordinal + # return self.ordinal return self.value.upper() + "_%=" + def l(label: str): return Label_LSX(label) @@ -38,20 +38,17 @@ class Register_LSX(Register): def ugly(self): return "$" + self.value -r = lambda n: Register_LSX(AsmType.i64, "r"+str(n)) -vr = lambda n: Register_LSX(AsmType.f64x2, "vr"+str(n)) -xr = lambda n: Register_LSX(AsmType.f64x4, "xr"+str(n)) - +r = lambda n: Register_LSX(AsmType.i64, "r" + str(n)) +vr = lambda n: Register_LSX(AsmType.f64x2, "vr" + str(n)) +xr = lambda n: Register_LSX(AsmType.f64x4, "xr" + str(n)) class MemoryAddress_LSX(MemoryAddress): - - def __init__(self, - base: Register, - disp: int, - index: Register = None, - scaling: int = None) -> None: + + def __init__( + self, base: Register, disp: int, index: Register = None, scaling: int = None + ) -> None: self.base = base self.disp = disp self.index = index @@ -59,19 +56,14 @@ def __init__(self, @property def ugly(self): - #if self.index is None: + # if self.index is None: # return f"{self.disp}({self.base.ugly})" - #return f"{self.disp}({self.base.ugly},{self.index.ugly},{self.scaling})" + # return f"{self.disp}({self.base.ugly},{self.index.ugly},{self.scaling})" return f"{self.base.ugly},{self.disp}" - + def registers(self): return [self.base, self.index] + def mem(base, offset, index=None, scaling=None): return MemoryAddress_LSX(base, offset, index, scaling) - - - - - - diff --git a/pypspamm/codegen/architectures/rvv/__init__.py b/pypspamm/codegen/architectures/rvv/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pspamm/codegen/architectures/rvv/blocksize.py b/pypspamm/codegen/architectures/rvv/blocksize.py similarity index 72% rename from 
pspamm/codegen/architectures/rvv/blocksize.py rename to pypspamm/codegen/architectures/rvv/blocksize.py index be67492..9d06990 100644 --- a/pspamm/codegen/architectures/rvv/blocksize.py +++ b/pypspamm/codegen/architectures/rvv/blocksize.py @@ -5,11 +5,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): bm = v_size bn = 1 - for j in range(1, n+1): + for j in range(1, n + 1): if cls.RVV_condition(bm, j, bk, v_size): bn = j - while cls.RVV_condition(bm, bn, bk+1, v_size): + while cls.RVV_condition(bm, bn, bk + 1, v_size): bk += 1 return (bm, bn, bk) @@ -18,7 +18,8 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def RVV_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 and bn*bk + 2 <= 32 + return (bn + bk) * vm <= 32 and bn * bk + 2 <= 32 + class CubeBn: @classmethod @@ -29,11 +30,11 @@ def getBlocksize(cls, m, n, bk, v_size, prec): maxval = 0 - for j in range(1, n+1): + for j in range(1, n + 1): for k in range(1, 200): if cls.RVV_condition(bm, j, k, v_size): - if j*k >= maxval: - maxval = j*k + if j * k >= maxval: + maxval = j * k bn = j bk = k @@ -43,6 +44,7 @@ def getBlocksize(cls, m, n, bk, v_size, prec): def RVV_condition(cls, bm, bn, bk, v_size): # ceiling division vm = -(bm // -v_size) - return (bn+bk) * vm <= 32 and bn*bk + 2 <= 32 + return (bn + bk) * vm <= 32 and bn * bk + 2 <= 32 + Default = MaxBn diff --git a/pypspamm/codegen/architectures/rvv/generator.py b/pypspamm/codegen/architectures/rvv/generator.py new file mode 100644 index 0000000..f341e1f --- /dev/null +++ b/pypspamm/codegen/architectures/rvv/generator.py @@ -0,0 +1,445 @@ +from pypspamm.codegen.architectures.rvv.operands import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.generator import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.sugar import * +from pypspamm.cursors import * + + +class Generator(AbstractGenerator): + template = """ +void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, const {real_type} alpha, const {real_type} beta, const {real_type}* prefetch) {{{{ + __asm__ __volatile__( + {body_text} + : : {args} : {clobbered}); + + #ifndef NDEBUG + #ifdef _OPENMP + #pragma omp atomic + #endif + pspamm_num_total_flops += {flop}; + #endif +}}}}; +""" + + is_sparse = False + v_len = 1 # vector register length: v_len * 128 bit + predicates = {} + + def get_v_size(self): + return (16 // self.precision.size()) * self.v_len + + def get_precision(self): + return self.precision + + def get_template(self): + return self.template + + def use_broadcast(self): + return False + + def has_masks(self): + return False # not yet + + def pred_n_trues( + self, num_trues: int, v_size: int, suffix: str = None + ) -> Register_RV: + return None + + # is called at most one time in matmul.py + def set_sparse(self): + self.is_sparse = True + + def make_argument_load(self, starting_regs, prefetch): + asm = block("Load arguments") + asm.add(ld(InputOperand(f"0", "m", "A"), starting_regs[0], False)) + asm.add(ld(InputOperand(f"1", "m", "B"), starting_regs[1], False)) + asm.add(ld(InputOperand(f"2", "m", "C"), starting_regs[2], False)) + asm.add(ld(InputOperand(f"3", "m", "alpha"), starting_regs[3], False)) + asm.add(ld(InputOperand(f"4", "m", "beta"), starting_regs[4], False)) + if prefetch: + asm.add(ld(InputOperand(f"5", "m", "prefetch"), starting_regs[5], False)) + return asm + + def make_reg_blocks( + self, + bm: int, + bn: int, + bk: int, + v_size: int, + nnz: int, + m: int, + n: int, + k: int, + prefetch: str, + ): + 
vm = self.ceil_div( + bm, v_size + ) # plain floor division would give vm == 0 for bm < v_size, hence ceil_div + + assert bn * bk + 2 <= 32 + assert (bn + bk) * vm <= 32 + + prec = { + Precision.DOUBLE: "d", + Precision.SINGLE: "s", + Precision.HALF: "h", + Precision.BFLOAT16: "h", + }[self.get_precision()] + + A_regs = Matrix([[v(vm * c + r) for c in range(bk)] for r in range(vm)]) + B_regs = Matrix([[f(bn * r + c + 2) for c in range(bn)] for r in range(bk)]) + C_regs = Matrix( + [[v(32 - vm * bn + vm * c + r) for c in range(bn)] for r in range(vm)] + ) + + b_reg = 0 + alpha_reg = [f(0), f(0)] + beta_reg = [f(1), f(1)] + + # TODO: move x(5) out of here + starting_regs = [x(10), x(11), x(12), f(0), f(1), x(6), x(5)] + + additional_regs = [x(13), x(14), x(15), x(16), x(17), x(31), x(7)] + + loop_regs = [x(28), x(29), x(30)] + + mask_regs = [] + + prefetch_reg = prefetch is not None + + return ( + A_regs, + B_regs, + C_regs, + starting_regs, + alpha_reg, + beta_reg, + loop_regs, + additional_regs, + mask_regs, + prefetch_reg, + ) + + def make_scaling_offsets(self, additional_regs: List[Register], nnz: int) -> Block: + + asm = block("No register based scaling") + return asm + + def init_mask(self, m: int, bm: int, v_size: int, tempreg, maskreg) -> Block: + + asm = block("No masks used") + return asm + + def init_block(self, size): + if size < 32: + return rvsetvl(x(0), size) + else: + asm = block("Set vector length") + asm.add(mov(size, x(5), False)) + asm.add(rvsetvl(x(0), x(5))) + return asm + + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + is_B: bool = False, + ) -> Block: + + rows, cols = registers.shape + action = "Store" if store else "Load" + asm = block(f"{action} {cursor.name} register block @ {block_offset}") + prec = self.get_precision() + + b_row, b_col, i, _ = cursor.get_block(cursor_ptr, block_offset) + + cur11 = 0 + # TODO: figure out appropriate threshold (the 16 // self.v_len may still not be optimal; especially if 16 % self.v_len != 0, e.g. 384 bit) + threshold = ( + 1 if self.is_sparse else (16 // self.v_len) + ) # heuristic: process enough vectors per step to cover a whole cache line + + # vector register length in bytes (v_len * 128 bit) + mul_vl = ( + 16 * self.v_len + ) + max_mem_ins_mult = 0 + max_offset = 0 # unit-stride RVV loads/stores take no immediate offset, so no offset can be folded into the instruction + + prev_disp = 0 + prev_base = None + + process_size = min(v_size, cursor.br) + + for ic in range(cols): + for ir in range(rows): + if (mask is None) or (mask[ir, ic]): + all_coords = [ + Coords(down=ir * v_size + i, right=ic) + for i in range(process_size) + ] + has_nonzero = [ + cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) + for offset in all_coords + ] + if not any(has_nonzero): + continue + elif any(has_nonzero) and not all(has_nonzero) and not is_B: + raise NotImplementedError( + "Element-wise sparsity in A is not yet implemented." 
+ ) + + processed = ir * process_size + if processed >= b_row: + continue + p = ( + self.pred_n_trues(min(b_row - processed, process_size), v_size) + if not is_B + else self.pred_n_trues(process_size, v_size) + ) + p_zeroing = ( + self.pred_n_trues( + min(b_row - processed, process_size), v_size, "z" + ) + if not is_B + else self.pred_n_trues(process_size, v_size, "z") + ) + cell_offset = Coords(down=ir * v_size, right=ic) + + # addr = base "pointer" + relative offset in bytes + addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) + addr.disp += self.precision.size() * load_offset + + offset = addr.disp - prev_disp + + # count how many elements we have processed between last step and this step + cont_counter = offset // mul_vl + larger_max_offset = cont_counter > max_mem_ins_mult + non_dividing_offset = offset % mul_vl != 0 + + # adjust addr.disp to a multiple of the RVV vector length + if prev_base is None: + prev_base = addr.base + + if larger_max_offset or addr.disp > 0 or non_dividing_offset: + offset_comment = f"move to new vector" + if ( + offset < 2048 + and offset >= -2048 + and prev_base == additional_regs[0] + ): + asm.add(add(offset, additional_regs[0], offset_comment)) + else: + asm.add( + add( + addr.disp, + additional_regs[0], + offset_comment, + addr.base, + ) + ) + prev_disp = addr.disp + addr.base = additional_regs[0] + addr.disp = 0 + prev_base = additional_regs[0] + + if store: + asm.add( + st( + registers[ir, ic], + addr, + True, + comment, + pred=p, + scalar_offs=False, + add_reg=additional_regs[2], + ) + ) + # perform prefetching after a store instruction, similar to KNL case + if prefetching: + addr, comment = pf_cursor.look( + pf_cursor_ptr, block_offset, cell_offset + ) + addr.disp += self.precision.size() * load_offset + if prev_disp > 0: + asm.add( + add( + prev_disp, + additional_regs[3], + "increment the prefetch register", + addr.base, + ) + ) + asm.add( + prefetch( + mem( + ( + additional_regs[3] + if prev_disp > 0 + else addr.base + ), + addr.disp - prev_disp, + ), + "", + p, + prec, + access_type="r", + closeness="L2", + temporality="KEEP", + ) + ) + else: + asm.add( + ld( + addr, + registers[ir, ic], + True, + comment, + pred=p_zeroing, + is_B=is_B, + scalar_offs=False, + add_reg=additional_regs[2], + ) + ) + + return asm + + def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: + + rows, cols = registers.shape + asm = block("zero registers") + + for ic in range(cols): + for ir in range(rows): + asm.add(mov(0, registers[ir, ic], True)) + + return asm + + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + """make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. 
+ """ + + asm = block("Block GEMM microkernel") + """block_row, block_col, (start)index, pattern_matrix (true/false)""" + bm, bk, aidx, apattern = A.get_block(A_ptr, to_A_block) + bk, bn, bidx, bpattern = B.get_block(B_ptr, to_B_block) + + # tell sparse_mask() that we use sve + mask = sparse_mask( + A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size, True + ) + asm.add( + self.move_register_block( + A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False + ) + ) + + bs = [] + cur11 = -10000 + Vm = max(self.ceil_div(bm, v_size), 1) + + multiple = self.precision.size() + # for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252 + # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 + # in both cases: instruction encodes the immediate offset within 6 bits + max_offs = 2047 + + for Vmi in range(Vm): + # set to all v_size predicates to true, we want to replicate a B element into a whole vector + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) + if B_regs[bki, bni] not in bs: + + # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value + if B_cell_addr.disp > max_offs: + moved = B_cell_addr.disp - cur11 + if moved > 0 and moved <= max_offs: + B_cell_addr.disp = moved + else: + asm.add( + add( + B_cell_addr.disp, + additional_regs[0], + "", + B_cell_addr.base, + ) + ) + cur11 = B_cell_addr.disp + B_cell_addr.disp = 0 + + B_cell_addr.base = additional_regs[0] + + asm.add( + ld( + B_cell_addr, + B_regs[bki, bni], + False, + B_comment, + pred=None, + is_B=True, + ) + ) + bs.append(B_regs[bki, bni]) + + for bki in range(bk): # inside this k-block + for Vmi in range(Vm): + p_merging = self.pred_n_trues(bm - Vmi * v_size, v_size, "m") + end_index = ( + bm if Vmi + 1 == Vm else Vmi * v_size + v_size + ) # end_index helps us print the right index ranges + for bni in range(bn): # inside this n-block + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi * v_size, right=bki) + if B.has_nonzero_cell( + B_ptr, to_B_block, to_bcell + ) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + _, B_comment = B.look(B_ptr, to_B_block, to_bcell) + comment = f"C[{Vmi * v_size}:{end_index},{bni}] += A[{Vmi * v_size}:{end_index},{bki}]*{B_comment}" + + asm.add( + fma( + B_regs[bki, bni], + A_regs[Vmi, bki], + C_regs[Vmi, bni], + comment=comment, + pred=p_merging, + bcast=True, + sub=sub, + ) + ) + return asm diff --git a/pspamm/codegen/architectures/rvv/inlineprinter.py b/pypspamm/codegen/architectures/rvv/inlineprinter.py similarity index 77% rename from pspamm/codegen/architectures/rvv/inlineprinter.py rename to pypspamm/codegen/architectures/rvv/inlineprinter.py index 527eac8..234a68c 100644 --- a/pspamm/codegen/architectures/rvv/inlineprinter.py +++ b/pypspamm/codegen/architectures/rvv/inlineprinter.py @@ -1,8 +1,9 @@ from typing import List -from pspamm.codegen.ast import * -from pspamm.codegen.visitor import Visitor -from pspamm.codegen.operands import * -from pspamm.codegen.precision import * + +from pypspamm.codegen.ast import * +from pypspamm.codegen.operands import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.visitor import Visitor class 
InlinePrinter(Visitor): @@ -26,7 +27,12 @@ def __init__(self, precision: Precision): Precision.BFLOAT16: "h", }[self.precision] - assert precision in (Precision.BFLOAT16, Precision.HALF, Precision.SINGLE, Precision.DOUBLE) + assert precision in ( + Precision.BFLOAT16, + Precision.HALF, + Precision.SINGLE, + Precision.DOUBLE, + ) def to_addi(self, value): ADDILENGTH = 12 @@ -103,7 +109,9 @@ def visitAdd(self, stmt: AddStmt): if isinstance(stmt.src, Constant) and stmt.src.value == 0: # avoid 0 instructions return - if isinstance(stmt.src, Constant) and (stmt.src.value > 2047 or stmt.src.value < -2048): + if isinstance(stmt.src, Constant) and ( + stmt.src.value > 2047 or stmt.src.value < -2048 + ): # we need an intermediate register here # TODO: do not hard-code x5 here, make well-defined @@ -113,18 +121,32 @@ def visitAdd(self, stmt: AddStmt): addival, luival = self.to_addi(-stmt.src.value) else: addival, luival = self.to_addi(stmt.src.value) - self.addLine(f"lui {itmp}, {luival}", f"Intermediate add: place upper 12 bits of {stmt.src.value}") + self.addLine( + f"lui {itmp}, {luival}", + f"Intermediate add: place upper 20 bits of {stmt.src.value}", + ) if addival != 0: - self.addLine(f"addi {itmp}, {itmp}, {addival}", f"Intermediate add: place lower 12 bits of {stmt.src.value}") + self.addLine( + f"addi {itmp}, {itmp}, {addival}", + f"Intermediate add: place lower 12 bits of {stmt.src.value}", + ) if stmt.src.value < 0: - self.addLine(f"sub {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment) + self.addLine( + f"sub {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment + ) else: - self.addLine(f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment) + self.addLine( + f"add {stmt.dest.ugly}, {stmt.dest.ugly}, {tmp}", stmt.comment + ) else: # if stmt.src is a Constant not handled above (i.e. -2048 <= value <= 2047), # we can simply add the Constant to a register - accumulate = stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly - self.addLine(f"addi {stmt.dest.ugly}, {accumulate}, {stmt.src.ugly}", stmt.comment) + accumulate = ( + stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly + ) + self.addLine( + f"addi {stmt.dest.ugly}, {accumulate}, {stmt.src.ugly}", stmt.comment + ) def visitLabel(self, stmt: LabelStmt): s = f"{stmt.label.ugly}:" @@ -143,12 +165,20 @@ def visitMov(self, stmt: MovStmt): self.addLine(f"vmv.v.i {stmt.dest.ugly}, {stmt.src.ugly}", stmt.comment) else: if stmt.src.value < 2**12: - self.addLine(f"addi {stmt.dest.ugly}, x0, {stmt.src.value}", stmt.comment) + self.addLine( + f"addi {stmt.dest.ugly}, x0, {stmt.src.value}", stmt.comment + ) elif stmt.src.value < 2**32: addival, luival = self.to_addi(stmt.src.value) - self.addLine(f"lui {stmt.dest.ugly}, {luival}", "Intermediate mov: place upper 12 bits") + self.addLine( + f"lui {stmt.dest.ugly}, {luival}", + "Intermediate mov: place upper 20 bits", + ) if addival != 0: - self.addLine(f"addi {stmt.dest.ugly}, {stmt.dest.ugly}, {addival}", stmt.comment) + self.addLine( + f"addi {stmt.dest.ugly}, {stmt.dest.ugly}, {addival}", + stmt.comment, + ) else: raise NotImplementedError() elif isinstance(stmt.src, Register): @@ -188,11 +218,11 @@ def visitStore(self, stmt: StoreStmt): self.addLine(s, stmt.comment) def visitPrefetch(self, stmt: PrefetchStmt): - s = f'prefetch.r {stmt.dest.ugly}' + s = f"prefetch.r {stmt.dest.ugly}" self.addLine(s, stmt.comment) - + def visitRVSetVLStmt(self, stmt: RVSetVLStmt): - opcode = 'setivli' if isinstance(stmt.requested, Constant) else 
'setvli' + opcode = "setivli" if isinstance(stmt.requested, Constant) else "setvli" s = f"v{opcode} {stmt.actual.ugly}, {stmt.requested.ugly}, e{self.precision.size() * 8}" self.addLine(s, stmt.comment) @@ -209,7 +239,7 @@ def visitBlock(self, block: Block): def p_string(self, predicate: Register): # returns ", {predicate}" if a predicate is given, or an empty string "" otherwise # at this point the contents are already generated, we simply turn them into a string - return f', {predicate}' if predicate is not None else "" + return f", {predicate}" if predicate is not None else "" def render(s: AsmStmt): diff --git a/pspamm/codegen/architectures/rvv/operands.py b/pypspamm/codegen/architectures/rvv/operands.py similarity index 83% rename from pspamm/codegen/architectures/rvv/operands.py rename to pypspamm/codegen/architectures/rvv/operands.py index 54f4bda..9322130 100644 --- a/pspamm/codegen/architectures/rvv/operands.py +++ b/pypspamm/codegen/architectures/rvv/operands.py @@ -1,4 +1,4 @@ -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * class Operand_RV: @@ -12,6 +12,7 @@ class Constant_RV(Constant): def ugly(self): return str(self.value) + def c(n): """Sugar for conveniently defining integer constants""" return Constant_RV(value=int(n)) @@ -38,16 +39,12 @@ def ugly_precision(self): @property def ugly_lsl_shift(self): - return { - "d": 3, - "s": 2, - "h": 1 - }[self.ugly_precision] + return {"d": 3, "s": 2, "h": 1}[self.ugly_precision] @property def clobbered(self): # removed [this comment should stay here for now---in case there's some compiler expecting it]: .replace("x", "r") - return (self.value.split(".")[0]) + return self.value.split(".")[0] @property def ugly_scalar(self): @@ -55,7 +52,7 @@ def ugly_scalar(self): @property def ugly_scalar_1d(self): - #turns "Vn.2d" into "Dn" + # turns a vector register name like "v3" into the scalar name "d3" return (self.value.split(".")[0]).replace("v", "d") @@ -63,13 +60,14 @@ def ugly_scalar_1d(self): f = lambda n: Register_RV(AsmType.f64, "f" + str(n)) v = lambda n: Register_RV(AsmType.f64x8, "v" + str(n)) + class MemoryAddress_RV(MemoryAddress): @property def ugly(self): if self.disp == 0: - return f'({self.base.ugly})' + return f"({self.base.ugly})" else: - return f'{self.disp}({self.base.ugly})' + return f"{self.disp}({self.base.ugly})" @property def clobbered(self): diff --git a/pspamm/codegen/ast.py b/pypspamm/codegen/ast.py similarity index 71% rename from pspamm/codegen/ast.py rename to pypspamm/codegen/ast.py index ba743b4..c7f672d 100644 --- a/pspamm/codegen/ast.py +++ b/pypspamm/codegen/ast.py @@ -1,9 +1,9 @@ +from typing import TYPE_CHECKING, List -from typing import List, TYPE_CHECKING -from pspamm.codegen.operands import * +from pypspamm.codegen.operands import * if TYPE_CHECKING: - from pspamm.codegen.arm.visitors import Visitor + from pypspamm.codegen.arm.visitors import Visitor class AsmStmt: @@ -11,47 +11,68 @@ class AsmStmt: def accept(self, visitor: "Visitor"): raise Exception("AsmStmt is supposed to be abstract") - + def reg_in_candidate(self): return () - + def reg_out_candidate(self): return () - + def regs_in(self): - return set(reg for regc in self.reg_in_candidate() if regc is not None for reg in regc.registers() if isinstance(reg, Register)) + return set( + reg + for regc in self.reg_in_candidate() + if regc is not None + for reg in regc.registers() + if isinstance(reg, Register) + ) def regs_out(self): - return set(reg for regc in self.reg_out_candidate() if regc is not None for reg in regc.registers() if isinstance(reg, 
Register)) - + return set( + reg + for regc in self.reg_out_candidate() + if regc is not None + for reg in regc.registers() + if isinstance(reg, Register) + ) + def regs(self): return self.regs_in() | self.regs_out() - + def args_in(self): - return set(reg for reg in self.reg_in_candidate() if reg is not None and isinstance(reg, InputOperand)) + return set( + reg + for reg in self.reg_in_candidate() + if reg is not None and isinstance(reg, InputOperand) + ) def args_out(self): - return set(reg for reg in self.reg_out_candidate() if reg is not None and isinstance(reg, InputOperand)) - + return set( + reg + for reg in self.reg_out_candidate() + if reg is not None and isinstance(reg, InputOperand) + ) + def barrier(self): return False - + def args(self): return self.args_in() | self.args_out() - + def normalize(self): yield self - + def flatten(self): yield self - + def stmtname(self): - return '???' - + return "???" + def __str__(self): - inregs = ', '.join(reg.ugly for reg in self.regs_in()) - outregs = ', '.join(reg.ugly for reg in self.regs_out()) - return f'{self.stmtname()} {inregs} -> {outregs}' + inregs = ", ".join(reg.ugly for reg in self.regs_in()) + outregs = ", ".join(reg.ugly for reg in self.regs_out()) + return f"{self.stmtname()} {inregs} -> {outregs}" + class GenericStmt(AsmStmt): operation = None @@ -71,15 +92,16 @@ class MovStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitMov(self) - + def reg_in_candidate(self): - return (self.src,self.temp,self.pred) - + return (self.src, self.temp, self.pred) + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'mov' + return "mov" + class LeaStmt(AsmStmt): src = None @@ -91,15 +113,16 @@ class LeaStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitLea(self) - + def reg_in_candidate(self): - return (self.src,self.pred) - + return (self.src, self.pred) + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'lea' + return "lea" + class LoadStmt(AsmStmt): src = None @@ -117,15 +140,16 @@ class LoadStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitLoad(self) - + def reg_in_candidate(self): - return (self.src,self.pred,self.add_reg) - + return (self.src, self.pred, self.add_reg) + def reg_out_candidate(self): return (self.dest, self.dest2, self.dest3, self.dest4) - + def stmtname(self): - return 'load' + return "load" + class StoreStmt(AsmStmt): src = None @@ -142,15 +166,16 @@ class StoreStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitStore(self) - + def reg_in_candidate(self): return (self.src, self.src2, self.src3, self.src4, self.pred, self.add_reg) - + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'store' + return "store" + class PrefetchStmt(AsmStmt): dest = None @@ -161,9 +186,9 @@ class PrefetchStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitPrefetch(self) - + def stmtname(self): - return 'prefetch' + return "prefetch" class FmaStmt(AsmStmt): @@ -176,15 +201,16 @@ class FmaStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitFma(self) - + def reg_in_candidate(self): return (self.add_dest, self.bcast_src, self.mult_src, self.pred) - + def reg_out_candidate(self): return (self.add_dest,) - + def stmtname(self): - return 'fma' + return "fma" + class MulStmt(AsmStmt): src = None @@ -194,15 +220,16 @@ class MulStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitMul(self) - + def reg_in_candidate(self): - return 
(self.mult_src,self.src,self.pred) - + return (self.mult_src, self.src, self.pred) + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'mul' + return "mul" + class BcstStmt(AsmStmt): bcast_src = None @@ -211,15 +238,19 @@ class BcstStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitBcst(self) - + def reg_in_candidate(self): - return (self.bcast_src,self.pred,) - + return ( + self.bcast_src, + self.pred, + ) + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'broadcast' + return "broadcast" + class AddStmt(AsmStmt): src = None @@ -233,15 +264,16 @@ def accept(self, visitor: "Visitor"): def reg_in_candidate(self): if self.additional is not None: - return (self.src,self.dest,self.additional,self.pred) + return (self.src, self.dest, self.additional, self.pred) else: - return (self.src,self.dest,self.pred) - + return (self.src, self.dest, self.pred) + def reg_out_candidate(self): return (self.dest,) - + def stmtname(self): - return 'add' + return "add" + class CmpStmt(AsmStmt): lhs = None @@ -250,21 +282,22 @@ class CmpStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitCmp(self) - + def reg_in_candidate(self): - return (self.lhs,self.rhs,self.pred) - + return (self.lhs, self.rhs, self.pred) + def stmtname(self): - return 'cmp' + return "cmp" + class LabelStmt(AsmStmt): label = None def accept(self, visitor: "Visitor"): visitor.visitLabel(self) - + def __str__(self): - return f'Label: {self.label.ugly}' + return f"Label: {self.label.ugly}" class JumpStmt(AsmStmt): @@ -273,12 +306,13 @@ class JumpStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitJump(self) - + def reg_in_candidate(self): return (self.cmpreg,) def stmtname(self): - return 'branch' + return "branch" + class DataStmt(AsmStmt): value = None @@ -287,48 +321,59 @@ class DataStmt(AsmStmt): def accept(self, visitor: "Visitor"): visitor.visitData(self) + class RVSetVLStmt(AsmStmt): actual = None requested = None def accept(self, visitor: "Visitor"): visitor.visitRVSetVLStmt(self) - + def reg_in_candidate(self): return (self.requested,) - + def reg_out_candidate(self): return (self.actual,) - + def barrier(self): return True + class Block(AsmStmt): contents = [] def accept(self, visitor: "Visitor"): visitor.visitBlock(self) - + def normalize(self): - return (subcontent for content in self.contents for subcontent in content.normalize()) - + return ( + subcontent + for content in self.contents + for subcontent in content.normalize() + ) + def flatten(self): - return (subcontent for content in self.contents for subcontent in content.flatten()) - + return ( + subcontent for content in self.contents for subcontent in content.flatten() + ) + def regs_in(self): regs = set() for instr in self.contents: regs |= instr.regs_in() return regs - + def regs_out(self): regs = set() for instr in self.contents: regs |= instr.regs_out() return regs - + def __str__(self): - return 'block {\n' + '\n'.join(str(content) for content in self.contents) + '\n}' + return ( + "block {\n" + "\n".join(str(content) for content in self.contents) + "\n}" + ) + class Command(AsmStmt): name = None diff --git a/pypspamm/codegen/ccode.py b/pypspamm/codegen/ccode.py new file mode 100644 index 0000000..a1defaf --- /dev/null +++ b/pypspamm/codegen/ccode.py @@ -0,0 +1,45 @@ +import pypspamm.architecture +from pypspamm.codegen.analysis import * +from pypspamm.codegen.ast import * +from pypspamm.codegen.precision import * + + +def make_cfunc( + funcName: str, + template: 
str,
+    body: Block,
+    flop: int,
+    starting_regs: List[Register],
+    precision: Precision,
+) -> str:
+    Printer_class = pypspamm.architecture.get_class(
+        "pypspamm.codegen.architectures."
+        + pypspamm.architecture.arch
+        + ".inlineprinter"
+    ).InlinePrinter
+
+    printer = Printer_class(precision)
+    printer.lmargin = 4
+    body.accept(printer)
+    body_text = "\n".join(printer.output)
+
+    analyzer = Analyzer(starting_regs)
+    analyzer.collect(body)
+    regs = set(
+        f'"{reg.clobbered}"'
+        for reg in analyzer.clobbered_registers
+        if reg.clobbered is not None
+    )
+    regs.add('"memory"')
+    regs.add('"cc"')
+    # TODO: maybe regs.add('"redzone"') ?
+    clobbered = ", ".join(sorted(regs))
+    arglist = ", ".join(sorted(arg.arg for arg in analyzer.input_operands))
+    return template.format(
+        funcName=funcName,
+        body_text=body_text,
+        args=arglist,
+        clobbered=clobbered,
+        flop=flop,
+        real_type=Precision.getCType(precision),
+    )
diff --git a/pypspamm/codegen/forms.py b/pypspamm/codegen/forms.py
new file mode 100644
index 0000000..e5b7ff6
--- /dev/null
+++ b/pypspamm/codegen/forms.py
@@ -0,0 +1,136 @@
+from typing import List
+
+from pypspamm.codegen.sugar import *
+
+
+# TODO: We might eventually want to make this part of our syntax tree
+# in order to do unrolls and other fancy stuff with it
+class Loop(Block):
+
+    _labels = []
+
+    def __init__(
+        self,
+        iteration_var: Register,
+        final_val: int,
+        body_contents: Block = None,
+        unroll: int = 1,
+        overlap: bool = False,
+    ) -> None:
+
+        self.iteration_var = iteration_var
+        self.final_val = final_val
+        self.body_contents = body_contents
+        self.unroll = unroll
+        self.may_overlap = overlap
+
+        self.comment = f"loop {self.iteration_var.ugly} in range({self.final_val}), unroll {self.unroll}"
+
+    @property
+    def contents(self):
+        self.label = "loop_top_" + str(len(Loop._labels))
+        Loop._labels.append(self.label)
+
+        onestep = [*(self.body_contents.contents)]
+        body = []
+        rest = []
+        for _ in range(self.unroll):
+            body += onestep
+
+        for _ in range(self.final_val % self.unroll):
+            rest += onestep
+
+        true_final_val = (self.final_val // self.unroll) * self.unroll
+
+        allcode = []
+        if true_final_val == self.unroll:
+            allcode += body
+        elif true_final_val > self.unroll:
+            allcode += (
+                [
+                    mov(-true_final_val, self.iteration_var, vector=False),
+                    label(self.label),
+                ]
+                + body
+                + [
+                    add(self.unroll, self.iteration_var),
+                    jump(self.label, self.iteration_var, backwards=True),
+                ]
+            )
+        allcode += rest
+
+        return allcode
+
+    def body(self, *args):
+        self.body_contents = block("Loop body", *args)
+        return self
+
+    def normalize(self):
+        yield loop(
+            self.iteration_var, self.final_val, self.unroll, self.may_overlap
+        ).body(
+            *[
+                substmt
+                for stmt in self.body_contents.contents
+                for substmt in stmt.normalize()
+            ]
+        )
+
+    def __str__(self):
+        return (
+            f"loop {self.iteration_var.ugly} in range({self.final_val}), unroll {self.unroll}"
+            + "{\n"
+            + "\n".join(str(content) for content in self.body_contents.contents)
+            + "\n}"
+        )
+
+
+def loop(iter_var, final_val, unroll=1, overlap=False):
+    return Loop(iter_var, final_val, unroll=unroll, overlap=overlap)
+
+
+class Skip(Block):
+
+    _labels = []
+
+    def __init__(self, skipreg: Register) -> None:
+
+        self.skipreg = skipreg
+
+        self.comment = f"if {self.skipreg.ugly} != 0"
+
+    @property
+    def contents(self):
+        self.label = "skip_" + str(len(Skip._labels))
+        Skip._labels.append(self.label)
+
+        return (
+            [jump(self.label, self.skipreg, backwards=True)]
+            + self.body_contents.contents
+            + [label(self.label)]
+        )
+
+    def body(self, *args):
self.body_contents = block("Skip body", *args) + return self + + def normalize(self): + yield skip(self.checkreg).body( + *[ + substmt + for stmt in self.body_contents.contents + for substmt in stmt.normalize() + ] + ) + + def __str__(self): + return ( + f"if {self.checkreg} != 0" + + "{\n" + + "\n".join(str(content) for content in self.body_contents.contents) + + "\n}" + ) + + +def skip(checkreg): + return Skip(checkreg) diff --git a/pypspamm/codegen/generator.py b/pypspamm/codegen/generator.py new file mode 100644 index 0000000..051a2c0 --- /dev/null +++ b/pypspamm/codegen/generator.py @@ -0,0 +1,82 @@ +from abc import ABC, abstractmethod + +from pypspamm.codegen.ast import * +from pypspamm.codegen.precision import * +from pypspamm.cursors import * + + +class AbstractGenerator(ABC): + def __init__(self, precision: Precision): + self.precision = precision + + def get_precision(self): + return self.precision + + def set_sparse(self): + pass + + # taken from https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python + def ceil_div(self, n, d): + return -(n // -d) + + @abstractmethod + def init_mask(self, bm, v_size, tempreg, maskreg): + pass + + @abstractmethod + def use_broadcast(self): + pass + + @abstractmethod + def has_masks(self): + pass + + @abstractmethod + def get_v_size(self): + pass + + @abstractmethod + def get_template(self): + pass + + @abstractmethod + def make_reg_blocks( + self, bm: int, bn: int, bk: int, v_size: int, nnz: int, m: int, n: int, k: int + ): + pass + + @abstractmethod + def move_register_block( + self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + ) -> Block: + pass + + @abstractmethod + def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: + pass + + @abstractmethod + def make_microkernel( + self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size: int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False, + ) -> Block: + pass diff --git a/pspamm/codegen/operands.py b/pypspamm/codegen/operands.py similarity index 81% rename from pspamm/codegen/operands.py rename to pypspamm/codegen/operands.py index 52468f7..c470c17 100644 --- a/pspamm/codegen/operands.py +++ b/pypspamm/codegen/operands.py @@ -1,13 +1,29 @@ from enum import Enum -from typing import List, Dict +from typing import Dict, List + +AsmType = Enum( + "AsmType", + [ + "unknown", + "i8", + "i16", + "i32", + "i64", + "f32", + "f64", + "f32x4", + "f32x8", + "f32x16", + "f64x2", + "f64x4", + "f64x8", + "p64x8", + ], +) + +RegisterType = Enum("RegisterType", ["undefined", "scalar", "vector", "predicate"]) -AsmType = Enum('AsmType', ['unknown','i8','i16','i32','i64','f32','f64', - 'f32x4','f32x8','f32x16','f64x2','f64x4','f64x8', - 'p64x8']) - -RegisterType = Enum('RegisterType', ['undefined', 'scalar', 'vector', 'predicate']) - class Operand: @property def ugly(self): @@ -16,20 +32,23 @@ def ugly(self): def registers(self): return [] + # TODO: Rename this 'Immediate' class Constant(Operand): - def __init__(self, value:int) -> None: + def __init__(self, value: int) -> None: self.value = value @property def ugly(self): raise NotImplementedError() + class Label(Operand): _interns = {} _last = -1 + def __init__(self, value) -> None: - 
+        assert isinstance(value, str)
         self.value = value
         if value in Label._interns:
             self.ordinal = Label._interns[value]
@@ -42,12 +61,13 @@ def __init__(self, value) -> None:
     def ugly(self):
         raise NotImplementedError()
 
+
 class Register(Operand):
 
     def __init__(self, typeinfo, value) -> None:
         self.typeinfo = typeinfo
         self.value = str(value)
-
+
     def size(self):
         if self.typeinfo == AsmType.i8:
             return 1
@@ -81,41 +101,41 @@ def ugly(self):
     @property
     def clobbered(self):
         return self.value
-
+
     def registers(self):
         return [self]
-
+
     def __eq__(self, other):
         return self.ugly == other.ugly
-
+
    def __hash__(self):
        return hash(self.ugly)
 
+
 class MemoryAddress(Operand):
-    def __init__(self,
-                 base,
-                 disp) -> None:
+    def __init__(self, base, disp) -> None:
        self.base = base
        self.disp = disp
 
     @property
     def ugly(self):
         raise NotImplementedError()
-
+
     def registers(self):
         return [self.base]
 
+
 class InputOperand(Operand):
 
     def __init__(self, name, optype, source):
         self.name = str(name)
         self.optype = optype
         self.source = source
-
+
     @property
     def ugly(self):
-        return f'%{self.name}'
-
+        return f"%{self.name}"
+
     @property
     def arg(self):
         return f'"{self.optype}"({self.source})'
diff --git a/pypspamm/codegen/precision.py b/pypspamm/codegen/precision.py
new file mode 100644
index 0000000..2c5d125
--- /dev/null
+++ b/pypspamm/codegen/precision.py
@@ -0,0 +1,30 @@
+from enum import Enum
+
+
+class Precision(Enum):
+    DOUBLE = 8
+    SINGLE = 4
+    HALF = 2
+    BFLOAT16 = 2.1
+
+    @classmethod
+    def getCType(cls, precision):
+        ctype = {
+            cls.DOUBLE: "double",
+            cls.SINGLE: "float",
+            cls.HALF: "uint16_t",
+            cls.BFLOAT16: "uint16_t",
+        }
+        return ctype[precision]
+
+    def ctype(self):
+        return self.getCType(self)
+
+    def size(self):
+        return {self.DOUBLE: 8, self.SINGLE: 4, self.HALF: 2, self.BFLOAT16: 2}[self]
+
+    def __repr__(self):
+        return self.getCType(self)
+
+    def __str__(self):
+        return self.getCType(self)
diff --git a/pspamm/codegen/prune.py b/pypspamm/codegen/prune.py
similarity index 70%
rename from pspamm/codegen/prune.py
rename to pypspamm/codegen/prune.py
index fb8aa0e..cd1f2c3 100644
--- a/pspamm/codegen/prune.py
+++ b/pypspamm/codegen/prune.py
@@ -1,13 +1,18 @@
 from .ast import *
-from .operands import *
 from .forms import *
+from .operands import *
+
 
 def prune(block, toplevel=True):
     pruned = []
     cached = []
 
     for instr in block:
-        if isinstance(instr, AddStmt) and isinstance(instr.src, Constant) and instr.additional is None:
+        if (
+            isinstance(instr, AddStmt)
+            and isinstance(instr.src, Constant)
+            and instr.additional is None
+        ):
             combinedValue = instr.src.value
             for i, cinstr in enumerate(cached):
                 if cinstr.dest == instr.dest:
@@ -19,9 +24,11 @@ def prune(block, toplevel=True):
             pruned += cached
             cached = []
             if isinstance(instr, Loop):
-                instr.body_contents.contents = prune(instr.body_contents.contents, False)
+                instr.body_contents.contents = prune(
+                    instr.body_contents.contents, False
+                )
             pruned += [instr]
-
+
     if not toplevel:
         pruned += cached
     return pruned
diff --git a/pspamm/codegen/regcache.py b/pypspamm/codegen/regcache.py
similarity index 99%
rename from pspamm/codegen/regcache.py
rename to pypspamm/codegen/regcache.py
index 44bb0ae..f8e38ce 100644
--- a/pspamm/codegen/regcache.py
+++ b/pypspamm/codegen/regcache.py
@@ -1,11 +1,10 @@
-
 class RegisterCache:
     def __init__(self, registers):
        self.access = 0
        self.lru = [-1] * len(registers)
        self.registers = registers
        self.storage = {}
-
+
     def get(self, value):
         self.access += 1
diff --git
a/pspamm/codegen/schedule.py b/pypspamm/codegen/schedule.py similarity index 75% rename from pspamm/codegen/schedule.py rename to pypspamm/codegen/schedule.py index 1222832..3c4ae19 100644 --- a/pspamm/codegen/schedule.py +++ b/pypspamm/codegen/schedule.py @@ -1,15 +1,29 @@ from .ast import * -from .operands import * from .forms import * +from .operands import * + def isStore(instr): - return isinstance(instr, StoreStmt) or (isinstance(instr, MovStmt) and isinstance(instr.dest, MemoryAddress)) + return isinstance(instr, StoreStmt) or ( + isinstance(instr, MovStmt) and isinstance(instr.dest, MemoryAddress) + ) + def isScalar(instr): - return isinstance(instr, AddStmt) or (isinstance(instr, MovStmt) and isinstance(instr.dest, Register) and instr.typ == AsmType.i64) + return isinstance(instr, AddStmt) or ( + isinstance(instr, MovStmt) + and isinstance(instr.dest, Register) + and instr.typ == AsmType.i64 + ) + def isLoad(instr): - return isinstance(instr, LoadStmt) or (isinstance(instr, MovStmt) and isinstance(instr.src, MemoryAddress)) or isinstance(instr, BcstStmt) + return ( + isinstance(instr, LoadStmt) + or (isinstance(instr, MovStmt) and isinstance(instr.src, MemoryAddress)) + or isinstance(instr, BcstStmt) + ) + def hasDependency(instr1, instr2, rrt=False): ww = instr1.regs_out() & instr2.regs_out() @@ -18,6 +32,7 @@ def hasDependency(instr1, instr2, rrt=False): rr = instr1.regs_in() & instr2.regs_in() return len(ww) > 0 or len(wr) > 0 or len(rw) > 0 or (rrt and len(rr) > 0) + def moveLoads(block, isLoop=False): preprocessed = [] for instr in block: @@ -28,15 +43,28 @@ def moveLoads(block, isLoop=False): if instr.final_val == 1: preprocessed += prelude + postlude elif instr.final_val > 1: - preprocessed += prelude + [loop(instr.iteration_var, instr.final_val - 1, instr.unroll).body(*inner)] + postlude + preprocessed += ( + prelude + + [ + loop( + instr.iteration_var, instr.final_val - 1, instr.unroll + ).body(*inner) + ] + + postlude + ) else: inner = moveLoads(instr.body_contents.contents, False) - preprocessed += [loop(instr.iteration_var, instr.final_val, instr.unroll).body(*inner)] + preprocessed += [ + loop(instr.iteration_var, instr.final_val, instr.unroll).body( + *inner + ) + ] else: preprocessed += [instr] - + return moveLoadsBlock(preprocessed, isLoop) + def moveLoadsBlock(block, isLoop): reordered = [] currentLoads = [] @@ -60,14 +88,18 @@ def addReorderedLoad(j): delta += newdelta reordered.append(loadInstr) return 1 + delta + def addDependentLoads(instr, i): j = 0 while j < len(currentLoads): loadInstr = currentLoads[j] # for now, include read-read dependencies here - if hasDependency(instr, loadInstr): # or too far away insertCounter[j] < i + 4 + if hasDependency( + instr, loadInstr + ): # or too far away insertCounter[j] < i + 4 j -= addReorderedLoad(j) j += 1 + def preponeLoad(instr, i): maxI = [] for loadInstr in reversed(currentLoads): @@ -85,7 +117,7 @@ def preponeLoad(instr, i): else: addDependentLoads(instr, len(reordered)) reordered.append(instr) - + if isLoop: # pass again, but ignore loads postlude = list(reversed(reordered)) @@ -99,13 +131,13 @@ def preponeLoad(instr, i): else: addDependentLoads(instr, len(reordered) + len(postlude)) reordered.append(instr) - + # add loads/scalar instructions that would be materialized over 2 iterations only (?) 
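+        # (note: the guard below appends all but the last len(prelude) of the
+        #  still-pending loads; the dropped ones are presumably those already
+        #  hoisted into the loop prelude, so that no load stays pending across
+        #  more than one iteration boundary)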
- for i,instr in enumerate(currentLoads): + for i, instr in enumerate(currentLoads): if i + len(prelude) >= len(currentLoads): break reordered.append(instr) - + return prelude, list(reversed(reordered)), postlude else: for loadInstr in currentLoads: diff --git a/pspamm/codegen/sugar.py b/pypspamm/codegen/sugar.py similarity index 57% rename from pspamm/codegen/sugar.py rename to pypspamm/codegen/sugar.py index 178c75e..dc33495 100644 --- a/pspamm/codegen/sugar.py +++ b/pypspamm/codegen/sugar.py @@ -1,25 +1,42 @@ from typing import Union -from pspamm.codegen.ast import * -from pspamm.codegen.operands import * +import pypspamm.architecture +from pypspamm.codegen.ast import * +from pypspamm.codegen.operands import * -import pspamm.architecture # Convenient statement constructors -def add(src: Union[Operand, int], dest: Register, comment: str = None, additional: Register = None): +def add( + src: Union[Operand, int], + dest: Register, + comment: str = None, + additional: Register = None, +): stmt = AddStmt() - stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) + stmt.src = ( + src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + ) stmt.dest = dest stmt.comment = comment stmt.additional = additional return stmt + def label(name: str): stmt = LabelStmt() - stmt.label = pspamm.architecture.operands.l(name) + stmt.label = pypspamm.architecture.operands.l(name) return stmt -def fma(bcast_src: Register, mult_src: Register, add_dest: Register, comment: str = None, bcast: Union[int, None] = None, pred: Register = None, sub=False): + +def fma( + bcast_src: Register, + mult_src: Register, + add_dest: Register, + comment: str = None, + bcast: Union[int, None] = None, + pred: Register = None, + sub=False, +): stmt = FmaStmt() stmt.bcast_src = bcast_src stmt.mult_src = mult_src @@ -31,7 +48,14 @@ def fma(bcast_src: Register, mult_src: Register, add_dest: Register, comment: st stmt.sub = sub return stmt -def mul(src: Register, mult_src: Register, dest: Register, comment: str = None, pred: Register = None): + +def mul( + src: Register, + mult_src: Register, + dest: Register, + comment: str = None, + pred: Register = None, +): stmt = MulStmt() stmt.src = src stmt.mult_src = mult_src @@ -41,6 +65,7 @@ def mul(src: Register, mult_src: Register, dest: Register, comment: str = None, stmt.pred = pred return stmt + def bcst(bcast_src: Register, dest: Register, comment: str = None): stmt = BcstStmt() stmt.bcast_src = bcast_src @@ -48,21 +73,38 @@ def bcst(bcast_src: Register, dest: Register, comment: str = None): stmt.comment = comment return stmt + def cmp(lhs: Union[Operand, int], rhs: Union[Operand, int]): stmt = CmpStmt() - stmt.lhs = lhs if isinstance(lhs, Operand) else pspamm.architecture.operands.c(lhs) - stmt.rhs = rhs if isinstance(rhs, Operand) else pspamm.architecture.operands.c(rhs) + stmt.lhs = ( + lhs if isinstance(lhs, Operand) else pypspamm.architecture.operands.c(lhs) + ) + stmt.rhs = ( + rhs if isinstance(rhs, Operand) else pypspamm.architecture.operands.c(rhs) + ) return stmt -def jump(label: str, cmpreg = None, backwards=True): + +def jump(label: str, cmpreg=None, backwards=True): stmt = JumpStmt() - stmt.destination = pspamm.architecture.operands.l(label) + stmt.destination = pypspamm.architecture.operands.l(label) stmt.cmpreg = cmpreg return stmt -def mov(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, pred = None, expand=None, temp=None): + +def mov( + src: Union[Operand, int], + dest: Operand, + vector: bool, 
+ comment: str = None, + pred=None, + expand=None, + temp=None, +): stmt = MovStmt() - stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) + stmt.src = ( + src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + ) stmt.dest = dest stmt.comment = comment stmt.pred = pred @@ -76,7 +118,8 @@ def mov(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = Non stmt.typ = AsmType.i64 return stmt -def lea(src: Register, dest: Operand, offset: int, comment:str = None): + +def lea(src: Register, dest: Operand, offset: int, comment: str = None): stmt = LeaStmt() stmt.src = src stmt.dest = dest @@ -84,9 +127,26 @@ def lea(src: Register, dest: Operand, offset: int, comment:str = None): stmt.comment = comment return stmt -def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, dest2: Operand = None, pred: Register = None, is_B: bool = False, scalar_offs: bool = False, add_reg: AsmType.i64 = None, sub128: bool = False, expand=None, dest3: Operand = None, dest4: Operand = None): + +def ld( + src: Union[Operand, int], + dest: Operand, + vector: bool, + comment: str = None, + dest2: Operand = None, + pred: Register = None, + is_B: bool = False, + scalar_offs: bool = False, + add_reg: AsmType.i64 = None, + sub128: bool = False, + expand=None, + dest3: Operand = None, + dest4: Operand = None, +): stmt = LoadStmt() - stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) + stmt.src = ( + src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + ) stmt.dest = dest stmt.dest2 = dest2 stmt.dest3 = dest3 @@ -110,9 +170,24 @@ def ld(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None stmt.typ = AsmType.i64 return stmt -def st(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None, src2: Operand = None, pred: Register = None, scalar_offs: bool = False, add_reg: AsmType.i64 = None, expand=None, src3: Operand=None, src4: Operand=None): + +def st( + src: Union[Operand, int], + dest: Operand, + vector: bool, + comment: str = None, + src2: Operand = None, + pred: Register = None, + scalar_offs: bool = False, + add_reg: AsmType.i64 = None, + expand=None, + src3: Operand = None, + src4: Operand = None, +): stmt = StoreStmt() - stmt.src = src if isinstance(src, Operand) else pspamm.architecture.operands.c(src) + stmt.src = ( + src if isinstance(src, Operand) else pypspamm.architecture.operands.c(src) + ) stmt.src2 = src2 stmt.src3 = src3 stmt.src4 = src4 @@ -132,7 +207,16 @@ def st(src: Union[Operand, int], dest: Operand, vector: bool, comment:str = None stmt.typ = AsmType.i64 return stmt -def prefetch(dest: Operand, comment:str = None, pred: Register = None, precision: str = None, access_type: str = None, closeness: str = None, temporality: str = None): + +def prefetch( + dest: Operand, + comment: str = None, + pred: Register = None, + precision: str = None, + access_type: str = None, + closeness: str = None, + temporality: str = None, +): stmt = PrefetchStmt() stmt.dest = dest stmt.comment = comment @@ -144,18 +228,27 @@ def prefetch(dest: Operand, comment:str = None, pred: Register = None, precision stmt.temporality = temporality return stmt + def data(value: Union[Operand, int], asmType=AsmType.i64): stmt = DataStmt() - stmt.value = value if isinstance(value, Operand) else pspamm.architecture.operands.c(value) + stmt.value = ( + value if isinstance(value, Operand) else pypspamm.architecture.operands.c(value) + ) stmt.asmType = asmType return 
stmt + def rvsetvl(actual: Register, requested: Union[Register, int]): stmt = RVSetVLStmt() stmt.actual = actual - stmt.requested = requested if isinstance(requested, Operand) else pspamm.architecture.operands.c(requested) + stmt.requested = ( + requested + if isinstance(requested, Operand) + else pypspamm.architecture.operands.c(requested) + ) return stmt + # Fluent interface class BlockBuilder(Block): diff --git a/pspamm/codegen/virtual.py b/pypspamm/codegen/virtual.py similarity index 75% rename from pspamm/codegen/virtual.py rename to pypspamm/codegen/virtual.py index a44de74..1507ada 100644 --- a/pspamm/codegen/virtual.py +++ b/pypspamm/codegen/virtual.py @@ -1,8 +1,9 @@ from .operands import Register + class VirtualRegister(Register): def __init__(self, typeinfo, pool): - super().__init__(typeinfo, '') + super().__init__(typeinfo, "") self.register = None self.pool = pool @@ -16,30 +17,43 @@ def setRegister(register: Register): @property def ugly(self): - return self.register.ugly if self.register is not None else f'vreg{id(self)}' - + return self.register.ugly if self.register is not None else f"vreg{id(self)}" + @property def ugly_scalar_1d(self): - return self.register.ugly_scalar_1d if self.register is not None else f'vreg{id(self)}' - + return ( + self.register.ugly_scalar_1d + if self.register is not None + else f"vreg{id(self)}" + ) + @property def ugly_scalar(self): - return self.register.ugly_scalar if self.register is not None else f'vreg{id(self)}' - + return ( + self.register.ugly_scalar + if self.register is not None + else f"vreg{id(self)}" + ) + @property def ugly_xmm(self): - return self.register.ugly_xmm if self.register is not None else f'vreg{id(self)}' - + return ( + self.register.ugly_xmm if self.register is not None else f"vreg{id(self)}" + ) + @property def clobbered(self): - return self.register.clobbered if self.register is not None else f'vreg{id(self)}' - + return ( + self.register.clobbered if self.register is not None else f"vreg{id(self)}" + ) + def firstUsage(self): return None if len(self.usage) == 0 else self.usage[0] - + def lastUsage(self): return None if len(self.usage) == 0 else self.usage[-1] + class RegisterPool: def __init__(self, registers): self.registers = registers @@ -56,12 +70,14 @@ def assign(self, asm): if vreg.lastUsage() is instr: unlive.append(vreg.register) + def usagePass(asm): for instruction in asm.flatten(): for reg in instruction.regs(): if isinstance(reg, VirtualRegister): reg.usage += [instruction] + def assignVirtualRegisters(asm, pools): usagePass(asm) for pool in pools: diff --git a/pspamm/codegen/visitor.py b/pypspamm/codegen/visitor.py similarity index 97% rename from pspamm/codegen/visitor.py rename to pypspamm/codegen/visitor.py index 7e6f941..29d8b6c 100644 --- a/pspamm/codegen/visitor.py +++ b/pypspamm/codegen/visitor.py @@ -1,4 +1,5 @@ -from pspamm.codegen.ast import * +from pypspamm.codegen.ast import * + class Visitor: diff --git a/pypspamm/cursors/__init__.py b/pypspamm/cursors/__init__.py new file mode 100644 index 0000000..734be8c --- /dev/null +++ b/pypspamm/cursors/__init__.py @@ -0,0 +1,5 @@ +from pypspamm.cursors.abstractcursor import BlockInfo, Cursor, CursorLocation +from pypspamm.cursors.blockcursor import BlockCursor, sparse_mask +from pypspamm.cursors.coords import Coords +from pypspamm.cursors.densecursor import DenseCursor +from pypspamm.cursors.matrix import Matrix diff --git a/pspamm/cursors/abstractcursor.py b/pypspamm/cursors/abstractcursor.py similarity index 53% rename from 
pspamm/cursors/abstractcursor.py rename to pypspamm/cursors/abstractcursor.py index 3481830..e1dfca9 100644 --- a/pspamm/cursors/abstractcursor.py +++ b/pypspamm/cursors/abstractcursor.py @@ -1,24 +1,22 @@ -from pspamm.cursors.matrix import Matrix -from pspamm.cursors.coords import Coords - -from pspamm.codegen.operands import * -from pspamm.codegen.ast import AsmStmt, Command - +from collections import namedtuple from typing import List, Tuple -from collections import namedtuple +from pypspamm.codegen.ast import AsmStmt, Command +from pypspamm.codegen.operands import * +from pypspamm.cursors.coords import Coords +from pypspamm.cursors.matrix import Matrix BlockInfo = namedtuple("Blockinfo", ("br bc pattern_index pattern")) + class CursorLocation: current_block = None # Absolute coords of current block - current_cell = None # Relative? + current_cell = None # Relative? - def __init__(self, - current_block = Coords(absolute=True), - current_cell = Coords(absolute=False) - ) -> None: - assert(current_cell.absolute == False) + def __init__( + self, current_block=Coords(absolute=True), current_cell=Coords(absolute=False) + ) -> None: + assert current_cell.absolute == False self.current_block = current_block self.current_cell = current_cell @@ -48,35 +46,32 @@ def brf(self) -> int: def bcf(self) -> int: return self.c % self.bc - def move(self, - src: CursorLocation, - dest_block: Coords - ) -> Tuple[AsmStmt, CursorLocation]: + def move( + self, src: CursorLocation, dest_block: Coords + ) -> Tuple[AsmStmt, CursorLocation]: raise NotImplementedError() - def look(self, - src: CursorLocation, - dest_block: Coords, - dest_cell: Coords - ) -> Tuple[MemoryAddress, str]: + def look( + self, src: CursorLocation, dest_block: Coords, dest_cell: Coords + ) -> Tuple[MemoryAddress, str]: raise NotImplementedError() - def start_location(self, dest_block: Coords = Coords(absolute=True)) -> CursorLocation: + def start_location( + self, dest_block: Coords = Coords(absolute=True) + ) -> CursorLocation: raise NotImplementedError() - def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockInfo: + def get_block( + self, src: CursorLocation = None, dest_block: Coords = None + ) -> BlockInfo: raise NotImplementedError() class CursorMovement(Command): matrix = None + class CursorLookup(MemoryAddress): matrix = None src = None dest = None - - - - - diff --git a/pypspamm/cursors/blockcursor.py b/pypspamm/cursors/blockcursor.py new file mode 100644 index 0000000..26a969c --- /dev/null +++ b/pypspamm/cursors/blockcursor.py @@ -0,0 +1,217 @@ +from typing import cast + +from pypspamm.codegen.sugar import * +from pypspamm.cursors.abstractcursor import * +from pypspamm.cursors.coords import Coords +from pypspamm.cursors.matrix import Matrix + + +class BlockCursor(Cursor): + + blocks = None + patterns = None + offsets = None + + def __init__( + self, + name: str, + base_ptr: Register, + rows: int, + cols: int, + ld: int, + block_rows: int, + block_cols: int, + scalar_bytes: int, + blocks: Matrix[int], + patterns: List[Matrix[bool]], + mtx_overhead, + ) -> None: + + self.name = name + self.base_ptr = base_ptr + self.scalar_bytes = scalar_bytes + self.r = rows + self.c = cols + self.ld = ld + self.br = block_rows + self.bc = block_cols + self.blocks = blocks + self.patterns = patterns + + self.offsets = Matrix.full(rows, cols, -1) + x = 0 + for i in range(self.c): + for j in range(self.r): + Bci = i // self.bc + Bri = j // self.br + index = cast(int, blocks[Bri, Bci]) + pattern = patterns[index] + if 
pattern[j % self.br, i % self.bc]: + self.offsets[j, i] = x + x += 1 + if ld != 0: + x += self.ld - self.r + x += mtx_overhead[i] + + def offset(self, src_loc: CursorLocation, dest_loc: CursorLocation) -> int: + + src_block = src_loc.current_block + src_cell = src_loc.current_cell + dest_block = dest_loc.current_block + dest_cell = dest_loc.current_cell + + if not dest_block.absolute: + dest_block += src_block + + assert src_block.absolute + assert not src_cell.absolute + assert not dest_cell.absolute + + src_cell += Coords(src_block.down * self.br, src_block.right * self.bc, True) + dest_cell += Coords(dest_block.down * self.br, dest_block.right * self.bc, True) + + src_offset = self.offsets[src_cell.down, src_cell.right] + dest_offset = self.offsets[dest_cell.down, dest_cell.right] + + if src_offset == -1 or dest_offset == -1: + raise Exception("Cursor location does not exist in memory!") + + return dest_offset + + def move( + self, src_loc: CursorLocation, dest_block: Coords + ) -> Tuple[AsmStmt, CursorLocation]: + + comment = f"Move {self.name} to {str(dest_block)}" + + if dest_block.absolute: + dest_loc = self.start_location(dest_block) + else: + dest_loc = self.start_location(dest_block + src_loc.current_block) + + offset_bytes = self.offset(src_loc, dest_loc) * self.scalar_bytes + + return add(offset_bytes, self.base_ptr, comment), dest_loc + + def look( + self, src_loc: CursorLocation, dest_block: Coords, dest_cell: Coords + ) -> Tuple[MemoryAddress, str]: + + dest_loc = CursorLocation(dest_block, dest_cell) + offset_bytes = self.offset(src_loc, dest_loc) * self.scalar_bytes + comment = f"{self.name}[{dest_block.down},{dest_block.right}][{dest_cell.down},{dest_cell.right}]" + + addr = pypspamm.architecture.operands.mem(self.base_ptr, offset_bytes) + + return (addr, comment) + + def get_block( + self, src: CursorLocation = None, dest_block: Coords = None + ) -> BlockInfo: + + if src is None: # Have dest_block but no src + assert dest_block is not None + assert dest_block.absolute == True + block_abs = dest_block + + elif dest_block is None: # Have src but no dest_block + assert src.current_block.absolute == True + block_abs = src.current_block + + elif dest_block.absolute: # Have src and absolute dest_block + block_abs = dest_block + + else: # Have both src and relative dest_block + assert src.current_block.absolute == True + block_abs = dest_block + src.current_block + + br = self.br if block_abs.down < self.Br else self.brf # TODO: Verify these + bc = self.bc if block_abs.right < self.Bc else self.bcf + index = self.blocks[block_abs.down, block_abs.right] + index = cast(int, index) # TODO: Overload functions correctly + pattern = self.patterns[index][0:br, 0:bc] + pattern = cast(Matrix[bool], pattern) + return BlockInfo(br, bc, index, pattern) + + def has_nonzero_cell( + self, src_loc: CursorLocation, dest_block: Coords, dest_cell: Coords + ) -> bool: + + assert not dest_cell.absolute + if not dest_block.absolute: + dest_block += src_loc.current_block + + dest_cell += Coords(dest_block.down * self.br, dest_block.right * self.bc, True) + return ( + self.offsets.shape[0] > dest_cell.down + and self.offsets.shape[1] > dest_cell.right + and self.offsets[dest_cell.down, dest_cell.right] != -1 + ) + + def has_nonzero_block(self, src: CursorLocation, dest_block: Coords) -> bool: + nonzero = False + br, bc, idx, pat = self.get_block(src, dest_block) + for bci in range(bc): + for bri in range(br): + if pat[bri, bci]: + nonzero = True + return nonzero + + def start_location( + self, 
dest_block: Coords = Coords(absolute=True) + ) -> CursorLocation: + + assert dest_block.absolute == True + br, bc, idx, pat = self.get_block(dest_block=dest_block) + for bci in range(bc): + for bri in range(br): + if pat[bri, bci]: + return CursorLocation( + dest_block, Coords(down=bri, right=bci, absolute=False) + ) + + raise Exception( + f"Block {dest_block} has no starting location because it is empty!" + ) + + def start(self) -> CursorLocation: + + Br, Bc = self.blocks.shape + for Bci in range(Bc): + for Bri in range(Br): + target_block = Coords(down=Bri, right=Bci, absolute=True) + if self.has_nonzero_block(None, target_block): + return self.start_location(target_block) + raise Exception("Matrix is completely empty!") + + +def sparse_mask( + A_regs: Matrix[Register], + A: Cursor, + A_ptr: CursorLocation, + A_block_offset: Coords, + B: Cursor, + B_ptr: CursorLocation, + B_block_offset: Coords, + v_size: int, + has_mask: bool = False, +) -> Matrix[bool]: + + Vr, Vc = A_regs.shape + mask = Matrix.full(Vr, Vc, False) + A_br, A_bc, A_idx, A_pat = A.get_block(A_ptr, A_block_offset) + B_br, B_bc, B_idx, B_pat = B.get_block(B_ptr, B_block_offset) + + if not has_mask: + assert ( + A_br % v_size == 0 + ) # bm must tile m exactly for now in non-mask-supporting ISAs + assert Vc >= A_bc # Matrix block must fit in register block + assert A_bc == B_br # Matrix blocks are compatible + + # Mask out registers not used in current block, including zero-rows of B and A + for Vci in range(A_bc): + if B_pat[Vci, :].any(axis=1): + mask[:, Vci] = A_pat[:, Vci] + + return mask diff --git a/pspamm/cursors/coords.py b/pypspamm/cursors/coords.py similarity index 63% rename from pspamm/cursors/coords.py rename to pypspamm/cursors/coords.py index fbdd817..176bed0 100644 --- a/pspamm/cursors/coords.py +++ b/pypspamm/cursors/coords.py @@ -6,33 +6,38 @@ # a logical block start, or a physical block start depending on context. # We are including a {relative|absolute} flag in order to reduce the number of methods. -C = namedtuple('C', 'down right absolute') +C = namedtuple("C", "down right absolute") C.__new__.__defaults__ = (0, 0, False) + class Coords(C): def copy(self): return Coords(self.down, self.right, self.absolute) - + def __add__(self, other): absolute = self.absolute | other.absolute - return Coords(self.down+other.down, self.right+other.right, absolute) + return Coords(self.down + other.down, self.right + other.right, absolute) def __sub__(self, other): - absolute = self.absolute != other.absolute # TODO: What is the math behind this? - return Coords(self.down-other.down, self.right-other.right, absolute) + absolute = ( + self.absolute != other.absolute + ) # TODO: What is the math behind this? 
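+        # (one way to read the flag math: an absolute Coords behaves like a
+        #  point and a relative Coords like a displacement vector, so
+        #  point - point = vector, point - vector = point and
+        #  vector - vector = vector, which is exactly the XOR computed here)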
+ return Coords(self.down - other.down, self.right - other.right, absolute) def __neg__(self, other): return Coords(-self.down, -self.right, self.absolute) def __eq__(self, other): - return self.down == other.down and \ - self.right == other.right and \ - self.absolute == other.absolute + return ( + self.down == other.down + and self.right == other.right + and self.absolute == other.absolute + ) def __repr__(self): if self.absolute: absolute = ", absolute" else: absolute = "" - return f"(d={self.down},r={self.right}{absolute})" \ No newline at end of file + return f"(d={self.down},r={self.right}{absolute})" diff --git a/pspamm/cursors/densecursor.py b/pypspamm/cursors/densecursor.py similarity index 53% rename from pspamm/cursors/densecursor.py rename to pypspamm/cursors/densecursor.py index 7a59e6a..7a800ab 100644 --- a/pspamm/cursors/densecursor.py +++ b/pypspamm/cursors/densecursor.py @@ -1,25 +1,27 @@ from typing import List, Tuple, cast -from pspamm.codegen.sugar import * -from pspamm.cursors import * - +from pypspamm.codegen.sugar import * +from pypspamm.cursors.abstractcursor import * +from pypspamm.cursors.matrix import * class DenseCursor(Cursor): - def __init__(self, - name: str, - base_ptr: Register, - rows:int, - cols:int, - ld: int, - block_rows: int, - block_cols: int, - scalar_bytes:int) -> None: + def __init__( + self, + name: str, + base_ptr: Register, + rows: int, + cols: int, + ld: int, + block_rows: int, + block_cols: int, + scalar_bytes: int, + ) -> None: self.name = name self.r, self.c = rows, cols - self.br, self.bc = block_rows, block_cols + self.br, self.bc = block_rows, block_cols self.pattern = Matrix.full(block_rows, block_cols, True) self.base_ptr = base_ptr @@ -30,31 +32,26 @@ def __init__(self, x = 0 for bci in range(self.bc): for bri in range(self.br): - self.offsets[bri,bci] = x + self.offsets[bri, bci] = x x += 1 - def offset(self, - src_block: Coords, - dest_block: Coords, - dest_cell: Coords - ) -> int: + def offset(self, src_block: Coords, dest_block: Coords, dest_cell: Coords) -> int: # TODO: Why not make offset compute the 1D distance # from current pointer to desired logical cell instead? 
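+        # (column-major layout: cell (bri, bci) of block (Bri, Bci) lives
+        #  (Bci * bc + bci) * ld + Bri * br + bri scalars past the base
+        #  pointer, as computed below)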
- assert(src_block.absolute == True) - assert(dest_cell.absolute == False) + assert src_block.absolute == True + assert dest_cell.absolute == False if not dest_block.absolute: dest_block += src_block Bri, Bci = dest_block.down, dest_block.right bri, bci = dest_cell.down, dest_cell.right - return (Bci*self.bc + bci) * self.ld + Bri*self.br + bri + return (Bci * self.bc + bci) * self.ld + Bri * self.br + bri - def move(self, - src: CursorLocation, - dest_block: Coords - ) -> Tuple[AsmStmt, CursorLocation]: + def move( + self, src: CursorLocation, dest_block: Coords + ) -> Tuple[AsmStmt, CursorLocation]: if dest_block.absolute: dest_block_abs = dest_block @@ -68,13 +65,11 @@ def move(self, dest = CursorLocation(dest_block_abs, src.current_cell) return (add(rel_offset, self.base_ptr, comment), dest) - def look(self, - src: CursorLocation, - dest_block: Coords, - dest_cell: Coords - ) -> Tuple[MemoryAddress, str]: + def look( + self, src: CursorLocation, dest_block: Coords, dest_cell: Coords + ) -> Tuple[MemoryAddress, str]: - assert(dest_cell.absolute == False) + assert dest_cell.absolute == False comment = f"{self.name} [{dest_block.down},{dest_block.right}] [{dest_cell.down},{dest_cell.right}]" @@ -82,37 +77,44 @@ def look(self, dest_offset_abs = self.offset(src.current_block, dest_block, dest_cell) rel_offset = self.scalar_bytes * (dest_offset_abs - src_offset_abs) - addr = pspamm.architecture.operands.mem(self.base_ptr, rel_offset) + addr = pypspamm.architecture.operands.mem(self.base_ptr, rel_offset) return (addr, comment) + def start_location( + self, dest_block: Coords = Coords(absolute=True) + ) -> CursorLocation: - def start_location(self, dest_block: Coords = Coords(absolute=True)) -> CursorLocation: - - assert(dest_block.absolute == True) - #TODO: Handle fringe case? + assert dest_block.absolute == True + # TODO: Handle fringe case? for bci in range(self.bc): for bri in range(self.br): if self.offsets[bri, bci] != -1: - return CursorLocation(dest_block, Coords(down=bri, right=bci, absolute=False)) + return CursorLocation( + dest_block, Coords(down=bri, right=bci, absolute=False) + ) - raise Exception(f"Block {dest_block} has no starting location because it is empty!") + raise Exception( + f"Block {dest_block} has no starting location because it is empty!" 
+ ) - def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockInfo: + def get_block( + self, src: CursorLocation = None, dest_block: Coords = None + ) -> BlockInfo: if src is None: - assert(dest_block is not None) - assert(dest_block.absolute == True) + assert dest_block is not None + assert dest_block.absolute == True block_abs = dest_block elif dest_block is None: - assert(src.current_block.absolute == True) + assert src.current_block.absolute == True block_abs = src.current_block else: - assert(src.current_block.absolute == True) - assert(dest_block.absolute == False) + assert src.current_block.absolute == True + assert dest_block.absolute == False block_abs = dest_block + src.current_block - br = self.br if block_abs.down < self.Br else self.brf #TODO: Verify these + br = self.br if block_abs.down < self.Br else self.brf # TODO: Verify these bc = self.bc if block_abs.right < self.Bc else self.bcf index = 0 pattern = self.pattern[0:br, 0:bc] @@ -121,12 +123,14 @@ def get_block(self, src: CursorLocation=None, dest_block: Coords=None) -> BlockI def has_nonzero_block(self, src: CursorLocation, dest_block: Coords) -> bool: return True - - def has_nonzero_cell(self, - src_loc: CursorLocation, - dest_block: Coords, - dest_cell: Coords) -> bool: - return self.offsets.shape[0] > dest_cell.down and self.offsets.shape[1] > dest_cell.right + + def has_nonzero_cell( + self, src_loc: CursorLocation, dest_block: Coords, dest_cell: Coords + ) -> bool: + return ( + self.offsets.shape[0] > dest_cell.down + and self.offsets.shape[1] > dest_cell.right + ) def start(self) -> CursorLocation: return CursorLocation() diff --git a/pspamm/cursors/matrix.py b/pypspamm/cursors/matrix.py similarity index 71% rename from pspamm/cursors/matrix.py rename to pypspamm/cursors/matrix.py index c939cb5..cc39709 100644 --- a/pspamm/cursors/matrix.py +++ b/pypspamm/cursors/matrix.py @@ -1,16 +1,17 @@ - - # Need a native Python matrix type. # Lists of lists are too cumbersome, and scipy does not understand typing. # Also don't want to introduce a hard dependence on scipy if not necessary. 
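+# A rough usage sketch: Matrix.full(2, 3, False) builds a 2x3 boolean
+# pattern, m[0, 1] = True sets a single cell, and indexing with a tuple of
+# slices such as m[0:2, 0:2] yields a sub-Matrix rather than a bare numpy
+# array.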
-from typing import TypeVar, Generic, Union, Tuple, List, overload, Any -from scipy.sparse import csc_matrix -from scipy.io import mmread, mmwrite -import numpy as np import random +from typing import Any, Generic, List, Tuple, TypeVar, Union, overload + +import numpy as np +from scipy.io import mmread, mmwrite +from scipy.sparse import csc_matrix + +T = TypeVar("T") + -T = TypeVar('T') class Matrix(Generic[T]): def __init__(self, data): @@ -23,16 +24,16 @@ def __init__(self, data): self.cols = self.shape[1] @classmethod - def full(cls, rows:int, cols:int, initial_value:T): + def full(cls, rows: int, cols: int, initial_value: T): """Create a brand new matrix of given size""" - return cls(np.full((rows,cols), initial_value)) + return cls(np.full((rows, cols), initial_value)) def __repr__(self): col_str = [] for ri in range(self.rows): row_str = [] for ci in range(self.cols): - row_str.append(str(self._underlying[ri,ci]).rjust(8)) + row_str.append(str(self._underlying[ri, ci]).rjust(8)) col_str.append("".join(row_str)) return "\n".join(col_str) @@ -40,11 +41,11 @@ def __eq__(self, other): return (self._underlying == other._underlying).all() @overload - def __getitem__(self, t: Tuple[slice,slice]) -> "Matrix[T]": + def __getitem__(self, t: Tuple[slice, slice]) -> "Matrix[T]": pass @overload - def __getitem__(self, t: Tuple[int,int]) -> T: + def __getitem__(self, t: Tuple[int, int]) -> T: pass def __getitem__(self, t) -> Union[T, "Matrix[T]"]: @@ -54,7 +55,7 @@ def __getitem__(self, t) -> Union[T, "Matrix[T]"]: else: return result - def __setitem__(self, cell:Tuple[int,int], value:T): + def __setitem__(self, cell: Tuple[int, int], value: T): self._underlying[cell] = value def __or__(self, other): @@ -68,12 +69,17 @@ def any(self, axis=None, out=None): def nnz(self, axis=None) -> Union[int, List[int]]: if axis is None: - return sum(self[r,c] != 0 for r in range(self.rows) - for c in range(self.cols)) + return sum( + self[r, c] != 0 for r in range(self.rows) for c in range(self.cols) + ) if axis == 1: - return [sum(self[r,c] != 0 for r in range(self.rows)) for c in range(self.cols)] + return [ + sum(self[r, c] != 0 for r in range(self.rows)) for c in range(self.cols) + ] if axis == 0: - return [sum(self[r,c] != 0 for c in range(self.cols)) for r in range(self.rows)] + return [ + sum(self[r, c] != 0 for c in range(self.cols)) for r in range(self.rows) + ] @classmethod def load_pattern(cls, filename) -> "Matrix[bool]": diff --git a/pypspamm/matmul.py b/pypspamm/matmul.py new file mode 100644 index 0000000..6bddab3 --- /dev/null +++ b/pypspamm/matmul.py @@ -0,0 +1,833 @@ +from typing import Tuple + +import numpy + +import pypspamm.architecture +from pypspamm.codegen.ast import * +from pypspamm.codegen.forms import * +from pypspamm.codegen.precision import * +from pypspamm.codegen.prune import * +from pypspamm.codegen.sugar import * +from pypspamm.codegen.virtual import * +from pypspamm.cursors import * + + +def decompose_pattern( + k, n, pattern: Matrix[bool], bk: int, bn: int +) -> Tuple[Matrix[int], List[Matrix[bool]]]: + Bk, Bn = k // bk, n // bn + patterns = [] + x = 0 + + n_overhead = n % bn + k_overhead = k % bk + + if n_overhead > 0: + Bn += 1 + if k_overhead > 0: + Bk += 1 + + blocks = Matrix.full(Bk, Bn, -1) + + for Bni in range(Bn): + for Bki in range(Bk): + if Bni + 1 == Bn and n_overhead > 0 and Bki + 1 == Bk and k_overhead > 0: + block = pattern[ + (Bki * bk) : ((Bki + 1) * bk + k_overhead), + (Bni * bn) : ((Bni) * bn + n_overhead), + ] + elif Bni + 1 == Bn and n_overhead > 0: + block 
= pattern[ + (Bki * bk) : ((Bki + 1) * bk), + (Bni * bn) : ((Bni) * bn + n_overhead), + ] + elif Bki + 1 == Bk and k_overhead > 0: + block = pattern[ + (Bki * bk) : ((Bki + 1) * bk + k_overhead), + (Bni * bn) : ((Bni + 1) * bn), + ] + else: + block = pattern[ + (Bki * bk) : ((Bki + 1) * bk), (Bni * bn) : ((Bni + 1) * bn) + ] + + blocks[Bki, Bni] = x + x += 1 + patterns.append(block) + + mtx_overhead = [0] * n + + for i in range(n): + for j in range(k, pattern.rows): + if pattern[j, i]: + mtx_overhead[i] += 1 + + return blocks, patterns, mtx_overhead + + +class MatMul: + def __init__( + self, + m: int, + n: int, + k: int, + lda: int, + ldb: int, + ldc: int, + alpha: str, + beta: str, + mtx_filename: str, + amtx_filename: str, + bmtx_filename: str, + mtx_format: str = "any", + output_funcname: str = None, + output_filename: str = None, + output_overwrite: bool = False, + bm: int = None, + bn: int = None, + bk: int = None, + arch: str = "knl", + precision: str = "d", + prefetching: str = None, + **kwargs, # Accept and ignore args which don't belong + ) -> None: + + self.m = m + self.n = n + self.k = k + + self.lda = lda + self.ldb = ldb + self.ldc = ldc + + try: + self.alpha = float(alpha) + except: + self.alpha = "generic" + try: + self.beta = float(beta) + except: + self.beta = "generic" + + if arch.startswith("skx"): + arch = "knl" + arch[3:] + + # hacky implementation of multi-register length + if arch.startswith("arm_sve"): + if len(arch) == 7: + v_len_regs = 4 # compatibility: arm_sve == arm_sve512 + else: + v_len_bits = int(arch[7:]) + assert v_len_bits % 128 == 0 and v_len_bits <= 2048 + v_len_regs = v_len_bits // 128 + arch = "arm_sve" + + if arch.startswith("knl"): + if len(arch) == 3: + v_len_regs = 4 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256, 512) + v_len_regs = v_len_bits // 128 + arch = "knl" + + if arch.startswith("hsw"): + if len(arch) == 3: + v_len_regs = 2 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256) + v_len_regs = v_len_bits // 128 + arch = "hsw" + + if arch.startswith("rvv"): + if len(arch) == 3: + v_len_regs = 1 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256, 512, 1024, 2048, 4096, 8192) + v_len_regs = v_len_bits // 128 + arch = "rvv" + + if arch.startswith("arm") and not arch.startswith("arm_sve"): + # only 128 supported + v_len_regs = 1 + arch = "arm" + + if arch.startswith("lsx"): + if len(arch) == 3: + v_len_regs = 1 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256) + v_len_regs = v_len_bits // 128 + arch = "lsx" + + if arch.startswith("lasx"): + if len(arch) == 4: + v_len_regs = 2 + else: + v_len_bits = int(arch[4:]) + assert v_len_bits in (128, 256) + v_len_regs = v_len_bits // 128 + arch = "lsx" + + self.arch = arch + assert precision.lower() in ["bf16", "h", "s", "d"] + self.precision = { + "h": Precision.HALF, + "s": Precision.SINGLE, + "d": Precision.DOUBLE, + "bf16": Precision.BFLOAT16, + }[precision.lower()] + + pypspamm.architecture.init() + pypspamm.architecture.arch = arch + pypspamm.architecture.Generator = pypspamm.architecture.get_class( + "pypspamm.codegen.architectures." + arch + ".generator" + ).Generator + pypspamm.architecture.operands = pypspamm.architecture.get_class( + "pypspamm.codegen.architectures." + arch + ".operands" + ) + pypspamm.architecture.blocksize = pypspamm.architecture.get_class( + "pypspamm.codegen.architectures." 
+ arch + ".blocksize" + ).Default + + self.generator = pypspamm.architecture.Generator(self.precision) + + # flag that determines if a matmul kernel uses sve instructions -> needed for sve predicates + self.masks = self.generator.has_masks() + # define which architectures need to use an explicit broadcast, necessary for alpha/beta values + self.use_bcst = self.generator.use_broadcast() + + self.generator.v_len = v_len_regs + + self.v_size = self.generator.get_v_size() + + if bk == None: + bk = 2 if arch == "knl" else 1 + + if bm == None or bn == None: + (self.bm, self.bn, self.bk) = pypspamm.architecture.blocksize.getBlocksize( + m, n, bk, self.v_size, self.precision + ) + else: + self.bm = bm + self.bn = bn + self.bk = bk + + self.prefetching = prefetching + + self.output_funcname = output_funcname + self.output_filename = output_filename + self.output_overwrite = output_overwrite + + if ldb == 0: + if bmtx_filename is None or bmtx_filename == "": + bmtx_filename = mtx_filename + bpattern = Matrix.load(bmtx_filename) + self.generator.set_sparse() + else: + bpattern = Matrix.full(k, n, True) + assert self.k <= ldb + + if lda == 0: + apattern = Matrix.load(amtx_filename) + self.generator.set_sparse() + else: + apattern = Matrix.full(m, k, True) + assert self.m <= lda + + self.bmtx_filename = bmtx_filename + self.amtx_filename = amtx_filename + self.mtx_format = mtx_format + + assert self.m <= ldc + + self.bnnz = bpattern.nnz() + self.annz = apattern.nnz() + + # compute flops by splitting into outer products over k + kannz = apattern.nnz(1) + kbnnz = bpattern.nnz(0) + self.flop = 2 * sum(ka * kb for ka, kb in zip(kannz, kbnnz)) + + # if matrices are always padded to multiple of v_size, we can remove the if-part and execute the assert for SVE too + if not self.masks: + assert self.m % self.v_size == 0 + + ( + self.A_regs, + self.B_regs, + self.C_regs, + self.starting_regs, + self.alpha_reg, + self.beta_reg, + self.loop_regs, + self.additional_regs, + self.mask_regs, + self.prefetch_reg, + ) = self.generator.make_reg_blocks( + self.bm, + self.bn, + self.bk, + self.v_size, + self.bnnz, + self.m, + self.n, + self.k, + self.prefetching, + ) + + self.A_pool = RegisterPool( + [ + self.A_regs[i, j] + for i in range(self.A_regs.shape[0]) + for j in range(self.A_regs.shape[1]) + ] + ) + self.B_pool = RegisterPool( + [ + self.B_regs[i, j] + for i in range(self.B_regs.shape[0]) + for j in range(self.B_regs.shape[1]) + ] + ) + self.C_pool = RegisterPool( + [ + self.C_regs[i, j] + for i in range(self.C_regs.shape[0]) + for j in range(self.C_regs.shape[1]) + ] + ) + + self.alpha_bcst_reg, self.beta_bcst_reg = ( + self.starting_regs[3], + self.starting_regs[4], + ) + + if lda == 0: + blocks, patterns, mtx_overhead = decompose_pattern( + self.m, self.k, apattern, self.bm, self.bk + ) + self.A = BlockCursor( + "A", + self.starting_regs[0], + self.m, + self.k, + self.lda, + self.bm, + self.bk, + self.precision.size(), + blocks, + patterns, + mtx_overhead, + ) + self.annz += sum(mtx_overhead) + else: + self.A = DenseCursor( + "A", + self.starting_regs[0], + self.m, + self.k, + self.lda, + self.bm, + self.bk, + self.precision.size(), + ) + if ldb == 0: + blocks, patterns, mtx_overhead = decompose_pattern( + self.k, self.n, bpattern, self.bk, self.bn + ) + self.B = BlockCursor( + "B", + self.starting_regs[1], + self.k, + self.n, + self.ldb, + self.bk, + self.bn, + self.precision.size(), + blocks, + patterns, + mtx_overhead, + ) + self.bnnz += sum(mtx_overhead) + else: + self.B = DenseCursor( + "B", + 
self.starting_regs[1], + self.k, + self.n, + self.ldb, + self.bk, + self.bn, + self.precision.size(), + ) + self.C = DenseCursor( + "C", + self.starting_regs[2], + self.m, + self.n, + self.ldc, + self.bm, + self.bn, + self.precision.size(), + ) + self.C_pf = ( + DenseCursor( + "C_pf", + self.starting_regs[5], + self.m, + self.n, + self.ldc, + self.bm, + self.bn, + self.precision.size(), + ) + if self.prefetch_reg + else None + ) + + self.unroll_n = ldb == 0 + self.unroll_m = lda == 0 + + # use unused loop registers for scaling instead + if self.unroll_m: + self.additional_regs += [self.loop_regs[0]] + if self.unroll_n: + self.additional_regs += [self.loop_regs[1]] + if self.unroll_m or self.unroll_n: + self.additional_regs += [self.loop_regs[2]] + + def microkernel(self, asm, Bmi, Bni, unroll, A_ptr, B_ptr, C_ptr, C_pf_ptr): + Bn = self.n // self.bn + Bk = self.k // self.bk + Bm = self.m // self.bm + + vm = self.generator.ceil_div(self.bm, self.v_size) + + n_overhead = self.n % self.bn + k_overhead = self.k % self.bk + m_overhead = self.m % self.bm + vm_overhead = -(m_overhead // -self.v_size) + + if n_overhead > 0: + Bn += 1 + if k_overhead > 0: + Bk += 1 + if m_overhead > 0: + Bm += 1 + + regs = Matrix( + [ + [ + VirtualRegister(self.C_regs[0, 0].typeinfo, self.C_pool) + for _ in range(self.C_regs.shape[1]) + ] + for _ in range(self.C_regs.shape[0]) + ] + ) + + BnEnd = Bni + 1 == Bn + BmEnd = Bmi + 1 == Bm + + if BnEnd and n_overhead > 0: + regs = regs[:, :n_overhead] + if BmEnd and m_overhead > 0: + regs = regs[:vm_overhead, :] + + C_ptr_in = CursorLocation(Coords(right=Bni, down=Bmi, absolute=True)) + to_C = Coords() + C_ptr_pf_in = C_ptr_in + + if self.alpha in [-1.0, 1.0] and self.beta != 0.0: + asm.add( + self.generator.move_register_block( + self.C, + C_ptr_in, + to_C, + regs, + self.v_size, + self.additional_regs, + None, + False, + ) + ) + if self.beta != 1.0: + if self.use_bcst: + asm.add( + bcst(self.beta_bcst_reg, self.beta_reg[1], "Broadcast beta") + ) + for ic in range(regs.shape[1]): + for ir in range(regs.shape[0]): + pred_m = ( + None + if not self.masks + else self.generator.pred_n_trues( + self.bm - ir * self.v_size, self.v_size, "m" + ) + ) + asm.add( + mul( + regs[ir, ic], + self.beta_reg[1], + regs[ir, ic], + "C = beta * C", + pred=pred_m, + ) + ) + else: + asm.add(self.generator.make_zero_block(regs, self.additional_regs)) + + def kernelK(asm, Bki): + if unroll: + # adjust registers if necessary for the last operation + + if BmEnd and m_overhead > 0 and not self.unroll_m: + A_ptr_in = CursorLocation(Coords(right=0, down=Bmi, absolute=True)) + else: + A_ptr_in = A_ptr + to_A = ( + Coords(right=Bki, down=Bmi, absolute=True) + if self.unroll_m + else Coords(right=Bki) + ) + + if BnEnd and n_overhead > 0 and not self.unroll_n: + B_ptr_in = CursorLocation(Coords(down=0, right=Bni, absolute=True)) + else: + B_ptr_in = B_ptr + to_B = ( + Coords(right=Bni, down=Bki, absolute=True) + if self.unroll_n + else Coords(down=Bki) + ) + keep = ( + not self.unroll_n or self.B.has_nonzero_block(B_ptr_in, to_B) + ) and (not self.unroll_m or self.A.has_nonzero_block(A_ptr_in, to_A)) + else: + # setting A_ptr, B_ptr here may be a bit too hacky... 
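+                # what the branch below does: in the non-unrolled k-loop, one kernel
+                # body is reused for every block, so both cursors are pinned to
+                # absolute block coordinates, the relative offsets to_A/to_B stay
+                # empty, and keep = True emits the microkernel unconditionally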
+ A_ptr_in = CursorLocation(Coords(right=Bki, down=Bmi, absolute=True)) + B_ptr_in = CursorLocation(Coords(right=Bni, down=Bki, absolute=True)) + to_A = Coords() + to_B = Coords() + keep = True + + sub = self.alpha == -1.0 + + if keep: + A_regs = Matrix( + [ + [ + VirtualRegister(self.A_regs[0, 0].typeinfo, self.A_pool) + for _ in range(self.A_regs.shape[1]) + ] + for _ in range(self.A_regs.shape[0]) + ] + ) + B_regs = Matrix( + [ + [ + VirtualRegister(self.B_regs[0, 0].typeinfo, self.B_pool) + for _ in range(self.B_regs.shape[1]) + ] + for _ in range(self.B_regs.shape[0]) + ] + ) + asm.add( + self.generator.make_microkernel( + self.A, + self.B, + A_ptr_in, + B_ptr_in, + A_regs, + B_regs, + regs, + self.v_size, + self.additional_regs, + to_A, + to_B, + sub, + ) + ) + + self.loopwrap( + asm, + kernelK, + Bk, + k_overhead > 0, + unroll, + self.loop_regs[2], + [self.A, self.B], + [A_ptr, B_ptr], + ["right", "down"], + loopunroll=1, + overlap=True, + ) + + if self.alpha not in [-1.0, 1.0]: + store_block = block("") + + if self.use_bcst: + store_block.add( + bcst(self.alpha_bcst_reg, self.alpha_reg[1], "Broadcast alpha") + ) + if self.beta != 0.0 and self.beta != 1.0: + store_block.add( + bcst(self.beta_bcst_reg, self.beta_reg[1], "Broadcast beta") + ) + + for x in range(0, regs.shape[1], self.A_regs.shape[1]): + A_regs = Matrix( + [ + [ + VirtualRegister(self.A_regs[0, 0].typeinfo, self.A_pool) + for _ in range(self.A_regs.shape[1]) + ] + for _ in range(self.A_regs.shape[0]) + ] + ) + A_regs_cut = A_regs[ + 0 : min(self.A_regs.shape[0], regs.shape[0]), 0 : regs.shape[1] - x + ] + if self.beta != 0.0: + store_block.add( + self.generator.move_register_block( + self.C, + C_ptr_in, + to_C, + A_regs_cut, + self.v_size, + self.additional_regs, + None, + False, + None, + self.ldc * x, + ) + ) + + for ir in range(A_regs_cut.shape[0]): + for ic in range(A_regs_cut.shape[1]): + pred_m = ( + None + if not self.masks + else self.generator.pred_n_trues( + self.bm - ir * self.v_size, self.v_size, "m" + ) + ) + if self.beta != 0.0 and self.beta != 1.0: + store_block.add( + mul( + A_regs_cut[ir, ic], + self.beta_reg[1], + A_regs_cut[ir, ic], + "C = beta * C + alpha * AB", + pred=pred_m, + ) + ) + + if self.beta == 0.0: + store_block.add( + mul( + regs[ir, x + ic], + self.alpha_reg[1], + A_regs_cut[ir, ic], + "C = alpha * AB", + pred=pred_m, + ) + ) + else: + store_block.add( + fma( + regs[ir, x + ic], + self.alpha_reg[1], + A_regs_cut[ir, ic], + "C = C + alpha * AB", + None, + pred=pred_m, + ) + ) + store_block.add( + self.generator.move_register_block( + self.C, + C_ptr_in, + to_C, + A_regs_cut, + self.v_size, + self.additional_regs, + None, + True, + self.prefetching, + self.ldc * x, + self.C_pf, + C_pf_ptr, + ) + ) + asm.add(store_block) + else: + asm.add( + self.generator.move_register_block( + self.C, + C_ptr_in, + to_C, + regs, + self.v_size, + self.additional_regs, + None, + True, + self.prefetching, + 0, + self.C_pf, + C_pf_ptr, + ) + ) + + def blockloop(self, asm, A_ptr, B_ptr, C_ptr, C_pf_ptr): + Bn = self.n // self.bn + Bk = self.k // self.bk + Bm = self.m // self.bm + + vm = self.generator.ceil_div(self.bm, self.v_size) + + n_overhead = self.n % self.bn + k_overhead = self.k % self.bk + m_overhead = self.m % self.bm + vm_overhead = -(m_overhead // -self.v_size) + + if n_overhead > 0: + Bn += 1 + if k_overhead > 0: + Bk += 1 + if m_overhead > 0: + Bm += 1 + + argsA = [ + Bm, + m_overhead > 0, + self.unroll_m, + self.loop_regs[0], + [self.A], + [A_ptr], + ["down"], + ] + argsB = [ + Bn, + n_overhead 
> 0, + self.unroll_n, + self.loop_regs[1], + [self.B], + [B_ptr], + ["right"], + ] + + if self.unroll_n and not self.unroll_m: + # swap loops + outerArgs, innerArgs = (argsB, argsA) + dirC, dirC2 = ("down", "right") + args = lambda i, j: (j, i) + else: + outerArgs, innerArgs = (argsA, argsB) + dirC, dirC2 = ("right", "down") + args = lambda i, j: (i, j) + + unroll_k = self.unroll_m | self.unroll_n + + def outerLoop(asm, i): + def innerLoop(asm, j): + Bmi, Bni = args(i, j) + self.microkernel(asm, Bmi, Bni, unroll_k, A_ptr, B_ptr, C_ptr, C_pf_ptr) + if j < innerArgs[0] - 1: + move_C, _ = self.C.move(C_ptr, Coords(**{dirC: 1})) + asm.add(move_C) + if self.C_pf: + move_C_pf, _ = self.C_pf.move(C_pf_ptr, Coords(**{dirC: 1})) + asm.add(move_C_pf) + + overhead = self.loopwrap(asm, innerLoop, *innerArgs) + moveLength = 1 - innerArgs[0] if overhead else -innerArgs[0] + asm.add(self.C.move(C_ptr, Coords(**{dirC2: 1, dirC: moveLength}))[0]) + if self.C_pf: + asm.add( + self.C_pf.move(C_pf_ptr, Coords(**{dirC2: 1, dirC: moveLength}))[0] + ) + + self.loopwrap(asm, outerLoop, *outerArgs) + + def loopwrap( + self, + asm, + inner, + length, + overhead, + unroll, + loopreg, + matrices, + ptrs, + directions, + loopunroll=1, + overlap=False, + ): + if unroll: + for i in range(length): + inner(asm, i) + return True + else: + + def makeMove(dist): + asm = block(f"move by {dist}") + for matrix, ptr, direction in zip(matrices, ptrs, directions): + asm.add(matrix.move(ptr, Coords(**{direction: dist}))[0]) + return asm + + def makeLoop(until): + loopblock = block("kernel") + inner(loopblock, 0) + loopblock.add(makeMove(1)) + return loop(loopreg, until, unroll=loopunroll, overlap=overlap).body( + loopblock + ) + + if length == 1: + inner(asm, 0) + return True + elif overhead: + if length > 1: + asm.add(makeLoop(length - 1)) + inner(asm, length - 1) + asm.add(makeMove(1 - length)) + return True + else: + asm.add(makeLoop(length)) + asm.add(makeMove(-length)) + return False + + def make(self): + A_ptr = self.A.start() + B_ptr = self.B.start() + C_ptr = self.C.start() + C_pf_ptr = self.C_pf.start() if self.C_pf else None + + asm = block("kernel") + + asm.add( + self.generator.make_argument_load(self.starting_regs, self.C_pf is not None) + ) + + asm.add( + block( + "header", + self.generator.make_scaling_offsets(self.additional_regs, self.bnnz), + self.generator.init_mask( + self.m, self.bm, self.v_size, self.loop_regs[0], self.mask_regs + ), + ) + ) + + asm.add(self.generator.init_block(self.v_size)) + + self.blockloop(asm, A_ptr, B_ptr, C_ptr, C_pf_ptr) + + assignVirtualRegisters(asm, [self.A_pool, self.B_pool, self.C_pool]) + + return asm diff --git a/pypspamm/metagen/__init__.py b/pypspamm/metagen/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pypspamm/metagen/arm.py b/pypspamm/metagen/arm.py new file mode 100644 index 0000000..9576871 --- /dev/null +++ b/pypspamm/metagen/arm.py @@ -0,0 +1,9 @@ +def arm_basic(): + generator = MetaGenerator() + + generator.add_condition("", "arm128") + generator.add_condition("svcntb() == 16", "arm_sve128") + generator.add_condition("svcntb() == 32", "arm_sve256") + generator.add_condition("svcntb() == 64", "arm_sve512") + generator.add_condition("svcntb() == 128", "arm_sve1024") + generator.add_condition("svcntb() == 256", "arm_sve2048") diff --git a/pspamm/metagen/metagen.py b/pypspamm/metagen/metagen.py similarity index 52% rename from pspamm/metagen/metagen.py rename to pypspamm/metagen/metagen.py index 1b4408a..cd39423 100644 --- 
a/pspamm/metagen/metagen.py +++ b/pypspamm/metagen/metagen.py @@ -1,17 +1,20 @@ -from pspamm.matmul import MatMul -from pspamm.codegen.ccode import * +from pypspamm.codegen.ccode import * +from pypspamm.matmul import MatMul + class MetaGenerator: def __init__(self): self.conditions = [] self.archs = [] - + def add_condition(self, condition, arch): self.conditions += [condition] self.archs += arch - + def generate_meta(self, funcname, params): - condition_template = " if ({condition}) {{ func = {funcname}_{arch}; }}\n" + condition_template = ( + " if ({condition}) {{ func = {funcname}_{arch}; }}\n" + ) template = """ void {funcname}({params}) {{ @@ -27,23 +30,39 @@ def generate_meta(self, funcname, params): """ conditions = "" - for (condition, arch) in zip(self.conditions, self.archs): - conditions += condition_template.format(funcname=funcname, arch=arch, condition=condition) - + for condition, arch in zip(self.conditions, self.archs): + conditions += condition_template.format( + funcname=funcname, arch=arch, condition=condition + ) + return template.format(funcname=funcname, params=params, conditions=conditions) def generate(self, alg: MatMul): block = alg.make() - return make_cfunc(alg.output_funcname, alg.generator.get_template(), block, alg.flop, alg.starting_regs, alg.generator.get_precision()) + return make_cfunc( + alg.output_funcname, + alg.generator.get_template(), + block, + alg.flop, + alg.starting_regs, + alg.generator.get_precision(), + ) if len(self.archs) == 0: return "" - + if len(self.archs) == 1: block = alg.make() - return make_cfunc(alg.output_funcname, alg.generator.get_template(), block, alg.flop, alg.starting_regs, alg.generator.get_precision()) + return make_cfunc( + alg.output_funcname, + alg.generator.get_template(), + block, + alg.flop, + alg.starting_regs, + alg.generator.get_precision(), + ) # only generate the kernel; nothing else else: text = "" @@ -51,13 +70,20 @@ def generate(self, alg: MatMul): for arch in self.archs: block = alg.make() - funcname = f'{alg.output_funcname}_{arch}' + funcname = f"{alg.output_funcname}_{arch}" - func = make_cfunc(funcname, alg.generator.get_template(), block, alg.flop, alg.starting_regs, alg.generator.get_precision()) + func = make_cfunc( + funcname, + alg.generator.get_template(), + block, + alg.flop, + alg.starting_regs, + alg.generator.get_precision(), + ) text += f"static {func}\n\n" - params = f"const {alg.precision.ctype()}* A, const {alg.precision.ctype()}* B, {alg.precision.ctype()}* C, {alg.precision.ctype()} alpha, {alg.precision.ctype()} beta, const {alg.precision.ctype()}* prefetch" + params = f"const {alg.precision.ctype()}* A, const {alg.precision.ctype()}* B, {alg.precision.ctype()}* C, {alg.precision.ctype()} alpha, {alg.precision.ctype()} beta, const {alg.precision.ctype()}* prefetch" text += self.generate_meta(alg.output_funcname, params) text += "\n\n" diff --git a/requirements.txt b/requirements.txt index 2a378b7..4562c82 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,2 @@ numpy>=1.14.0 scipy>=1.0.0 -setuptools>=61.0.0 -wheel diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..e936f71 --- /dev/null +++ b/setup.py @@ -0,0 +1,36 @@ +import setuptools + +with open("pypspamm/VERSION", "r") as fh: + current_version = fh.read().strip() + +with open("README.md", "r") as fh: + long_description = fh.read() + +with open("requirements.txt", "r") as fh: + install_requires = [s.strip() for s in fh.readlines() if s.strip() != ""] + +setuptools.setup( + name="PSpaMM", + 
version=current_version, + license="BSD-3-Clause", + author="Peter Wauligmann, Nathan Brei, Alex Puscas, David Schneller", + author_email="david.schneller@tum.de", + description="An inline assembly generator for sparse matrix multiplications", + long_description=long_description, + long_description_content_type="text/markdown", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + ], + url="https://github.com/seissol/pspamm", + python_requires=">=3.7", + install_requires=install_requires, + include_package_data=True, + entry_points={ + "console_scripts": [ + "pspamm-generator = pypspamm.cli:main", + ] + }, +) diff --git a/tests/README.md b/tests/README.md index a21a9eb..2abea55 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,71 +1,27 @@ -# Guidelines on how to execute the NEON & SVE tests -### DISCLAIMER: -Some unit tests for SVE fail when including the gcc compiler flag "-mcpu=a64fx". -We assume that this flag makes the compiler optimize the unit tests in a way that breaks them. -Specifically, the values for `ldb`, `alpha`, and `beta` are sometimes set to 0 when calculating a reference solution which we compare to the solution of the PSpaMM kernel. -To fix this, the generated testsuite for Arm NEON and SVE saves certain values as variables before passing them to specific functions -instead of passing them as constant values. -## Compiling with gcc -A Makefile is provided, however only NEON and SVE related unit tests can be compiled at the moment. -Naturally, other compiler flags than the ones provided may be used. -Compiling the SVE testsuite with gcc 11.0.0 seems to break some test cases. Within the provided test setup, the values of certain parameters are overwritten after -specific tests, namely ```sve_arm_only_test15_23_6.h``` and ```sve_arm_only_test16_23_6.h```. This leads to a wrong reference solution which is then compared to -the one calculated by our generated kernel. -Example output when using GDB: -``` -Program received signal SIGSEGV, Segmentation fault. -#0 0x0000000000213dc4 in post (M=7, M@entry=23, N=N@entry=29, K=K@entry=31, LDA=7, LDA@entry=23, LDB=0x3feeb851eb851eb8, LDB@entry=0xffffffffa5ec, LDC=7, LDC@entry=23, A=A@entry=0x2fa8c0, - B=B@entry=0x2fbf40, C=C@entry=0x300cc0, Cref=Cref@entry=0x2ff7c0, DELTA=DELTA@entry=9.9999999999999995e-08, BETA=, ALPHA=, BETA=, - ALPHA=) at sve_testsuite.cpp:179 -#1 0x00000000002aecf4 in main () at sve_testsuite.cpp:619 -``` -Additional testing of gcc-based compilation is needed. Meanwhile, the SVE testsuite should be compiled with clang. -## Unit Tests -Unit tests for all 3 architectures (KNL, Arm NEON, Arm SVE) are provided. -The testsuite that corresponds to a unit test needs to be executed on the respective processor/architecture. -How to generate and execute a specific testsuite is shown below. -If nothing breaks, the generated testsuite reports the number of successful test case executions. -### KNL -1. Generate the testsuite by calling ```python3 unit_tests_knl.py``` -2. Adjust the Makefile as needed and compile the generated ```testsuite.cpp``` -3. Run the compiled executable +# Test Running Guidelines -### Arm NEON -1. Generate the testsuite by calling ```python3 unit_tests_arm.py``` -2. Adjust the Makefile as needed and compile the generated ```testsuite.cpp``` by calling -```make neon_testsuite``` -3. 
Run the compiled executable with ```./neon_testsuite```
+
+Run `runall-sve.sh`, which tests all configurations using software emulation. Note that on a processor that does not support AVX512/AVX10, you might need to comment out some tests, since there is no known emulation method for them yet.

-### Arm SVE
-1. Generate the testsuite by calling ```python3 unit_tests_arm_sve.py```
-2. Adjust the Makefile as needed and compile the generated ```sve_testsuite.cpp``` by calling
-```make sve_testsuite```
-3. Run the compiled executable with ```./sve_testsuite```
+## Running a single test

-#### Notes Running SVE with QEMU user-static
+Use `runlocal.sh` with the PSpaMM architecture of your choice (e.g. `knl512`). The script will also automatically execute the tests, unless you give it the `norun` flag as the second argument.

-Run `runall-sve.sh` which tests a bunch of configurations already.
+## Debugging

-For a bit length `BITLEN`, it executes the following commands:
-```
-# generate tests
-python unit_tests_arm_sve.py $BITLEN
+For debugging, for example for SVE with vector length 512
+(cf. ):

-# compile: we use AVM V8.2 and SVE; the SVE vector length is set explicitly
-aarch64-linux-gnu-g++ -static -march=armv8.2-a+sve -msve-vector-bits=${BITLEN} arm_sve${BITLEN}_testsuite.cpp -o sve${BITLEN}-test
-
-# run using QEMU, this way we may run on x86-64 as well; enable all features and constrain to sve${BITLEN} SVE registers maximum length (cf. https://qemu-project.gitlab.io/qemu/system/arm/cpu-features.html); the sve-default-vector-length=-1 parameter is needed for 1024 and 2048 bit SVE to work correctly (otherwise, QEMU will assume 512 bit maximum)
-qemu-aarch64-static -cpu max,sve${BITLEN}=on,sve-default-vector-length=-1 ./sve${BITLEN}-test
-```
-
-
-For debugging, for example for vector length 512 (cf. https://mariokartwii.com/showthread.php?tid=1998 ):
-```
+```bash
 aarch64-linux-gnu-g++ -g -ggdb -static -march=armv9-a+sve -msve-vector-bits=512 sve_testsuite.cpp
 qemu-aarch64-static -g 1234 -cpu max,sve512=on ./a.out
 ```
+
+(we use 1234 as the port here, and a.out as the filename)

-In a separate window, run `aarch64-linux-gnu-gdb --ex "target remote localhost:1234" --ex "file a.out"`.
-The extra commands already connect you with QEMU and attach you to the compiled binary file, so method names etc. are printed correctly.
-To run the program, just type `continue`. You may maybe want to set up breakpoints etc. before you do that.
+In a separate window, run
+`aarch64-linux-gnu-gdb --ex "target remote localhost:1234" --ex "file a.out"`.
+These extra commands connect you to QEMU
+and attach the debugger to the compiled binary,
+so method names etc. are printed correctly.
+To run the program, just type `continue`. You
+may want to set up breakpoints etc. before doing that.
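+
+As a concrete sketch of the `runlocal.sh` interface described under "Running a single test"
+above (only the architecture name and the optional `norun` flag are documented here; the
+architecture strings follow the PSpaMM naming, e.g. `knl512` or `arm_sve512`):
+
+```bash
+# generate and immediately run the testsuite for 512-bit SVE
+./runlocal.sh arm_sve512
+
+# only generate the knl512 testsuite, without executing it
+./runlocal.sh knl512 norun
+```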
diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py old mode 100755 new mode 100644 index 45d0376..9a77356 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -1,14 +1,19 @@ -from collections import namedtuple -import subprocess -import numpy as np +import os.path import random +import subprocess import sys -import os.path -from pspamm.codegen.precision import * +from collections import namedtuple + +import numpy as np + +from pypspamm.codegen.precision import * -BASEDIR = 'build' +BASEDIR = "build" -TestKernel = namedtuple('TestKernel', 'name precision m n k lda ldb ldc alpha beta block_sizes amtx bmtx delta') +TestKernel = namedtuple( + "TestKernel", + "name precision m n k lda ldb ldc alpha beta block_sizes amtx bmtx delta", +) head_of_testsuite = """#include #include @@ -78,9 +83,9 @@ T* C; int resA = posix_memalign(reinterpret_cast(&A), 64, LDA*LDB*sizeof(T)); - int resAsparse = posix_memalign(reinterpret_cast(&Asparse), 64, LDA*LDB*sizeof(T)); + int resAsparse = posix_memalign(reinterpret_cast(&Asparse), 64, LDA*LDB*sizeof(T)); int resB = posix_memalign(reinterpret_cast(&B), 64, LDB*N*sizeof(T)); - int resBsparse = posix_memalign(reinterpret_cast(&Bsparse), 64, LDB*N*sizeof(T)); + int resBsparse = posix_memalign(reinterpret_cast(&Bsparse), 64, LDB*N*sizeof(T)); int resCref = posix_memalign(reinterpret_cast(&Cref), 64, LDC*N*sizeof(T)); int resC = posix_memalign(reinterpret_cast(&C), 64, LDC*N*sizeof(T)); @@ -193,13 +198,13 @@ } gemm_ref(M, N, K, *LDA, *LDB, LDC, *ALPHA, *BETA, A, B, Cref); - + double diffAbsMax = 0; double diffRelMax = 0; int failedCount = 0; for(int i = 0; i < M; i++) { for(int j = 0; j < N; j++) { - // we use the relative error instead of the absolute error because of an issue we found for sparse single precision + // we use the relative error instead of the absolute error because of an issue we found for sparse single precision // kernels presumably due to limited precision of floats const double diffAbs = std::abs((static_cast(C[i + j * LDC]) - static_cast(Cref[i + j * LDC]))); const double diffRel = diffAbs / std::abs(static_cast(Cref[i + j * LDC])); @@ -247,7 +252,7 @@ setup_prefetch(prefetch, std::get<4>(pointers), {n}, {ldc}); {name}(std::get<{asparse}>(pointers), std::get<{bsparse}>(pointers), std::get<4>(pointers), {alpha}, {beta}, prefetch); const auto result = post<{precision}>({m}, {n}, {k}, &lda, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<2>(pointers), std::get<4>(pointers), std::get<5>(pointers), {delta:.15e}); - + if (result) {{ ++correct; }} @@ -272,12 +277,12 @@ def generateMTX(k, n, nnz, bk=1, bn=1): - random.seed(k*n + nnz) + random.seed(k * n + nnz) if k < bk: - bk = k + bk = k if n < bn: - bn = n + bn = n assert k % bk == 0 assert n % bn == 0 @@ -286,36 +291,43 @@ def generateMTX(k, n, nnz, bk=1, bn=1): true_nzz = nnz * bk * bn - os.makedirs(os.path.join(BASEDIR, 'mtx'), exist_ok=True) + os.makedirs(os.path.join(BASEDIR, "mtx"), exist_ok=True) - filename = os.path.join(BASEDIR, 'mtx', f'{k}-{bk}-{n}-{bn}-{nnz}.mtx') + filename = os.path.join(BASEDIR, "mtx", f"{k}-{bk}-{n}-{bn}-{nnz}.mtx") if os.path.isfile(filename): return filename - with open(filename, 'w') as f: + with open(filename, "w") as f: - f.write(f'%%MatrixMarket matrix coordinate real general\n%\n{k} {n} {true_nzz}') + f.write(f"%%MatrixMarket matrix coordinate real general\n%\n{k} {n} {true_nzz}") - zeros = set() + zeros = set() - for i in range(1, k + 1, bk): - for j in range(1, n + 1, bn): - zeros.add((i, j)) + for i in 
range(1, k + 1, bk): + for j in range(1, n + 1, bn): + zeros.add((i, j)) - nonzeros = random.sample(sorted(zeros), nnz) + nonzeros = random.sample(sorted(zeros), nnz) - for entry in nonzeros: - for ii in range(bk): - for jj in range(bn): - f.write('\n' + str(entry[0] + ii) + ' ' + str(entry[1] + jj) + ' ' + str(random.uniform(0.00001, 1000))) + for entry in nonzeros: + for ii in range(bk): + for jj in range(bn): + f.write( + "\n" + + str(entry[0] + ii) + + " " + + str(entry[1] + jj) + + " " + + str(random.uniform(0.00001, 1000)) + ) return filename def make(kernels, arch): os.makedirs(os.path.join(BASEDIR, arch), exist_ok=True) - f = open(os.path.join(BASEDIR, f'{arch}_testsuite.cpp'), 'w') + f = open(os.path.join(BASEDIR, f"{arch}_testsuite.cpp"), "w") f.write(head_of_testsuite) @@ -323,18 +335,29 @@ def make(kernels, arch): for kern in kernels: - arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), str(kern.ldb), - str(kern.ldc), str(kern.alpha), str(kern.beta)] + arguments = [ + "pspamm-generator", + str(kern.m), + str(kern.n), + str(kern.k), + str(kern.lda), + str(kern.ldb), + str(kern.ldc), + str(kern.alpha), + str(kern.beta), + ] if kern.amtx is not None: - arguments += ['--amtx_filename', kern.amtx] + arguments += ["--amtx_filename", kern.amtx] if kern.bmtx is not None: - arguments += ['--bmtx_filename', kern.bmtx] + arguments += ["--bmtx_filename", kern.bmtx] - prec = 's' if kern.precision == Precision.SINGLE else 'd' - arguments += ['--precision', prec] + prec = "s" if kern.precision == Precision.SINGLE else "d" + arguments += ["--precision", prec] - block_sizes = list(set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes)) + block_sizes = list( + set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes) + ) for bs in block_sizes: bm = bs[0] @@ -342,9 +365,9 @@ def make(kernels, arch): bk = bs[2] if arch.startswith("arm_sve"): - veclen = int(arch[7:]) if arch[7:] != '' else 128 + veclen = int(arch[7:]) if arch[7:] != "" else 128 else: - veclen = int(arch[3:]) if arch[3:] != '' else 128 + veclen = int(arch[3:]) if arch[3:] != "" else 128 assert veclen % 128 == 0 reglen = veclen // 128 v_len = (16 // kern.precision.size()) * reglen @@ -352,67 +375,113 @@ def make(kernels, arch): # ceiling division vm = -(bm // -v_len) v_size = v_len - elem128 = (16 // kern.precision.size()) + elem128 = 16 // kern.precision.size() if arch.startswith("knl"): - if not ((bn+bk) * vm <= 32): - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue + if not ((bn + bk) * vm <= 32): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue elif arch.startswith("hsw"): - if not ((bn+bk) * vm + bn * bk <= 16) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue + if ( + not ((bn + bk) * vm + bn * bk <= 16) + or not (kern.m % v_size) == 0 + or not (bm % v_size) == 0 + ): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue elif arch.startswith("arm_sve"): - vkext = -(bk // -elem128) - isvkext = bn*vkext <= 16 if elem128 == 2 else bn*vkext <= 8 - vk = vkext if isvkext else bk - if not ((bn+bk) * vm + bn * vk <= 32): - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue + vkext = -(bk // -elem128) + isvkext = bn * vkext <= 16 if elem128 == 2 else bn * vkext <= 8 + vk = vkext if isvkext else bk + if not ((bn + bk) * vm + bn * vk <= 32): + print(f"Skipping block size 
{bm}x{bn}x{bk} for {arch} / {prec}") + continue elif arch.startswith("arm"): - vk = -(bk // -elem128) - if not ((bn+bk) * vm + bn * vk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue + vk = -(bk // -elem128) + if ( + not ((bn + bk) * vm + bn * vk <= 32) + or not (kern.m % v_size) == 0 + or not (bm % v_size) == 0 + ): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue elif arch.startswith("rvv"): - if not ((bn+bk) * vm <= 32) or not (bn*bk <= 30) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue + if ( + not ((bn + bk) * vm <= 32) + or not (bn * bk <= 30) + or not (kern.m % v_size) == 0 + or not (bm % v_size) == 0 + ): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue elif arch.startswith("lsx") or arch.startswith("lasx"): - if not ((bn+bk) * vm + bn * bk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: - print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') - continue - - name = f'{kern.name}_{kern.precision}_{bm}_{bn}_{bk}' - - additional_args = ['--output_funcname', name, '--output_filename', os.path.join(BASEDIR, arch, name + '.h'), - '--output_overwrite'] - additional_args += ['--bm', str(bm), '--bn', str(bn), '--bk', str(bk), '--arch', arch] - additional_args += ['--prefetching', 'BL2viaC'] + if ( + not ((bn + bk) * vm + bn * bk <= 32) + or not (kern.m % v_size) == 0 + or not (bm % v_size) == 0 + ): + print(f"Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}") + continue + + name = f"{kern.name}_{kern.precision}_{bm}_{bn}_{bk}" + + additional_args = [ + "--output_funcname", + name, + "--output_filename", + os.path.join(BASEDIR, arch, name + ".h"), + "--output_overwrite", + ] + additional_args += [ + "--bm", + str(bm), + "--bn", + str(bn), + "--bk", + str(bk), + "--arch", + arch, + ] + additional_args += ["--prefetching", "BL2viaC"] try: - print(' '.join(arguments + additional_args)) - subprocess.check_output(arguments + additional_args, stderr=subprocess.STDOUT) + print(" ".join(arguments + additional_args)) + subprocess.check_output( + arguments + additional_args, stderr=subprocess.STDOUT + ) except subprocess.CalledProcessError as e: - raise RuntimeError(f"The command\n{' '.join(e.cmd)}\n returned with an error (code {e.returncode}):\n{e.output.decode('utf-8')}") + raise RuntimeError( + f"The command\n{' '.join(e.cmd)}\n returned with an error (code {e.returncode}):\n{e.output.decode('utf-8')}" + ) - f.write('#include "' + arch + '/' + name + '.h"\n') + f.write('#include "' + arch + "/" + name + '.h"\n') testcases += [ - setup_single_testcase.format( - m=kern.m, n=kern.n, k=kern.k, lda=kern.lda, ldb=kern.ldb, ldc=kern.ldc, alpha=kern.alpha, - beta=kern.beta, delta=kern.delta, name=name, - amtx=kern.amtx or '', bmtx = kern.bmtx or '', - asparse=1 if kern.lda == 0 else 0, bsparse=3 if kern.ldb == 0 else 2, - precision=kern.precision.ctype()) + setup_single_testcase.format( + m=kern.m, + n=kern.n, + k=kern.k, + lda=kern.lda, + ldb=kern.ldb, + ldc=kern.ldc, + alpha=kern.alpha, + beta=kern.beta, + delta=kern.delta, + name=name, + amtx=kern.amtx or "", + bmtx=kern.bmtx or "", + asparse=1 if kern.lda == 0 else 0, + bsparse=3 if kern.ldb == 0 else 2, + precision=kern.precision.ctype(), + ) ] - f.write('\n') + f.write("\n") f.write(function_definitions) f.write(setup_main.format(arch=arch)) for testcase in testcases: - 
f.write(testcase) + f.write(testcase) f.write(end_of_testsuite) diff --git a/tests/unit_test.py b/tests/unit_test.py old mode 100644 new mode 100755 index 5bf40c6..4247a1e --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -1,30 +1,40 @@ #!/usr/bin/env python3 -import testsuite_generator as generator +import random +import re +import sys from importlib import import_module -from pspamm.codegen.precision import * +import testsuite_generator as generator -import sys -import re -import random +from pypspamm.codegen.precision import * arch = sys.argv[1] -parsedarch = re.fullmatch(r'(?P[a-zA-Z_]+)(?P\d+)', arch) +parsedarch = re.fullmatch(r"(?P[a-zA-Z_]+)(?P\d+)", arch) -archname = parsedarch.group('name') -archprec = parsedarch.group('prec') +archname = parsedarch.group("name") +archprec = parsedarch.group("prec") -blocksize = import_module("pspamm.codegen.architectures." + archname + ".blocksize") +blocksize = import_module("pypspamm.codegen.architectures." + archname + ".blocksize") scripts = { - "arm": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxK, blocksize.Cube], + "arm": lambda blocksize: [ + blocksize.Old, + blocksize.Max, + blocksize.MaxK, + blocksize.Cube, + ], "arm_sve": lambda blocksize: [blocksize.Max, blocksize.MaxK, blocksize.Cube], - "knl": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxBn, blocksize.CubeBn], + "knl": lambda blocksize: [ + blocksize.Old, + blocksize.Max, + blocksize.MaxBn, + blocksize.CubeBn, + ], "hsw": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.Cube], "rvv": lambda blocksize: [blocksize.MaxBn, blocksize.CubeBn], - "lsx": lambda blocksize: [blocksize.Max] + "lsx": lambda blocksize: [blocksize.Max], } blocksize_algs = scripts[archname](blocksize) + [blocksize.Default] @@ -36,128 +46,1838 @@ # define the maximum allowed difference between elements of our solution and the reference solution for # double and single precision delta_hp = 1e-2 -delta_sp = 1e-4 # epsilon is around e-7 => /2 ... For most cases, 1e-6 is enough -delta_dp = 1e-6 # epsilon is around e-15 => /2 +delta_sp = 1e-4 # epsilon is around e-7 => /2 ... 
For most cases, 1e-6 is enough +delta_dp = 1e-6 # epsilon is around e-15 => /2 kernels = [] for precision, delta in zip((Precision.SINGLE, Precision.DOUBLE), (delta_sp, delta_dp)): v_size = v_size_fun(precision) - kernels.append(generator.TestKernel("test0m", precision, 8, 8, 8, 8, 8, 8, -1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - - kernels.append(generator.TestKernel("test0dv", precision, 8, 8, 8, 8, 8, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("test0bspv", precision, 8, 8, 8, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(8, 8, 8, 1, 1), delta)) - kernels.append(generator.TestKernel("test0aspv", precision, 8, 8, 8, 0, 8, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(8, 8, 8, v_size, 1), None, delta)) - kernels.append(generator.TestKernel("test0abspv", precision, 8, 8, 8, 0, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(8, 8, 8, v_size, 1), generator.generateMTX(8, 8, 8, 1, 1), delta)) - - kernels.append(generator.TestKernel("test1dv", precision, 64, 8, 56, 64, 56, 64, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("test1bspv", precision, 64, 8, 56, 64, 0, 64, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(56, 8, 20, 1, 1), delta)) - kernels.append(generator.TestKernel("test1aspv", precision, 64, 8, 56, 0, 56, 64, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(64, 56, 30, v_size, 1), None, delta)) - kernels.append(generator.TestKernel("test1abspv", precision, 64, 8, 56, 0, 0, 64, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs], generator.generateMTX(64, 56, 30, v_size, 1), generator.generateMTX(56, 8, 20, 1, 1), delta)) - - kernels.append(generator.TestKernel("testlarge", precision, 40, 100, 100, 100, 100, 100, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(10, 10, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("test1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(56, 56, 30), delta)) - kernels.append(generator.TestKernel("test2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("test3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 2, 1), delta)) - kernels.append(generator.TestKernel("knl_only_test2", precision, 24, 40, 40, 32, 0, 24, 1000, 1.0, [(8, 2,2), (16,7,2)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], 
None, generator.generateMTX(40, 40, 20), delta)) - - kernels.append(generator.TestKernel("knl_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 2, 2), delta)) - kernels.append(generator.TestKernel("knl_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [(8, 20,2), (24,3,2)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(10, 20, 1), delta)) - kernels.append(generator.TestKernel("knl_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [(32, 2,2), (8,14,2)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(10, 5, 1), delta)) - kernels.append(generator.TestKernel("knl_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1,2)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 1, 1), delta)) - kernels.append(generator.TestKernel("knl_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8, 24,2), (8,1,2)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(40, 24, 1), delta)) - - kernels.append(generator.TestKernel("knl_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1,2)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2,2), (16,7,2)] + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [(8, 28,2)], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8, 20,2), (8,3,2)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [(32, 2,2), (8,14,2)] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1,2)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("knl_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24,2)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - - kernels.append(generator.TestKernel("hswtest1", precision, 8, 56, 56, 8, 0, 8, 2.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(56, 56, 30), delta)) - kernels.append(generator.TestKernel("hswtest2", precision, 8, 40, 40, 8, 40, 8, 2.5, 1.0, [(8,2)] + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hswtest3", precision, 8, 56, 56, 8, 56, 8, 1.0, 5.0, [(8, 3)] + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test1", precision, 8, 2, 1, 8, 0, 8, 1.0, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 2, 1), delta)) - kernels.append(generator.TestKernel("hsw_only_test2", precision, 24, 40, 40, 
32, 0, 24, 1000, 1.0, [(8, 2)] + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(40, 40, 20), delta)) - - kernels.append(generator.TestKernel("hsw_only_test3", precision, 8, 2, 1, 8, 0, 16, -2.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 2, 2), delta)) - kernels.append(generator.TestKernel("hsw_only_test4", precision, 24, 20, 10, 40, 0, 24, 35.222, 0.0, [] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(10, 20, 1), delta)) - kernels.append(generator.TestKernel("hsw_only_test5", precision, 64, 5, 10, 64, 0, 64, 2.3, 0.0, [] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(10, 5, 1), delta)) - kernels.append(generator.TestKernel("hsw_only_test6", precision, 8, 1, 1, 16, 0, 56, 1.0, 0.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 1, 1), delta)) - kernels.append(generator.TestKernel("hsw_only_test7", precision, 8, 24, 40, 8, 0, 8, 1.0, 333333.2222222, [(8,1)] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(40, 24, 1), delta)) - - kernels.append(generator.TestKernel("hsw_only_test8", precision, 8, 2, 1, 8, 1, 8, 2.5, 0.0, [(8,1)] + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test9", precision, 32, 40, 40, 32, 60, 32, 2.0, -4.33, [(8,2)] + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test10", precision, 56, 28, 56, 56, 56, 56, 0.1, 3.0, [x.getBlocksize(56, 28, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test11", precision, 8, 20, 8, 40, 10, 8, 234234.123123, 0.0, [(8,3)] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test12", precision, 64, 5, 10, 64, 12, 64, 1.0, 1.0, [] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test13", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("hsw_only_test14", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs], None, None, delta)) - - kernels.append(generator.TestKernel("itest4", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4), (4,4,2), (4,4,4), (4,4,8)], None, None, delta)) - - kernels.append(generator.TestKernel("itest1", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(56, 56, 30), delta)) - kernels.append(generator.TestKernel("itest2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("itest3", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - - kernels.append(generator.TestKernel("arm_only_test1", precision, 2, 3, 4, 2, 0, 2, 
1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(4, 3, 5), delta)) - kernels.append(generator.TestKernel("arm_only_test2", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(4, 3, 5), delta)) - kernels.append(generator.TestKernel("arm_only_test3", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(50, 80, 294), delta)) - kernels.append(generator.TestKernel("arm_only_test4", precision, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(32, 32, 24), delta)) - kernels.append(generator.TestKernel("arm_only_test5", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 1, 1), delta)) - kernels.append(generator.TestKernel("arm_only_test6", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(2, 2, 1), delta)) - kernels.append(generator.TestKernel("arm_only_test7", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(7, 5, 35), delta)) - - kernels.append(generator.TestKernel("arm_only_test8", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test9", precision, 2, 3, 4, 20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test10", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test11", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test12", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test13", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - kernels.append(generator.TestKernel("arm_only_test14", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], None, None, delta)) - - kernels.append(generator.TestKernel("sve_mixed_test1", precision, 9, 9, 9, 9, 9, 9, 1.0, 0.0, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_mixed_test2", precision, 9, 9, 9, 9, 0, 9, 4.0, 2.5, [(3, 3)] + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(9, 9, 20), delta_dp)) - kernels.append(generator.TestKernel("sve_mixed_test3", precision, 18, 18, 18, 18, 0, 18, 3.4, -2.5, [(1, 1), (3, 3), (6, 6)] + 
[x.getBlocksize(18, 18, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(18, 18, 59), delta_dp)) - kernels.append(generator.TestKernel("sve_mixed_test4", precision, 80, 80, 80, 80, 0, 80, 0.0, -2.5, [(4, 4), (8, 8)] + [x.getBlocksize(80, 80, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(80, 80, 312), delta_dp)) - kernels.append(generator.TestKernel("sve_mixed_test5", precision, 8, 8, 8, 10, 0, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(8, 8, 6), delta_dp)) - kernels.append(generator.TestKernel("sve_mixed_test6", precision, 8, 8, 8, 10, 8, 8, 3.0, -0.9, [(2, 2), (4, 4)] + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - - kernels.append(generator.TestKernel("sve_test4", precision, 4, 4, 4, 4, 4, 4, 2.0, 2.0, [(4, 4)], None, None, delta_dp)) - - kernels.append(generator.TestKernel("sve_test1", precision, 8, 56, 56, 8, 0, 8, 1.0, 0.0, [(8, 4), (8,1)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(56, 56, 30), delta_dp)) - kernels.append(generator.TestKernel("sve_test2", precision, 8, 40, 40, 8, 40, 8, 3.0, 2.0, [(8, 5), (8,2)] + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_test3", precision, 8, 56, 56, 8, 56, 8, 0.0, 0.0, [(8, 3), (8, 5)] + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - - kernels.append(generator.TestKernel("sve_arm_only_test1", precision, 2, 3, 4, 2, 0, 2, 1.1233, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(4, 3, 5), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test2", precision, 2, 3, 4, 20, 0, 14, 1.0, 1.0, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(4, 3, 5), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test3", precision, 32, 80, 50, 32, 0, 32, 1.0, 3.0, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(50, 80, 294), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test4", precision, 32, 32, 32, 34, 0, 32, 1.0, 0.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(32, 32, 24), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test5", precision, 2, 1, 1, 2, 0, 8, 1.0, -1.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(1, 1, 1), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test6", precision, 2, 2, 2, 2, 0, 2, 2.0, 234234.123, [(2, 1)] + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(2, 2, 1), delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test7", precision, 16, 5, 7, 16, 0, 16, 0.0, -1.123, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(7, 5, 35), delta_dp)) - - kernels.append(generator.TestKernel("sve_arm_only_test8", precision, 2, 3, 4, 2, 4, 2, 1.0, 0.0, [(2, 1), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp)) - kernels.append(generator.TestKernel("sve_arm_only_test9", precision, 2, 3, 4, 
20, 12, 14, 2.0, 1.123, [(2, 2), (2,3)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp))
-    kernels.append(generator.TestKernel("sve_arm_only_test10", precision, 32, 80, 50, 32, 50, 32, 0.0, 0.2, [(8, 5)] + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp))
-    kernels.append(generator.TestKernel("sve_arm_only_test11", precision, 32, 32, 32, 33, 68, 32, 1231.0, 14443.0, [(4, 4), (4,3)] + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp))
-    kernels.append(generator.TestKernel("sve_arm_only_test12", precision, 2, 1, 1, 2, 1, 8, 1.0, 3.0, [(2, 1)] + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp))
-    kernels.append(generator.TestKernel("sve_arm_only_test13", precision, 2, 3, 3, 2, 3, 2, 1.0, 0.0, [(2, 1)] + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp))
-    kernels.append(generator.TestKernel("sve_arm_only_test14", precision, 16, 5, 7, 16, 7, 16, 1.0, 1.0, [(8, 1), (8,2)] + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp))
-
-    kernels.append(generator.TestKernel("sve_arm_only_test15", precision, 23, 29, 31, 23, 31, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs], None, None, delta_dp))
-    kernels.append(generator.TestKernel("sve_arm_only_test16", precision, 23, 29, 31, 23, 0, 23, 1.32, 0.96, [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(31, 29, 61), delta_dp))
-
-    kernels.append(generator.TestKernel("sve_single_prec_test_S1", precision, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], None, None, delta_sp))
-    kernels.append(generator.TestKernel("sve_single_prec_test_S2", precision, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs], None, None, delta_sp))
-    kernels.append(generator.TestKernel("sve_single_prec_test_S3", precision, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs], None, None, delta_sp))
-    kernels.append(generator.TestKernel("sve_single_prec_test_S4", precision, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs], None, None, delta_sp))
-    kernels.append(generator.TestKernel("sve_single_prec_test_S5", precision, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(9, 9, 8), delta_sp))
-    kernels.append(generator.TestKernel("sve_single_prec_test_S6", precision, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(15, 15, 22), delta_sp))
-    kernels.append(generator.TestKernel("sve_single_prec_test_S7", precision, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(23, 23, 52), delta_sp))
-    kernels.append(generator.TestKernel("sve_single_prec_test_S8", precision, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs], None, generator.generateMTX(13, 31, 40), delta_sp))
+    kernels.append(
+        generator.TestKernel(
+            "test0m",
+            precision,
+            8,
+            8,
+            8,
+            8,
+            8,
+            8,
+            -1.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "test0dv",
+            precision,
+            8,
+            8,
+            8,
+            8,
+            8,
+            8,
+            2.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "test0bspv",
+            precision,
+            8,
+            8,
+            8,
+            8,
+            0,
+            8,
+            2.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(8, 8, 8, 1, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "test0aspv",
+            precision,
+            8,
+            8,
+            8,
+            0,
+            8,
+            8,
+            2.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs],
+            generator.generateMTX(8, 8, 8, v_size, 1),
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "test0abspv",
+            precision,
+            8,
+            8,
+            8,
+            0,
+            0,
+            8,
+            2.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs],
+            generator.generateMTX(8, 8, 8, v_size, 1),
+            generator.generateMTX(8, 8, 8, 1, 1),
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "test1dv",
+            precision,
+            64,
+            8,
+            56,
+            64,
+            56,
+            64,
+            2.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "test1bspv",
+            precision,
+            64,
+            8,
+            56,
+            64,
+            0,
+            64,
+            2.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(56, 8, 20, 1, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "test1aspv",
+            precision,
+            64,
+            8,
+            56,
+            0,
+            56,
+            64,
+            2.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs],
+            generator.generateMTX(64, 56, 30, v_size, 1),
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "test1abspv",
+            precision,
+            64,
+            8,
+            56,
+            0,
+            0,
+            64,
+            2.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(64, 8, 1, v_size, precision) for x in blocksize_algs],
+            generator.generateMTX(64, 56, 30, v_size, 1),
+            generator.generateMTX(56, 8, 20, 1, 1),
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "testlarge",
+            precision,
+            40,
+            100,
+            100,
+            100,
+            100,
+            100,
+            2.5,
+            1.0,
+            [(8, 5), (8, 2)]
+            + [x.getBlocksize(10, 10, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "test1",
+            precision,
+            8,
+            56,
+            56,
+            8,
+            0,
+            8,
+            2.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(56, 56, 30),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "test2",
+            precision,
+            8,
+            40,
+            40,
+            8,
+            40,
+            8,
+            2.5,
+            1.0,
+            [(8, 5), (8, 2)]
+            + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "test3",
+            precision,
+            8,
+            56,
+            56,
+            8,
+            56,
+            8,
+            1.0,
+            5.0,
+            [(8, 3), (8, 5)]
+            + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test1",
+            precision,
+            8,
+            2,
+            1,
+            8,
+            0,
+            8,
+            1.0,
+            0.0,
+            [(8, 1, 2)]
+            + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(1, 2, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test2",
+            precision,
+            24,
+            40,
+            40,
+            32,
+            0,
+            24,
+            1000,
+            1.0,
+            [(8, 2, 2), (16, 7, 2)]
+            + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(40, 40, 20),
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test3",
+            precision,
+            8,
+            2,
+            1,
+            8,
+            0,
+            16,
+            -2.0,
+            0.0,
+            [(8, 1, 2)]
+            + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(1, 2, 2),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test4",
+            precision,
+            24,
+            20,
+            10,
+            40,
+            0,
+            24,
+            35.222,
+            0.0,
+            [(8, 20, 2), (24, 3, 2)]
+            + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(10, 20, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test5",
+            precision,
+            64,
+            5,
+            10,
+            64,
+            0,
+            64,
+            2.3,
+            0.0,
+            [(32, 2, 2), (8, 14, 2)]
+            + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(10, 5, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test6",
+            precision,
+            8,
+            1,
+            1,
+            16,
+            0,
+            56,
+            1.0,
+            0.0,
+            [(8, 1, 2)]
+            + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(1, 1, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test7",
+            precision,
+            8,
+            24,
+            40,
+            8,
+            0,
+            8,
+            1.0,
+            333333.2222222,
+            [(8, 24, 2), (8, 1, 2)]
+            + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(40, 24, 1),
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test8",
+            precision,
+            8,
+            2,
+            1,
+            8,
+            1,
+            8,
+            2.5,
+            0.0,
+            [(8, 1, 2)]
+            + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test9",
+            precision,
+            32,
+            40,
+            40,
+            32,
+            60,
+            32,
+            2.0,
+            -4.33,
+            [(8, 2, 2), (16, 7, 2)]
+            + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test10",
+            precision,
+            56,
+            28,
+            56,
+            56,
+            56,
+            56,
+            0.1,
+            3.0,
+            [(8, 28, 2)],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test11",
+            precision,
+            8,
+            20,
+            8,
+            40,
+            10,
+            8,
+            234234.123123,
+            0.0,
+            [(8, 20, 2), (8, 3, 2)]
+            + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test12",
+            precision,
+            64,
+            5,
+            10,
+            64,
+            12,
+            64,
+            1.0,
+            1.0,
+            [(32, 2, 2), (8, 14, 2)]
+            + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test13",
+            precision,
+            8,
+            1,
+            1,
+            16,
+            1,
+            56,
+            0.0,
+            123.0,
+            [(8, 1, 2)]
+            + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "knl_only_test14",
+            precision,
+            8,
+            24,
+            40,
+            8,
+            41,
+            8,
+            2.0,
+            1.0,
+            [(8, 24, 2)]
+            + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "hswtest1",
+            precision,
+            8,
+            56,
+            56,
+            8,
+            0,
+            8,
+            2.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(56, 56, 30),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hswtest2",
+            precision,
+            8,
+            40,
+            40,
+            8,
+            40,
+            8,
+            2.5,
+            1.0,
+            [(8, 2)]
+            + [x.getBlocksize(8, 40, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hswtest3",
+            precision,
+            8,
+            56,
+            56,
+            8,
+            56,
+            8,
+            1.0,
+            5.0,
+            [(8, 3)]
+            + [x.getBlocksize(8, 56, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test1",
+            precision,
+            8,
+            2,
+            1,
+            8,
+            0,
+            8,
+            1.0,
+            0.0,
+            [(8, 1)]
+            + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(1, 2, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test2",
+            precision,
+            24,
+            40,
+            40,
+            32,
+            0,
+            24,
+            1000,
+            1.0,
+            [(8, 2)]
+            + [x.getBlocksize(24, 40, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(40, 40, 20),
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test3",
+            precision,
+            8,
+            2,
+            1,
+            8,
+            0,
+            16,
+            -2.0,
+            0.0,
+            [(8, 1)]
+            + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(1, 2, 2),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test4",
+            precision,
+            24,
+            20,
+            10,
+            40,
+            0,
+            24,
+            35.222,
+            0.0,
+            [] + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(10, 20, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test5",
+            precision,
+            64,
+            5,
+            10,
+            64,
+            0,
+            64,
+            2.3,
+            0.0,
+            [] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(10, 5, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test6",
+            precision,
+            8,
+            1,
+            1,
+            16,
+            0,
+            56,
+            1.0,
+            0.0,
+            [(8, 1)]
+            + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(1, 1, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test7",
+            precision,
+            8,
+            24,
+            40,
+            8,
+            0,
+            8,
+            1.0,
+            333333.2222222,
+            [(8, 1)]
+            + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(40, 24, 1),
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test8",
+            precision,
+            8,
+            2,
+            1,
+            8,
+            1,
+            8,
+            2.5,
+            0.0,
+            [(8, 1)]
+            + [x.getBlocksize(8, 2, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test9",
+            precision,
+            32,
+            40,
+            40,
+            32,
+            60,
+            32,
+            2.0,
+            -4.33,
+            [(8, 2)]
+            + [x.getBlocksize(32, 40, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test10",
+            precision,
+            56,
+            28,
+            56,
+            56,
+            56,
+            56,
+            0.1,
+            3.0,
+            [x.getBlocksize(56, 28, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test11",
+            precision,
+            8,
+            20,
+            8,
+            40,
+            10,
+            8,
+            234234.123123,
+            0.0,
+            [(8, 3)]
+            + [x.getBlocksize(8, 20, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test12",
+            precision,
+            64,
+            5,
+            10,
+            64,
+            12,
+            64,
+            1.0,
+            1.0,
+            [] + [x.getBlocksize(64, 5, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test13",
+            precision,
+            8,
+            1,
+            1,
+            16,
+            1,
+            56,
+            0.0,
+            123.0,
+            [(8, 1)]
+            + [x.getBlocksize(8, 1, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "hsw_only_test14",
+            precision,
+            8,
+            24,
+            40,
+            8,
+            41,
+            8,
+            2.0,
+            1.0,
+            [] + [x.getBlocksize(8, 24, 2, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "itest4",
+            precision,
+            4,
+            4,
+            4,
+            4,
+            4,
+            4,
+            2.0,
+            2.0,
+            [(4, 4), (4, 4, 2), (4, 4, 4), (4, 4, 8)],
+            None,
+            None,
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "itest1",
+            precision,
+            8,
+            56,
+            56,
+            8,
+            0,
+            8,
+            1.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(56, 56, 30),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "itest2",
+            precision,
+            8,
+            40,
+            40,
+            8,
+            40,
+            8,
+            3.0,
+            2.0,
+            [(8, 5), (8, 2)]
+            + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "itest3",
+            precision,
+            8,
+            56,
+            56,
+            8,
+            56,
+            8,
+            0.0,
+            0.0,
+            [(8, 3), (8, 5)]
+            + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test1",
+            precision,
+            2,
+            3,
+            4,
+            2,
+            0,
+            2,
+            1.1233,
+            0.0,
+            [(2, 1), (2, 3)]
+            + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(4, 3, 5),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test2",
+            precision,
+            2,
+            3,
+            4,
+            20,
+            0,
+            14,
+            1.0,
+            1.0,
+            [(2, 2), (2, 3)]
+            + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(4, 3, 5),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test3",
+            precision,
+            32,
+            80,
+            50,
+            32,
+            0,
+            32,
+            1.0,
+            3.0,
+            [(8, 5)]
+            + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(50, 80, 294),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test4",
+            precision,
+            32,
+            32,
+            32,
+            34,
+            0,
+            32,
+            1.0,
+            0.0,
+            [(4, 4), (4, 3)]
+            + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(32, 32, 24),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test5",
+            precision,
+            2,
+            1,
+            1,
+            2,
+            0,
+            8,
+            1.0,
+            -1.0,
+            [(2, 1)]
+            + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(1, 1, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test6",
+            precision,
+            2,
+            2,
+            2,
+            2,
+            0,
+            2,
+            2.0,
+            234234.123,
+            [(2, 1)]
+            + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(2, 2, 1),
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test7",
+            precision,
+            16,
+            5,
+            7,
+            16,
+            0,
+            16,
+            0.0,
+            -1.123,
+            [(8, 1), (8, 2)]
+            + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(7, 5, 35),
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test8",
+            precision,
+            2,
+            3,
+            4,
+            2,
+            4,
+            2,
+            1.0,
+            0.0,
+            [(2, 1), (2, 3)]
+            + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test9",
+            precision,
+            2,
+            3,
+            4,
+            20,
+            12,
+            14,
+            2.0,
+            1.123,
+            [(2, 2), (2, 3)]
+            + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test10",
+            precision,
+            32,
+            80,
+            50,
+            32,
+            50,
+            32,
+            0.0,
+            0.2,
+            [(8, 5)]
+            + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test11",
+            precision,
+            32,
+            32,
+            32,
+            33,
+            68,
+            32,
+            1231.0,
+            14443.0,
+            [(4, 4), (4, 3)]
+            + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test12",
+            precision,
+            2,
+            1,
+            1,
+            2,
+            1,
+            8,
+            1.0,
+            3.0,
+            [(2, 1)]
+            + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test13",
+            precision,
+            2,
+            3,
+            3,
+            2,
+            3,
+            2,
+            1.0,
+            0.0,
+            [(2, 1)]
+            + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "arm_only_test14",
+            precision,
+            16,
+            5,
+            7,
+            16,
+            7,
+            16,
+            1.0,
+            1.0,
+            [(8, 1), (8, 2)]
+            + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "sve_mixed_test1",
+            precision,
+            9,
+            9,
+            9,
+            9,
+            9,
+            9,
+            1.0,
+            0.0,
+            [(3, 3)]
+            + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_mixed_test2",
+            precision,
+            9,
+            9,
+            9,
+            9,
+            0,
+            9,
+            4.0,
+            2.5,
+            [(3, 3)]
+            + [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(9, 9, 20),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_mixed_test3",
+            precision,
+            18,
+            18,
+            18,
+            18,
+            0,
+            18,
+            3.4,
+            -2.5,
+            [(1, 1), (3, 3), (6, 6)]
+            + [x.getBlocksize(18, 18, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(18, 18, 59),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_mixed_test4",
+            precision,
+            80,
+            80,
+            80,
+            80,
+            0,
+            80,
+            0.0,
+            -2.5,
+            [(4, 4), (8, 8)]
+            + [x.getBlocksize(80, 80, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(80, 80, 312),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_mixed_test5",
+            precision,
+            8,
+            8,
+            8,
+            10,
+            0,
+            8,
+            3.0,
+            -0.9,
+            [(2, 2), (4, 4)]
+            + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(8, 8, 6),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_mixed_test6",
+            precision,
+            8,
+            8,
+            8,
+            10,
+            8,
+            8,
+            3.0,
+            -0.9,
+            [(2, 2), (4, 4)]
+            + [x.getBlocksize(8, 8, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "sve_test4",
+            precision,
+            4,
+            4,
+            4,
+            4,
+            4,
+            4,
+            2.0,
+            2.0,
+            [(4, 4)],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "sve_test1",
+            precision,
+            8,
+            56,
+            56,
+            8,
+            0,
+            8,
+            1.0,
+            0.0,
+            [(8, 4), (8, 1)]
+            + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(56, 56, 30),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_test2",
+            precision,
+            8,
+            40,
+            40,
+            8,
+            40,
+            8,
+            3.0,
+            2.0,
+            [(8, 5), (8, 2)]
+            + [x.getBlocksize(8, 40, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_test3",
+            precision,
+            8,
+            56,
+            56,
+            8,
+            56,
+            8,
+            0.0,
+            0.0,
+            [(8, 3), (8, 5)]
+            + [x.getBlocksize(8, 56, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test1",
+            precision,
+            2,
+            3,
+            4,
+            2,
+            0,
+            2,
+            1.1233,
+            0.0,
+            [(2, 1), (2, 3)]
+            + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(4, 3, 5),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test2",
+            precision,
+            2,
+            3,
+            4,
+            20,
+            0,
+            14,
+            1.0,
+            1.0,
+            [(2, 2), (2, 3)]
+            + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(4, 3, 5),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test3",
+            precision,
+            32,
+            80,
+            50,
+            32,
+            0,
+            32,
+            1.0,
+            3.0,
+            [(8, 5)]
+            + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(50, 80, 294),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test4",
+            precision,
+            32,
+            32,
+            32,
+            34,
+            0,
+            32,
+            1.0,
+            0.0,
+            [(4, 4), (4, 3)]
+            + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(32, 32, 24),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test5",
+            precision,
+            2,
+            1,
+            1,
+            2,
+            0,
+            8,
+            1.0,
+            -1.0,
+            [(2, 1)]
+            + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(1, 1, 1),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test6",
+            precision,
+            2,
+            2,
+            2,
+            2,
+            0,
+            2,
+            2.0,
+            234234.123,
+            [(2, 1)]
+            + [x.getBlocksize(2, 2, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(2, 2, 1),
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test7",
+            precision,
+            16,
+            5,
+            7,
+            16,
+            0,
+            16,
+            0.0,
+            -1.123,
+            [(8, 1), (8, 2)]
+            + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(7, 5, 35),
+            delta_dp,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test8",
+            precision,
+            2,
+            3,
+            4,
+            2,
+            4,
+            2,
+            1.0,
+            0.0,
+            [(2, 1), (2, 3)]
+            + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test9",
+            precision,
+            2,
+            3,
+            4,
+            20,
+            12,
+            14,
+            2.0,
+            1.123,
+            [(2, 2), (2, 3)]
+            + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test10",
+            precision,
+            32,
+            80,
+            50,
+            32,
+            50,
+            32,
+            0.0,
+            0.2,
+            [(8, 5)]
+            + [x.getBlocksize(32, 80, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test11",
+            precision,
+            32,
+            32,
+            32,
+            33,
+            68,
+            32,
+            1231.0,
+            14443.0,
+            [(4, 4), (4, 3)]
+            + [x.getBlocksize(32, 32, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test12",
+            precision,
+            2,
+            1,
+            1,
+            2,
+            1,
+            8,
+            1.0,
+            3.0,
+            [(2, 1)]
+            + [x.getBlocksize(2, 1, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test13",
+            precision,
+            2,
+            3,
+            3,
+            2,
+            3,
+            2,
+            1.0,
+            0.0,
+            [(2, 1)]
+            + [x.getBlocksize(2, 3, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test14",
+            precision,
+            16,
+            5,
+            7,
+            16,
+            7,
+            16,
+            1.0,
+            1.0,
+            [(8, 1), (8, 2)]
+            + [x.getBlocksize(16, 5, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test15",
+            precision,
+            23,
+            29,
+            31,
+            23,
+            31,
+            23,
+            1.32,
+            0.96,
+            [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_dp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_arm_only_test16",
+            precision,
+            23,
+            29,
+            31,
+            23,
+            0,
+            23,
+            1.32,
+            0.96,
+            [x.getBlocksize(23, 29, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(31, 29, 61),
+            delta_dp,
+        )
+    )
+
+    kernels.append(
+        generator.TestKernel(
+            "sve_single_prec_test_S1",
+            precision,
+            9,
+            9,
+            9,
+            9,
+            9,
+            9,
+            1.24,
+            0.87,
+            [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_sp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_single_prec_test_S2",
+            precision,
+            15,
+            15,
+            15,
+            15,
+            15,
+            15,
+            -3.14,
+            6.28,
+            [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_sp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_single_prec_test_S3",
+            precision,
+            23,
+            23,
+            23,
+            23,
+            23,
+            23,
+            1.5,
+            -0.66,
+            [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_sp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_single_prec_test_S4",
+            precision,
+            23,
+            31,
+            13,
+            23,
+            13,
+            23,
+            2.0,
+            0.0,
+            [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            None,
+            delta_sp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_single_prec_test_S5",
+            precision,
+            9,
+            9,
+            9,
+            9,
+            0,
+            9,
+            1.24,
+            0.87,
+            [x.getBlocksize(9, 9, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(9, 9, 8),
+            delta_sp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_single_prec_test_S6",
+            precision,
+            15,
+            15,
+            15,
+            15,
+            0,
+            15,
+            -3.14,
+            6.28,
+            [x.getBlocksize(15, 15, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(15, 15, 22),
+            delta_sp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_single_prec_test_S7",
+            precision,
+            23,
+            23,
+            23,
+            23,
+            0,
+            23,
+            1.5,
+            -0.66,
+            [x.getBlocksize(23, 23, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(23, 23, 52),
+            delta_sp,
+        )
+    )
+    kernels.append(
+        generator.TestKernel(
+            "sve_single_prec_test_S8",
+            precision,
+            23,
+            31,
+            13,
+            23,
+            0,
+            23,
+            2.0,
+            0.0,
+            [x.getBlocksize(23, 31, 1, v_size, precision) for x in blocksize_algs],
+            None,
+            generator.generateMTX(13, 31, 40),
+            delta_sp,
+        )
+    )
     generator.make(kernels, arch)
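
The reformatted registrations above are mechanical black output; each generator.TestKernel(...) call keeps its original positional arguments. Below is a minimal, self-contained sketch of that argument order as inferred from the calls in this patch. The field names (lda, ldb, ldc, mtx_a, mtx_b) and the sample precision/delta values are illustrative assumptions, not the actual TestKernel signature; a leading dimension of 0 appears to mark the operand that receives a sparse .mtx pattern from generator.generateMTX(...).

# Sketch only: positional field order inferred from the kernels.append(...)
# calls in this diff. All names below are assumptions for illustration.
from collections import namedtuple

TestKernelSketch = namedtuple(
    "TestKernelSketch",
    "name precision m n k lda ldb ldc alpha beta block_sizes mtx_a mtx_b delta",
)

# Mirrors the "test0bspv" registration: C = 2.0 * A * B with a sparse B
# pattern (ldb == 0), two hand-picked (bm, bn) block sizes, tolerance delta.
example = TestKernelSketch(
    name="test0bspv",
    precision="d",  # placeholder; the script passes its own precision object
    m=8, n=8, k=8,
    lda=8, ldb=0, ldc=8,
    alpha=2.0, beta=0.0,
    block_sizes=[(8, 4), (8, 1)],
    mtx_a=None,
    mtx_b="8x8 sparsity pattern",  # stands in for generator.generateMTX(8, 8, 8, 1, 1)
    delta=1e-7,  # placeholder tolerance
)
print(example.name, example.m, example.n, example.k, example.ldb)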