From 0ad3c54b10b661d54728714178dcd4d2e7d9d72d Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sat, 17 May 2025 12:24:33 +0200 Subject: [PATCH] Add LSX --- .github/workflows/codegen.yml | 6 +- README.md | 3 +- pspamm/codegen/architectures/lsx/blocksize.py | 29 +++ pspamm/codegen/architectures/lsx/generator.py | 236 ++++++++++++++++++ .../architectures/lsx/inlineprinter.py | 218 ++++++++++++++++ pspamm/codegen/architectures/lsx/operands.py | 77 ++++++ pspamm/matmul.py | 18 ++ tests/runlocal.sh | 7 + tests/testsuite_generator.py | 4 + tests/unit_test.py | 3 +- 10 files changed, 597 insertions(+), 4 deletions(-) create mode 100644 pspamm/codegen/architectures/lsx/blocksize.py create mode 100644 pspamm/codegen/architectures/lsx/generator.py create mode 100644 pspamm/codegen/architectures/lsx/inlineprinter.py create mode 100644 pspamm/codegen/architectures/lsx/operands.py diff --git a/.github/workflows/codegen.yml b/.github/workflows/codegen.yml index ad0c8fb..61b2211 100644 --- a/.github/workflows/codegen.yml +++ b/.github/workflows/codegen.yml @@ -1,6 +1,6 @@ name: codegen on: - - pull_request + - push jobs: install-pspamm: @@ -65,12 +65,14 @@ jobs: - rvv256 - rvv512 - rvv1024 + - lsx128 + - lsx256 steps: - name: apt-get run: | set -euo pipefail sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu g++-riscv64-linux-gnu qemu-user-static + sudo apt-get install g++-aarch64-linux-gnu g++-riscv64-linux-gnu g++-14-loongarch64-linux-gnu qemu-user-static - name: setup-python uses: actions/setup-python@v4 diff --git a/README.md b/README.md index 17fa685..198b3b4 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Currently supported: * x86_64: AVX2, AVX512/AVX10.1 * ARM/AARCH64: NEON, SVE (128,256,512,1024,2048 bit) * RISC-V: V (128,256,512,1024,2048,4096,8192 bit) +* LoongArch: LSX, LASX ## Installation @@ -25,7 +26,7 @@ pip install git+https://github.com/SeisSol/PSpaMM.git ```bash pspamm-generator M N K LDA LDB LDC ALPHA BETA \ - --arch 
{arm,arm_sve{128..2048},knl{128..512},hsw{128..256},rvv{128..8192}} \
+        --arch {arm,arm_sve{128..2048},knl{128..512},hsw{128..256},rvv{128..8192},lsx{128..256}} \
         --amtx_filename MTX_FILE_PATH --bmtx_filename MTX_FILE_PATH \
         --output_funcname FUNCTION_NAME --output_filename OUTPUT_NAME
 
diff --git a/pspamm/codegen/architectures/lsx/blocksize.py b/pspamm/codegen/architectures/lsx/blocksize.py
new file mode 100644
index 0000000..db4fa3c
--- /dev/null
+++ b/pspamm/codegen/architectures/lsx/blocksize.py
@@ -0,0 +1,29 @@
+class Max:
+    @classmethod
+    def getBlocksize(cls, m, n, bk, v_size, prec):
+        bm = v_size
+        bn = 1
+        maxval = 0
+        # try every bm (multiples of v_size up to m) with every bn up to n
+        for i in range(v_size, m+1, v_size):
+            for j in range(1, n+1):
+                # Feasibility is decided by the register budget alone (see
+                # LSX_condition); among the feasible (i, j) blocks keep the
+                # first one with the largest area i*j.
+                if i*j > maxval and cls.LSX_condition(i, j, bk, v_size):
+                    maxval = i*j
+                    bm = i
+                    bn = j
+        # grow bk for as long as the chosen (bm, bn) block still fits
+        while cls.LSX_condition(bm, bn, bk+1, v_size):
+            bk += 1
+
+        return (bm, bn, bk)
+
+    @classmethod
+    def LSX_condition(cls, bm, bn, bk, v_size):
+        # ceiling division
+        vm = -(bm // -v_size)
+        return (bn + bk) * vm + bn * bk <= 32
+
+Default = Max
diff --git a/pspamm/codegen/architectures/lsx/generator.py b/pspamm/codegen/architectures/lsx/generator.py
new file mode 100644
index 0000000..836277d
--- /dev/null
+++ b/pspamm/codegen/architectures/lsx/generator.py
@@ -0,0 +1,236 @@
+from pspamm.cursors import *
+
+from pspamm.codegen.architectures.lsx.operands import *
+from pspamm.codegen.ast import *
+from pspamm.codegen.sugar import *
+from pspamm.codegen.generator import *
+from pspamm.codegen.precision import *
+from pspamm.codegen.regcache import *
+
+class Generator(AbstractGenerator):
+    template = """
+void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, {real_type} alpha, {real_type} beta, {real_type} const* prefetch) {{
+  __asm__ __volatile__(
+{body_text}
+    : : {args} : {clobbered});
+
+    #ifndef NDEBUG
+
#ifdef _OPENMP + #pragma omp atomic + #endif + pspamm_num_total_flops += {flop}; + #endif +}} +""" + v_len = 2 + + def get_v_size(self): + return (16 // self.precision.size()) * self.v_len + + def get_template(self): + return Generator.template + + def use_broadcast(self): + return True + + def has_masks(self): + return False + + def init_mask(self, m, bm, v_size, tempreg, maskregs): + return block("") + + def make_argument_load(self, starting_regs, prefetch): + asm = block("Load arguments") + asm.add(ld(InputOperand(f'0', 'm', 'A'), starting_regs[0], False)) + asm.add(ld(InputOperand(f'1', 'm', 'B'), starting_regs[1], False)) + asm.add(ld(InputOperand(f'2', 'm', 'C'), starting_regs[2], False)) + asm.add(ld(InputOperand(f'3', 'm', 'alpha'), starting_regs[3], False)) + asm.add(ld(InputOperand(f'4', 'm', 'beta'), starting_regs[4], False)) + if prefetch: + asm.add(ld(InputOperand(f'5', 'm', 'prefetch'), starting_regs[5], False)) + return asm + + def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int, prefetch: str): + assert(bm % v_size == 0) + vm = self.ceil_div(bm, v_size) + + assert (bn + bk) * vm + bn * bk <= 32 + + vmm = { + 1: vr, + 2: xr + }[self.v_len] + + A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)]) + Aoffset = vm*bk + + B_regs = Matrix([[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)]) + C_regs = Matrix([[vmm(32 - vm*bn + vm*c + r) for c in range(bn)] + for r in range(vm)]) + + b_reg = Aoffset + alpha_reg = [vmm(b_reg)] * 2 + beta_reg = [vmm(b_reg + 1)] * 2 + + starting_regs = [r(10), r(11), r(12), r(13), r(14), r(6), r(5)] + + additional_regs = [r(15), r(16), r(17), r(31), r(7)] + + loop_regs = [r(28), r(29), r(30)] + + prefetch_reg = prefetch == 'BL2viaC' + + return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [], prefetch_reg + + def make_scaling_offsets(self, + additional_regs: List[Register], + nnz: int + ) -> Block: + return 
block("") + + def init_block(self, size): + return block("") + + def move_register_block(self, + cursor: Cursor, + cursor_ptr: CursorLocation, + block_offset: Coords, + registers: Matrix[Register], + v_size: int, + additional_regs, + mask: Matrix[bool] = None, + store: bool = False, + prefetching: str = None, + load_offset: int = 0, + pf_cursor: Cursor = None, + pf_cursor_ptr: CursorLocation = None, + temp = None + ) -> Block: + + rows, cols = registers.shape + action = "Store" if store else "Load" + asm = block(f"{action} {cursor.name} register block @ {block_offset}") + + max_offs = 2047 + cur11 = 0 + + for ic in range(cols): + for ir in range(rows): + if (mask is None) or (mask[ir,ic]): + all_coords = [Coords(down=ir*v_size+i,right=ic) for i in range(v_size)] + has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords] + if all(has_nonzero): + cell_offset = all_coords[0] + addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) + addr.disp += self.precision.size() * load_offset + needsmove = False + if addr.disp > max_offs: + moved = addr.disp - cur11 + if moved > 0 and moved <= max_offs: + addr.disp = moved + else: + asm.add(add(addr.disp, additional_regs[0], "", addr.base)) + cur11 = addr.disp + addr.disp = 0 + needsmove = True + + addr.base = additional_regs[0] + if store: + asm.add(st(registers[ir,ic], addr, True, comment)) + if prefetching == 'BL2viaC' and pf_cursor is not None: + addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset) + addr.disp += self.precision.size() * load_offset + if addr.disp > max_offs: + moved = addr.disp - cur11 + if needsmove: + asm.add(add(addr.disp, additional_regs[3], "", addr.base)) + addr.disp = 0 + else: + addr.disp = moved + addr.base = additional_regs[3] + asm.add(prefetch(addr, closeness="L2")) + else: + asm.add(ld(addr, registers[ir,ic], True, comment)) + elif any(has_nonzero): + raise NotImplementedError("Element-wise sparsity in A is not yet fully 
implemented.") + return asm + + def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block: + + rows, cols = registers.shape + asm = block("zero registers") + + for ic in range(cols): + for ir in range(rows): + asm.add(mov(0, registers[ir,ic], True)) + + return asm + + + def make_microkernel(self, + A: Cursor, + B: Cursor, + A_ptr: CursorLocation, + B_ptr: CursorLocation, + A_regs: Matrix[Register], + B_regs, + C_regs: Matrix[Register], + v_size:int, + additional_regs, + to_A_block: Coords = Coords(), + to_B_block: Coords = Coords(), + sub: bool = False + ) -> Block: + + """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation. + It is responsible for loading and unloading the A block, + It does not assume that the A or B cursors point to the start of the block. + Instead, the coordinates to the start of the block are passed separately. + It does not modify any cursor pointers. + """ + asm = block("Block GEMM microkernel") + bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block) + bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block) + assert(bm % v_size == 0) + + mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size) + asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False, temp=B_regs[0,0])) + + Vm = self.ceil_div(bm, v_size) + cur11 = 0 + max_offs = 2047 + + bs = [] + for Vmi in range(Vm): + for bni in range(bn): # inside this n-block + for bki in range(bk): # inside this k-block + to_bcell = Coords(down=bki, right=bni) + to_acell = Coords(down=Vmi*v_size, right=bki) + if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell): + B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell) + if B_regs[bki, bni] not in bs: + # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value + if B_cell_addr.disp > max_offs: + moved = B_cell_addr.disp 
- cur11
+                            if moved > 0 and moved <= max_offs:
+                                B_cell_addr.disp = moved
+                            else:
+                                asm.add(add(B_cell_addr.disp, additional_regs[0], "", B_cell_addr.base))
+                                cur11 = B_cell_addr.disp
+                                B_cell_addr.disp = 0
+
+                            B_cell_addr.base = additional_regs[0]
+
+                        asm.add(bcst(B_cell_addr, B_regs[bki, bni], B_comment))
+                        bs.append(B_regs[bki, bni])
+
+        for bki in range(bk): # inside this k-block
+            for Vmi in range(Vm):
+                for bni in range(bn): # inside this n-block
+                    to_bcell = Coords(down=bki, right=bni)
+                    to_acell = Coords(down=Vmi*v_size, right=bki)
+                    if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell):
+                        _, B_comment = B.look(B_ptr, to_B_block, to_bcell)
+                        comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}"
+                        asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=None, sub=sub))
+        return asm
diff --git a/pspamm/codegen/architectures/lsx/inlineprinter.py b/pspamm/codegen/architectures/lsx/inlineprinter.py
new file mode 100644
index 0000000..fe13715
--- /dev/null
+++ b/pspamm/codegen/architectures/lsx/inlineprinter.py
@@ -0,0 +1,218 @@
+from typing import List
+from pspamm.codegen.ast import *
+from pspamm.codegen.visitor import Visitor
+from pspamm.codegen.operands import *
+from pspamm.codegen.precision import *
+
+
+class InlinePrinter(Visitor):
+
+    show_comments = False
+    indent = " "
+    depth = 0
+    lmargin = 0
+    rmargin = 70
+    vpadding = False
+    output = None
+    stack = None
+
+
+    def __init__(self, precision: Precision):
+        self.output = []
+        self.stack = []
+        assert precision in (Precision.SINGLE, Precision.DOUBLE)
+        self.precision = precision
+        self.psuffix = {
+            Precision.DOUBLE: "d",
+            Precision.SINGLE: "s"
+        }[precision]
+        self.bpsuffix = {
+            Precision.DOUBLE: "d",
+            Precision.SINGLE: "w"
+        }[precision]
+
+    def show(self):
+        print("\n".join(self.output))
+
+    def addLine(self, stmt: str, comment: str):
+
+        line = " "*self.lmargin + self.indent*self.depth
+
+        if stmt is not None and comment is not None and self.show_comments:
+            stmt = '"' + stmt + '\\r\\n"'
+            line += stmt.ljust(self.rmargin) + "// " + comment
+
+        elif stmt is not None:
+            line += '"' + stmt + '\\r\\n"'
+
+        elif stmt is None and comment is not None:
+            line += "// " + comment
+
+        self.output.append(line)
+
+    def prefix(self, register):
+        return {
+            16: "v",
+            32: "xv"
+        }[register.size()]
+
+    def iname(self, root, refreg, bp):
+        prefix = self.prefix(refreg)
+        suffix = self.bpsuffix if bp else self.psuffix
+        return f"{prefix}{root}.{suffix}"
+
+    def to_addi(self, value):
+        ADDILENGTH = 12
+        ADDIBLOCK = (1 << ADDILENGTH) - 1
+        ADDISBLOCK = (1 << (ADDILENGTH - 1)) - 1
+        # split value so that (luipart << ADDILENGTH) + addipart == value
+        addipart = value & ADDIBLOCK
+        luipart = value >> ADDILENGTH
+        # addipart is emitted as a signed 12-bit immediate: 2047 still fits, only larger values wrap
+        if addipart > ADDISBLOCK:
+            addipart = addipart - (1 << ADDILENGTH)
+            luipart += 1
+        return addipart, luipart
+
+    def visitFma(self, stmt: FmaStmt):
+        b = stmt.bcast_src.ugly
+        m = stmt.mult_src.ugly
+        a = stmt.add_dest.ugly
+
+        # nmsub is used for c' = -a*b + c
+        op = "fnmsub" if stmt.sub else "fmadd"
+
+        # no broadcasting supported inside the instruction (unlike AVX-512)
+        s = f"{self.iname(op, stmt.add_dest, False)} {a}, {m}, {b}, {a}"
+        self.addLine(s, stmt.comment)
+
+    def visitMul(self, stmt: MulStmt):
+        b = stmt.src.ugly
+        m = stmt.mult_src.ugly
+        a = stmt.dest.ugly
+        s = f"{self.iname('fmul', stmt.dest, False)} {a}, {m}, {b}"
+        self.addLine(s, stmt.comment)
+
+    def visitBcst(self, stmt: BcstStmt):
+        b = stmt.bcast_src.ugly
+        a = stmt.dest.ugly
+        # check if we broadcast a general register
+        if isinstance(stmt.bcast_src, Register):
+            instruction = self.iname('replgr2vr', stmt.dest, True)
+        else:
+            instruction = self.iname('ldrepl', stmt.dest, True)
+        s = f"{instruction} {a}, {b}"
+        self.addLine(s, stmt.comment)
+
+    def visitAdd(self, stmt: AddStmt):
+        if isinstance(stmt.src, Constant) and stmt.src.value == 0:
+            # avoid 0 instructions
+            return
+        if isinstance(stmt.src, Constant)
and (stmt.src.value > 2047 or stmt.src.value < -2048):
+            # we need an intermediate register here
+            # (itmp receives |value|; base is the register the constant is applied to)
+            # TODO: do not hard-code $r5 here, make well-defined
+            itmp = "$r5" if stmt.additional is None else stmt.dest.ugly
+            base = stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly
+            if stmt.src.value < 0:
+                addival, luival = self.to_addi(-stmt.src.value)
+            else:
+                addival, luival = self.to_addi(stmt.src.value)
+            self.addLine(f"lu12i.w {itmp}, {luival}", f"Intermediate add: place upper 12 bits of {stmt.src.value}")
+            if addival != 0:
+                self.addLine(f"addi.d {itmp}, {itmp}, {addival}", f"Intermediate add: place lower 12 bits of {stmt.src.value}")
+            if stmt.src.value < 0:
+                self.addLine(f"sub.d {stmt.dest.ugly}, {base}, {itmp}", stmt.comment)
+            else:
+                self.addLine(f"add.d {stmt.dest.ugly}, {base}, {itmp}", stmt.comment)
+        else:
+            # stmt.src is either not a Constant, or a Constant within -2048 <= value <= 2047:
+            # it can be passed directly as the last operand of addi.d
+            accumulate = stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly
+            self.addLine(f"addi.d {stmt.dest.ugly}, {accumulate}, {stmt.src.ugly}", stmt.comment)
+
+    def visitLabel(self, stmt: LabelStmt):
+        s = f"{stmt.label.ugly}:"
+        self.addLine(s, stmt.comment)
+
+    def visitCmp(self, stmt: CmpStmt):
+        raise NotImplementedError()
+
+    def visitJump(self, stmt: JumpStmt):
+        s = f"bne {stmt.cmpreg.ugly}, $r0, {stmt.destination.ugly}"
+        self.addLine(s, stmt.comment)
+
+    def visitMov(self, stmt: MovStmt):
+        if isinstance(stmt.src, Constant):
+            if stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4]:
+                assert stmt.src.ugly == '0'
+                self.addLine(f"{self.prefix(stmt.dest)}ldi {stmt.dest.ugly}, {stmt.src.ugly}", stmt.comment)
+            else:
+                if -2**11 <= stmt.src.value < 2**11:  # fits the signed 12-bit immediate of addi.w
+                    self.addLine(f"addi.w {stmt.dest.ugly}, $r0, {stmt.src.value}", stmt.comment)
+                elif stmt.src.value < 2**32:
+                    addival, luival = self.to_addi(stmt.src.value)
+                    self.addLine(f"lu12i.w {stmt.dest.ugly}, {luival}", "Intermediate mov: place upper 12 bits")
+                    if addival != 0:
+                        self.addLine(f"addi.w {stmt.dest.ugly}, {stmt.dest.ugly}, {addival}", stmt.comment)
+                else:
+                    raise NotImplementedError()
+        elif isinstance(stmt.src, Register):
+            if stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4]:
+                iname = self.iname('replgr2vr', stmt.dest, True)
+                self.addLine(f"{iname} {stmt.dest.ugly}, {stmt.src.ugly}", stmt.comment)
+            else:  # full-width copy: the general registers are declared as i64
+                self.addLine(f"addi.d {stmt.dest.ugly}, {stmt.src.ugly}, 0", stmt.comment)
+        else:
+            raise NotImplementedError()
+
+    def visitPrefetch(self, stmt: PrefetchStmt):
+        # map the requested cache level to the preld hint operand
+        hints = {"L3": "2", "L2": "1", "L1": "0"}
+        if stmt.closeness not in hints:
+            raise NotImplementedError(f"Unsupported prefetch closeness: {stmt.closeness}")
+        hint = hints[stmt.closeness]
+        # NOTE(review): hint values kept unchanged; confirm them against the preld definition
+        # TODO: maybe preldx here?
+        s = f"preld {hint}, {stmt.dest.ugly}"
+        self.addLine(s, stmt.comment)
+
+    def visitLoad(self, stmt: LoadStmt):
+        if stmt.dest.typeinfo == AsmType.f64:
+            s = f"fld.{self.psuffix} {stmt.dest.ugly}, {stmt.src.ugly}"
+        elif stmt.dest.typeinfo == AsmType.i64:
+            s = f"ld.d {stmt.dest.ugly}, {stmt.src.ugly}"
+        elif stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4] and stmt.aligned:
+            instr = f'{self.prefix(stmt.dest)}ld'
+            s = f"{instr} {stmt.dest.ugly}, {stmt.src.ugly}"
+        else:
+            raise NotImplementedError()
+        self.addLine(s, stmt.comment)
+
+    def visitStore(self, stmt: StoreStmt):
+        if stmt.src.typeinfo == AsmType.f64:
+            s = f"fst.{self.psuffix} {stmt.src.ugly}, {stmt.dest.ugly}"
+        elif stmt.src.typeinfo == AsmType.i64:
+            s = f"st.d {stmt.src.ugly}, {stmt.dest.ugly}"
+        elif stmt.src.typeinfo in [AsmType.f64x2, AsmType.f64x4] and stmt.aligned:
+            instr = f'{self.prefix(stmt.src)}st'
+            s = f"{instr} {stmt.src.ugly}, {stmt.dest.ugly}"
+        else:
+            raise NotImplementedError()
+        self.addLine(s, stmt.comment)
+
+    def visitBlock(self, block: Block):
+        self.stack.append(block)
+        self.depth += 1
+        if self.show_comments and block.comment != '':
+            self.addLine(None, block.comment)
+        for stmt in block.contents:
+            stmt.accept(self)
+        self.depth -= 1
+
self.stack.pop()
+
+# Convenience wrapper: print `s` with a fresh InlinePrinter and return the text.
+def render(s: AsmStmt, precision: Precision = Precision.DOUBLE):
+    p = InlinePrinter(precision)
+    s.accept(p)
+    return "\n".join(p.output)
diff --git a/pspamm/codegen/architectures/lsx/operands.py b/pspamm/codegen/architectures/lsx/operands.py
new file mode 100644
index 0000000..5267726
--- /dev/null
+++ b/pspamm/codegen/architectures/lsx/operands.py
@@ -0,0 +1,77 @@
+from pspamm.codegen.operands import *
+
+
+class Operand_LSX:
+    @property
+    def ugly(self):
+        raise NotImplementedError()
+
+
+# TODO: Rename this 'Immediate'
+class Constant_LSX(Constant):
+
+    @property
+    def ugly(self):
+        return f"{self.value}"
+
+
+def c(n):
+    """Sugar for conveniently defining integer constants"""
+    return Constant_LSX(value=int(n))
+
+
+
+class Label_LSX(Label):
+
+    @property
+    def ugly(self):
+        #return self.ordinal
+        return self.value.upper() + "_%="
+
+def l(label: str):
+    return Label_LSX(label)
+
+
+class Register_LSX(Register):
+
+    @property
+    def ugly(self):
+        return "$" + self.value
+
+r = lambda n: Register_LSX(AsmType.i64, "r"+str(n))
+vr = lambda n: Register_LSX(AsmType.f64x2, "vr"+str(n))
+xr = lambda n: Register_LSX(AsmType.f64x4, "xr"+str(n))
+
+
+
+
+class MemoryAddress_LSX(MemoryAddress):
+
+    def __init__(self,
+                 base: Register,
+                 disp: int,
+                 index: Register = None,
+                 scaling: int = None) -> None:
+        self.base = base
+        self.disp = disp
+        self.index = index
+        self.scaling = scaling
+
+    @property
+    def ugly(self):
+        #if self.index is None:
+        #    return f"{self.disp}({self.base.ugly})"
+        #return f"{self.disp}({self.base.ugly},{self.index.ugly},{self.scaling})"
+        return f"{self.base.ugly},{self.disp}"
+
+    def registers(self):
+        return [self.base, self.index]
+
+def mem(base, offset, index=None, scaling=None):
+    return MemoryAddress_LSX(base, offset, index, scaling)
+
+
+
+
+
+
diff --git a/pspamm/matmul.py b/pspamm/matmul.py
index ade311c..e856ba6 100644
--- a/pspamm/matmul.py
+++ b/pspamm/matmul.py
@@ -140,6 +140,24 @@ def __init__(self,
             # only 128 supported
             v_len_regs = 1
             arch = 'arm'
+
+        if
arch.startswith('lsx'): + if len(arch) == 3: + v_len_regs = 1 + else: + v_len_bits = int(arch[3:]) + assert v_len_bits in (128, 256) + v_len_regs = v_len_bits // 128 + arch = 'lsx' + + if arch.startswith('lasx'): + if len(arch) == 4: + v_len_regs = 2 + else: + v_len_bits = int(arch[4:]) + assert v_len_bits in (128, 256) + v_len_regs = v_len_bits // 128 + arch = 'lsx' self.arch = arch assert precision.lower() in ['bf16', 'h', 's', 'd'] diff --git a/tests/runlocal.sh b/tests/runlocal.sh index bb7998f..05bbd7e 100755 --- a/tests/runlocal.sh +++ b/tests/runlocal.sh @@ -34,4 +34,11 @@ elif [[ ${1:0:3} == "knl" ]]; then if [[ ${2} != "norun" ]]; then qemu-x86_64-static -cpu Skylake-Server build/${1}-test fi +elif [[ ${1:0:3} == "lsx" ]]; then + BITLEN=${1:3:6} + # TODO: once established, remove the -14 + loongarch64-linux-gnu-g++-14 -static -mlasx build/${1}_testsuite.cpp -o build/${1}-test + if [[ ${2} != "norun" ]]; then + qemu-loongarch64-static -cpu max build/${1}-test + fi fi diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index 2262a1f..45d0376 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -378,6 +378,10 @@ def make(kernels, arch): if not ((bn+bk) * vm <= 32) or not (bn*bk <= 30) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') continue + elif arch.startswith("lsx") or arch.startswith("lasx"): + if not ((bn+bk) * vm + bn * bk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue name = f'{kern.name}_{kern.precision}_{bm}_{bn}_{bk}' diff --git a/tests/unit_test.py b/tests/unit_test.py index bf5e73e..5bf40c6 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -23,7 +23,8 @@ "arm_sve": lambda blocksize: [blocksize.Max, blocksize.MaxK, blocksize.Cube], "knl": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxBn, blocksize.CubeBn], 
"hsw": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.Cube], - "rvv": lambda blocksize: [blocksize.MaxBn, blocksize.CubeBn] + "rvv": lambda blocksize: [blocksize.MaxBn, blocksize.CubeBn], + "lsx": lambda blocksize: [blocksize.Max] } blocksize_algs = scripts[archname](blocksize) + [blocksize.Default]