From 0ad3c54b10b661d54728714178dcd4d2e7d9d72d Mon Sep 17 00:00:00 2001
From: David Schneller <david.schneller@tum.de>
Date: Sat, 17 May 2025 12:24:33 +0200
Subject: [PATCH] Add LSX

---
 .github/workflows/codegen.yml                 |   6 +-
 README.md                                     |   3 +-
 pspamm/codegen/architectures/lsx/blocksize.py |  29 +++
 pspamm/codegen/architectures/lsx/generator.py | 236 ++++++++++++++++++
 .../architectures/lsx/inlineprinter.py        | 218 ++++++++++++++++
 pspamm/codegen/architectures/lsx/operands.py  |  77 ++++++
 pspamm/matmul.py                              |  18 ++
 tests/runlocal.sh                             |   7 +
 tests/testsuite_generator.py                  |   4 +
 tests/unit_test.py                            |   3 +-
 10 files changed, 597 insertions(+), 4 deletions(-)
 create mode 100644 pspamm/codegen/architectures/lsx/blocksize.py
 create mode 100644 pspamm/codegen/architectures/lsx/generator.py
 create mode 100644 pspamm/codegen/architectures/lsx/inlineprinter.py
 create mode 100644 pspamm/codegen/architectures/lsx/operands.py

diff --git a/.github/workflows/codegen.yml b/.github/workflows/codegen.yml
index ad0c8fb..61b2211 100644
--- a/.github/workflows/codegen.yml
+++ b/.github/workflows/codegen.yml
@@ -1,6 +1,6 @@
 name: codegen
 on:
-  - pull_request
+  - push
 
 jobs:
   install-pspamm:
@@ -65,12 +65,14 @@ jobs:
           - rvv256
           - rvv512
           - rvv1024
+          - lsx128
+          - lsx256
     steps:
       - name: apt-get
         run: |
           set -euo pipefail
           sudo apt-get update
-          sudo apt-get install g++-aarch64-linux-gnu g++-riscv64-linux-gnu qemu-user-static
+          sudo apt-get install g++-aarch64-linux-gnu g++-riscv64-linux-gnu g++-14-loongarch64-linux-gnu qemu-user-static
 
       - name: setup-python
         uses: actions/setup-python@v4
diff --git a/README.md b/README.md
index 17fa685..198b3b4 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@ Currently supported:
 * x86_64: AVX2, AVX512/AVX10.1
 * ARM/AARCH64: NEON, SVE (128,256,512,1024,2048 bit)
 * RISC-V: V (128,256,512,1024,2048,4096,8192 bit)
+* LoongArch: LSX, LASX
 
 ## Installation
 
@@ -25,7 +26,7 @@ pip install git+https://github.com/SeisSol/PSpaMM.git
 ```bash
 
 pspamm-generator M N K LDA LDB LDC ALPHA BETA \
-    --arch {arm,arm_sve{128..2048},knl{128..512},hsw{128..256},rvv{128..8192}} \
+    --arch {arm,arm_sve{128..2048},knl{128..512},hsw{128..256},rvv{128..8192},lsx{128..256}} \
     --amtx_filename MTX_FILE_PATH --bmtx_filename MTX_FILE_PATH \
     --output_funcname FUNCTION_NAME --output_filename OUTPUT_NAME
 
diff --git a/pspamm/codegen/architectures/lsx/blocksize.py b/pspamm/codegen/architectures/lsx/blocksize.py
new file mode 100644
index 0000000..db4fa3c
--- /dev/null
+++ b/pspamm/codegen/architectures/lsx/blocksize.py
@@ -0,0 +1,29 @@
+class Max:
+    @classmethod
+    def getBlocksize(cls, m, n, bk, v_size, prec):
+        """Return (bm, bn, bk): the largest-area tile that fits, with bk grown as far as possible."""
+        best_m, best_n = v_size, 1
+        best_area = 0
+
+        for rows in range(v_size, m+1, v_size):
+            for cols in range(1, n+1):
+                # can be replaced by cls.LSX_condition_extended here
+                # (but that seemed to be slower in the end)
+                if not cls.LSX_condition(rows, cols, bk, v_size):
+                    continue
+                if rows*cols > best_area:
+                    best_area = rows*cols
+                    best_m, best_n = rows, cols
+
+        while cls.LSX_condition(best_m, best_n, bk+1, v_size):
+            bk += 1
+
+        return (best_m, best_n, bk)
+
+    @classmethod
+    def LSX_condition(cls, bm, bn, bk, v_size):
+        # (bn + bk) * vm + bn * bk vector registers are needed; at most 32 are allowed
+        vm = -(bm // -v_size)  # ceiling division
+        return (bn + bk) * vm + bn * bk <= 32
+
+Default = Max
diff --git a/pspamm/codegen/architectures/lsx/generator.py b/pspamm/codegen/architectures/lsx/generator.py
new file mode 100644
index 0000000..836277d
--- /dev/null
+++ b/pspamm/codegen/architectures/lsx/generator.py
@@ -0,0 +1,236 @@
+from pspamm.cursors import *
+
+from pspamm.codegen.architectures.lsx.operands import *
+from pspamm.codegen.ast import *
+from pspamm.codegen.sugar import *
+from pspamm.codegen.generator import *
+from pspamm.codegen.precision import *
+from pspamm.codegen.regcache import *
+
+class Generator(AbstractGenerator):
+    template = """
+void {funcName} (const {real_type}* A, const {real_type}* B, {real_type}* C, {real_type} alpha, {real_type} beta, {real_type} const* prefetch) {{
+  __asm__ __volatile__(
+{body_text}
+    : : {args} : {clobbered});
+
+    #ifndef NDEBUG
+    #ifdef _OPENMP
+    #pragma omp atomic
+    #endif
+    pspamm_num_total_flops += {flop};
+    #endif
+}}
+"""
+    v_len = 2
+
+    def get_v_size(self):
+        return (16 // self.precision.size()) * self.v_len  # elements per 16 bytes, times v_len (assumes size() is in bytes - confirm)
+
+    def get_template(self):
+        return Generator.template
+
+    def use_broadcast(self):
+        return True
+
+    def has_masks(self):
+        return False
+
+    def init_mask(self, m, bm, v_size, tempreg, maskregs):
+        return block("")
+
+    def make_argument_load(self, starting_regs, prefetch):
+        # bind the asm input operands 0..5 to their starting registers, in declaration order
+        asm = block("Load arguments")
+        names = ['A', 'B', 'C', 'alpha', 'beta']
+        if prefetch:
+            # the prefetch pointer is only loaded when prefetching is requested
+            names.append('prefetch')
+        for idx, name in enumerate(names):
+            asm.add(ld(InputOperand(str(idx), 'm', name), starting_regs[idx], False))
+        return asm
+
+    def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int, prefetch: str):
+        assert(bm % v_size == 0)
+        vm = self.ceil_div(bm, v_size)  # number of vector registers per block column
+        # A (vm*bk), C (vm*bn) and B (bk*bn) registers together must fit into 32 registers
+        assert (bn + bk) * vm + bn * bk <= 32
+        # register constructor: vr for v_len == 1, xr for v_len == 2
+        vmm = {
+            1: vr,
+            2: xr
+        }[self.v_len]
+        # A takes registers 0 .. vm*bk-1, numbered column by column
+        A_regs = Matrix([[vmm(vm*c + r) for c in range(bk)] for r in range(vm)])
+        Aoffset = vm*bk
+        # B follows directly after A; C takes the topmost vm*bn registers (ending at 31)
+        B_regs = Matrix([[vmm(Aoffset + bn * r + c) for c in range(bn)] for r in range(bk)])
+        C_regs = Matrix([[vmm(32 - vm*bn + vm*c + r) for c in range(bn)]
+                                                     for r in range(vm)])
+        # alpha/beta use the same register numbers as the first B registers (b_reg == Aoffset)
+        b_reg = Aoffset
+        alpha_reg = [vmm(b_reg)] * 2
+        beta_reg = [vmm(b_reg + 1)] * 2
+        # targets of make_argument_load: A, B, C, alpha, beta, prefetch (index 6 is not loaded there)
+        starting_regs = [r(10), r(11), r(12), r(13), r(14), r(6), r(5)]
+        # index 0 and 3 are used as rebased base registers by move_register_block / make_microkernel
+        additional_regs = [r(15), r(16), r(17), r(31), r(7)]
+
+        loop_regs = [r(28), r(29), r(30)]
+        # boolean flag: True only for the 'BL2viaC' prefetch mode
+        prefetch_reg = prefetch == 'BL2viaC'
+        # the empty list is the ninth return value; its meaning is defined by the caller (not shown here)
+        return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, [], prefetch_reg
+
+    def make_scaling_offsets(self,
+                         additional_regs: List[Register],
+                         nnz: int
+                        ) -> Block:
+        return block("")
+
+    def init_block(self, size):
+        return block("")
+
+    def move_register_block(self,
+                            cursor: Cursor,
+                            cursor_ptr: CursorLocation,
+                            block_offset: Coords,
+                            registers: Matrix[Register],
+                            v_size: int,
+                            additional_regs,
+                            mask: Matrix[bool] = None,
+                            store: bool = False,
+                            prefetching: str = None,
+                            load_offset: int = 0,
+                            pf_cursor: Cursor = None,
+                            pf_cursor_ptr: CursorLocation = None,
+                            temp = None
+                           ) -> Block:
+        # Emit loads (store=False) or stores (store=True) between `registers` and the block of `cursor` at `block_offset`.
+        rows, cols = registers.shape
+        action = "Store" if store else "Load"
+        asm = block(f"{action} {cursor.name} register block @ {block_offset}")
+        # Displacements above max_offs are rebased onto additional_regs[0]; cur11 is the displacement last placed there.
+        max_offs = 2047
+        cur11 = 0
+
+        for ic in range(cols):
+            for ir in range(rows):
+                if (mask is None) or (mask[ir,ic]):
+                    all_coords = [Coords(down=ir*v_size+i,right=ic) for i in range(v_size)]  # the v_size cells covered by this register
+                    has_nonzero = [cursor.has_nonzero_cell(cursor_ptr, block_offset, offset) for offset in all_coords]
+                    if all(has_nonzero):
+                        cell_offset = all_coords[0]
+                        addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset)
+                        addr.disp += self.precision.size() * load_offset
+                        needsmove = False  # becomes True when additional_regs[0] is recomputed for this cell
+                        if addr.disp > max_offs:
+                            moved = addr.disp - cur11
+                            if moved > 0 and moved <= max_offs:
+                                addr.disp = moved  # reachable relative to the value already in additional_regs[0]
+                            else:
+                                asm.add(add(addr.disp, additional_regs[0], "", addr.base))
+                                cur11 = addr.disp
+                                addr.disp = 0
+                                needsmove = True
+
+                            addr.base = additional_regs[0]
+                        if store:
+                            asm.add(st(registers[ir,ic], addr, True, comment))
+                            if prefetching == 'BL2viaC' and pf_cursor is not None:
+                                addr, comment = pf_cursor.look(pf_cursor_ptr, block_offset, cell_offset)
+                                addr.disp += self.precision.size() * load_offset
+                                if addr.disp > max_offs:
+                                    moved = addr.disp - cur11
+                                    if needsmove:  # the prefetch base additional_regs[3] is recomputed in step with additional_regs[0]
+                                        asm.add(add(addr.disp, additional_regs[3], "", addr.base))
+                                        addr.disp = 0
+                                    else:
+                                        addr.disp = moved
+                                    addr.base = additional_regs[3]
+                                asm.add(prefetch(addr, closeness="L2"))
+                        else:
+                            asm.add(ld(addr, registers[ir,ic], True, comment))
+                    elif any(has_nonzero):  # a vector that is only partly populated cannot be moved
+                        raise NotImplementedError("Element-wise sparsity in A is not yet fully implemented.")
+        return asm
+
+    def make_zero_block(self, registers: Matrix[Register], additional_regs) -> Block:
+        """Build a block that moves the constant 0 into every register of `registers`."""
+        n_rows, n_cols = registers.shape
+        zeroing = block("zero registers")
+
+        # column-major order: all rows of one column before the next column
+        for col in range(n_cols):
+            for row in range(n_rows):
+                zeroing.add(mov(0, registers[row,col], True))
+        return zeroing
+
+
+    def make_microkernel(self,
+                         A: Cursor,
+                         B: Cursor,
+                         A_ptr: CursorLocation,
+                         B_ptr: CursorLocation,
+                         A_regs: Matrix[Register],
+                         B_regs,
+                         C_regs: Matrix[Register],
+                         v_size:int,
+                         additional_regs,
+                         to_A_block: Coords = Coords(),
+                         to_B_block: Coords = Coords(),
+                         sub: bool = False
+                        ) -> Block:
+
+        """ make_microkernel generates a GEMM microkernel for two blocks using the outer-product formulation.
+            It is responsible for loading and unloading the A block,
+            It does not assume that the A or B cursors point to the start of the block.
+            Instead, the coordinates to the start of the block are passed separately.
+            It does not modify any cursor pointers.
+        """
+        asm = block("Block GEMM microkernel")
+        bm,bk,aidx,apattern = A.get_block(A_ptr, to_A_block)
+        bk,bn,bidx,bpattern = B.get_block(B_ptr, to_B_block)
+        assert(bm % v_size == 0)
+
+        mask = sparse_mask(A_regs, A, A_ptr, to_A_block, B, B_ptr, to_B_block, v_size)
+        asm.add(self.move_register_block(A, A_ptr, to_A_block, A_regs, v_size, additional_regs, mask, store=False, temp=B_regs[0,0]))
+
+        Vm = self.ceil_div(bm, v_size)
+        cur11 = 0  # displacement last placed into additional_regs[0]
+        max_offs = 2047
+
+        bs = []  # B registers that already received their broadcast
+        for Vmi in range(Vm):
+            for bni in range(bn):   # inside this n-block
+                for bki in range(bk):       # inside this k-block
+                    to_bcell = Coords(down=bki, right=bni)
+                    to_acell = Coords(down=Vmi*v_size, right=bki)
+                    if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell):
+                        B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_bcell)
+                        if B_regs[bki, bni] not in bs:
+                            # max_offs bounds the displacement used directly in the broadcast load; larger ones are rebased onto additional_regs[0]
+                            if B_cell_addr.disp > max_offs:
+                                moved = B_cell_addr.disp - cur11
+                                if moved > 0 and moved <= max_offs:
+                                    B_cell_addr.disp = moved
+                                else:
+                                    asm.add(add(B_cell_addr.disp, additional_regs[0], "", B_cell_addr.base))
+                                    cur11 = B_cell_addr.disp
+                                    B_cell_addr.disp = 0
+
+                                B_cell_addr.base = additional_regs[0]
+                            
+                            asm.add(bcst(B_cell_addr, B_regs[bki, bni], B_comment))
+                            bs.append(B_regs[bki, bni])
+        # second pass: one fused multiply-add per (k, vector row, n) combination that is nonzero in both A and B
+        for bki in range(bk):       # inside this k-block
+            for Vmi in range(Vm):
+                for bni in range(bn):   # inside this n-block
+                    to_bcell = Coords(down=bki, right=bni)
+                    to_acell = Coords(down=Vmi*v_size, right=bki)
+                    if B.has_nonzero_cell(B_ptr, to_B_block, to_bcell) and A.has_nonzero_cell(A_ptr, to_A_block, to_acell):
+                        _, B_comment = B.look(B_ptr, to_B_block, to_bcell)
+                        comment = f"C[{Vmi*v_size}:{Vmi*v_size+v_size},{bni}] += A[{Vmi*v_size}:{Vmi*v_size+v_size},{bki}]*{B_comment}"
+                        asm.add(fma(B_regs[bki, bni], A_regs[Vmi, bki], C_regs[Vmi, bni], comment=comment, bcast=None, sub=sub))
+        return asm
diff --git a/pspamm/codegen/architectures/lsx/inlineprinter.py b/pspamm/codegen/architectures/lsx/inlineprinter.py
new file mode 100644
index 0000000..fe13715
--- /dev/null
+++ b/pspamm/codegen/architectures/lsx/inlineprinter.py
@@ -0,0 +1,218 @@
+from typing import List
+from pspamm.codegen.ast import *
+from pspamm.codegen.visitor import Visitor
+from pspamm.codegen.operands import *
+from pspamm.codegen.precision import *
+
+
+class InlinePrinter(Visitor):
+
+    show_comments = False
+    indent = "  "
+    depth = 0
+    lmargin = 0
+    rmargin = 70
+    vpadding = False
+    output = None
+    stack = None
+
+
+    def __init__(self, precision: Precision):
+        self.output = []  # rendered source lines, in emission order
+        self.stack = []  # blocks currently being visited (pushed and popped by visitBlock)
+        assert precision in (Precision.SINGLE, Precision.DOUBLE)
+        self.precision = precision
+        self.psuffix = {  # mnemonic suffix used by iname() when bp is False
+            Precision.DOUBLE: "d",
+            Precision.SINGLE: "s"
+        }[precision]
+        self.bpsuffix = {  # mnemonic suffix used by iname() when bp is True
+            Precision.DOUBLE: "d",
+            Precision.SINGLE: "w"
+        }[precision]
+
+    def show(self):
+        print("\n".join(self.output))
+
+    def addLine(self, stmt: str, comment: str):
+        # Append one output line: `stmt` as a quoted string literal and/or `comment` as a // comment.
+        line = " "*self.lmargin + self.indent*self.depth
+        # statement plus comment: pad the literal to rmargin so the comments line up
+        if stmt is not None and comment is not None and self.show_comments:
+            stmt = '"' + stmt + '\\r\\n"'
+            line += stmt.ljust(self.rmargin) + "// " + comment
+        # statement only (also taken when comments are switched off)
+        elif stmt is not None:
+            line += '"' + stmt + '\\r\\n"'
+        # comment only; show_comments is not checked here
+        elif stmt is None and comment is not None:
+            line += "// " + comment
+        # when both are None, the bare indentation is appended
+        self.output.append(line)
+
+    def prefix(self, register):
+        return {  # mnemonic prefix chosen by register.size(): 16 -> "v", 32 -> "xv"; other sizes raise KeyError
+            16: "v",
+            32: "xv"
+        }[register.size()]
+    
+    def iname(self, root, refreg, bp):
+        prefix = self.prefix(refreg)  # "v" or "xv", depending on the size of refreg
+        suffix = self.bpsuffix if bp else self.psuffix  # bp selects bpsuffix ("d"/"w") instead of psuffix ("d"/"s")
+        return f"{prefix}{root}.{suffix}"
+    
+    def to_addi(self, value):
+        """Split value into (low, high) with value == (high << 12) + low and -2048 <= low <= 2047."""
+        ADDILENGTH = 12
+        ADDIBLOCK = (1 << ADDILENGTH) - 1
+        ADDISMAX = (1 << (ADDILENGTH - 1)) - 1
+        addipart = value & ADDIBLOCK
+        luipart = value >> ADDILENGTH
+        # 2047 still fits a signed 12-bit immediate; only larger low parts borrow from the upper part
+        if addipart > ADDISMAX:
+            addipart -= 1 << ADDILENGTH
+            luipart += 1
+        return addipart, luipart
+
+    def visitFma(self, stmt: FmaStmt):
+        b = stmt.bcast_src.ugly
+        m = stmt.mult_src.ugly
+        a = stmt.add_dest.ugly
+
+        # nmsub is used for c' = -a*b + c
+        op = "fnmsub" if stmt.sub else "fmadd"
+
+        # no broadcasting supported inside the instruction (unlike AVX-512)
+        s = f"{self.iname(op, stmt.add_dest, False)} {a}, {m}, {b}, {a}"  # the accumulator is both the first and the last operand
+        self.addLine(s, stmt.comment)
+
+    def visitMul(self, stmt: MulStmt):
+        b = stmt.src.ugly
+        m = stmt.mult_src.ugly
+        a = stmt.dest.ugly
+        s = f"{self.iname('fmul', stmt.dest, False)} {a}, {m}, {b}"
+        self.addLine(s, stmt.comment)
+
+    def visitBcst(self, stmt: BcstStmt):
+        b = stmt.bcast_src.ugly
+        a = stmt.dest.ugly
+        # check if we broadcast a general register
+        if isinstance(stmt.bcast_src, Register):
+            instruction = self.iname('replgr2vr', stmt.dest, True)
+        else:  # any other source is printed as the operand of a replicating load
+            instruction = self.iname('ldrepl', stmt.dest, True)
+        s = f"{instruction} {a}, {b}"
+        self.addLine(s, stmt.comment)
+
+    def visitAdd(self, stmt: AddStmt):
+        """Emit ``dest = base + src``, where base is ``additional`` if given, else ``dest``.
+
+        Constants outside the signed 12-bit range are built in a register first.
+        """
+        if isinstance(stmt.src, Constant) and stmt.src.value == 0:
+            # avoid 0 instructions
+            return
+        # first summand of the addition
+        base = stmt.dest.ugly if stmt.additional is None else stmt.additional.ugly
+        if isinstance(stmt.src, Constant) and (stmt.src.value > 2047 or stmt.src.value < -2048):
+            # we need an intermediate register here; it receives |value|
+
+            # TODO: do not hard-code r5 here, make well-defined
+            itmp = "$r5" if stmt.additional is None else stmt.dest.ugly
+            addival, luival = self.to_addi(abs(stmt.src.value))
+            self.addLine(f"lu12i.w {itmp}, {luival}", f"Intermediate add: place upper 12 bits of {stmt.src.value}")
+            if addival != 0:
+                self.addLine(f"addi.d {itmp}, {itmp}, {addival}", f"Intermediate add: place lower 12 bits of {stmt.src.value}")
+            # sub.d is not commutative: base must be the minuend and |value| the subtrahend
+            # (with `additional` set, the former operand order computed |value| - base)
+            op = "sub.d" if stmt.src.value < 0 else "add.d"
+            self.addLine(f"{op} {stmt.dest.ugly}, {base}, {itmp}", stmt.comment)
+        else:
+            # a Constant reaching this branch lies within [-2048, 2047] and fits the addi.d immediate
+            self.addLine(f"addi.d {stmt.dest.ugly}, {base}, {stmt.src.ugly}", stmt.comment)
+
+    def visitLabel(self, stmt: LabelStmt):
+        s = f"{stmt.label.ugly}:"
+        self.addLine(s, stmt.comment)
+
+    def visitCmp(self, stmt: CmpStmt):
+        raise NotImplementedError()
+
+    def visitJump(self, stmt: JumpStmt):
+        s = f"bne {stmt.cmpreg.ugly}, $r0, {stmt.destination.ugly}"  # branch to destination while cmpreg differs from $r0
+        self.addLine(s, stmt.comment)
+
+    def visitMov(self, stmt: MovStmt):
+        """Move a Constant or a Register into stmt.dest (vector destinations: zero or broadcast only)."""
+        if isinstance(stmt.src, Constant):
+            if stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4]:
+                assert stmt.src.ugly == '0'
+                self.addLine(f"{self.prefix(stmt.dest)}ldi {stmt.dest.ugly}, {stmt.src.ugly}", stmt.comment)
+            elif -2**11 <= stmt.src.value < 2**11:
+                # addi.w takes a signed 12-bit immediate, so 2048..4095 must use the two-instruction path
+                self.addLine(f"addi.w {stmt.dest.ugly}, $r0, {stmt.src.value}", stmt.comment)
+            elif stmt.src.value < 2**32:
+                addival, luival = self.to_addi(stmt.src.value)
+                self.addLine(f"lu12i.w {stmt.dest.ugly}, {luival}", "Intermediate mov: place upper 12 bits")
+                if addival != 0:
+                    self.addLine(f"addi.w {stmt.dest.ugly}, {stmt.dest.ugly}, {addival}", stmt.comment)
+            else:
+                raise NotImplementedError()
+        elif isinstance(stmt.src, Register):
+            if stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4]:
+                self.addLine(f"{self.iname('replgr2vr', stmt.dest, True)} {stmt.dest.ugly}, {stmt.src.ugly}", stmt.comment)
+            else:  # general registers are 64-bit here: addi.d copies all bits, addi.w would keep only 32
+                self.addLine(f"addi.d {stmt.dest.ugly}, {stmt.src.ugly}, 0", stmt.comment)
+        else:
+            raise NotImplementedError()
+
+    def visitPrefetch(self, stmt: PrefetchStmt):
+        """Emit a ``preld`` on stmt.dest; the hint number is selected by stmt.closeness."""
+        hints = {"L1": "0", "L2": "1", "L3": "2"}
+        if stmt.closeness not in hints:
+            # `hint` used to stay unbound here and surfaced as an UnboundLocalError
+            raise NotImplementedError(f"Unsupported prefetch closeness: {stmt.closeness}")
+        hint = hints[stmt.closeness]
+        # TODO: maybe preldx here?
+        s = f"preld {hint}, {stmt.dest.ugly}"
+        self.addLine(s, stmt.comment)
+    
+    def visitLoad(self, stmt: LoadStmt):
+        """Load stmt.src into stmt.dest; scalar floats use fld.s/fld.d (ugly_precision never existed)."""
+        if stmt.dest.typeinfo == AsmType.f64:
+            s = f"fld.{self.psuffix} {stmt.dest.ugly}, {stmt.src.ugly}"
+        elif stmt.dest.typeinfo == AsmType.i64:
+            s = f"ld.d {stmt.dest.ugly}, {stmt.src.ugly}"
+        elif stmt.dest.typeinfo in [AsmType.f64x2, AsmType.f64x4] and stmt.aligned:
+            s = f"{self.prefix(stmt.dest)}ld {stmt.dest.ugly}, {stmt.src.ugly}"
+        else:
+            raise NotImplementedError()
+        self.addLine(s, stmt.comment)
+
+    def visitStore(self, stmt: StoreStmt):
+        """Store stmt.src to stmt.dest; scalar floats use fst.s/fst.d (ugly_precision never existed)."""
+        if stmt.src.typeinfo == AsmType.f64:
+            s = f"fst.{self.psuffix} {stmt.src.ugly}, {stmt.dest.ugly}"
+        elif stmt.src.typeinfo == AsmType.i64:
+            s = f"st.d {stmt.src.ugly}, {stmt.dest.ugly}"
+        elif stmt.src.typeinfo in [AsmType.f64x2, AsmType.f64x4] and stmt.aligned:
+            s = f"{self.prefix(stmt.src)}st {stmt.src.ugly}, {stmt.dest.ugly}"
+        else:
+            raise NotImplementedError()
+        self.addLine(s, stmt.comment)
+
+    def visitBlock(self, block: Block):
+        self.stack.append(block)  # track nesting while the block's contents are visited
+        self.depth += 1  # contents are indented one level deeper (see addLine)
+        if self.show_comments and block.comment != '':
+            self.addLine(None, block.comment)
+        for stmt in block.contents:
+            stmt.accept(self)
+        self.depth -= 1
+        self.stack.pop()
+
+
+def render(s: AsmStmt, precision: Precision = Precision.DOUBLE):  # InlinePrinter requires a precision; the default keeps render(s) callable
+    p = InlinePrinter(precision)
+    s.accept(p)  # the printer collects one output line per visited statement
+    return "\n".join(p.output)
diff --git a/pspamm/codegen/architectures/lsx/operands.py b/pspamm/codegen/architectures/lsx/operands.py
new file mode 100644
index 0000000..5267726
--- /dev/null
+++ b/pspamm/codegen/architectures/lsx/operands.py
@@ -0,0 +1,77 @@
+from pspamm.codegen.operands import *
+
+
+class Operand_LSX:
+    @property
+    def ugly(self):
+        raise NotImplementedError()
+
+
+# TODO: Rename this 'Immediate'
+class Constant_LSX(Constant):
+
+    @property
+    def ugly(self):
+        return f"{self.value}"
+
+
+def c(n):
+    """Sugar for conveniently defining integer constants"""
+    return Constant_LSX(value=int(n))
+
+
+
+class Label_LSX(Label):
+
+    @property
+    def ugly(self):
+        # "%=" is replaced by GCC with a number unique to each asm statement, keeping labels distinct
+        return self.value.upper() + "_%="
+
+def l(label: str):
+    return Label_LSX(label)
+
+
+class Register_LSX(Register):
+
+    @property
+    def ugly(self):
+        return "$" + self.value  # printed form: "$" followed by the register name, e.g. "$r10"
+
+r   = lambda n: Register_LSX(AsmType.i64, "r"+str(n))
+vr = lambda n: Register_LSX(AsmType.f64x2, "vr"+str(n))
+xr = lambda n: Register_LSX(AsmType.f64x4, "xr"+str(n))
+
+
+
+
+class MemoryAddress_LSX(MemoryAddress):
+    # Memory operand that is printed as "<base>,<disp>".
+    def __init__(self,
+                 base: Register,
+                 disp: int,
+                 index: Register = None,
+                 scaling: int = None) -> None:
+        self.base = base
+        self.disp = disp
+        self.index = index
+        self.scaling = scaling
+
+    @property
+    def ugly(self):
+        # Only base and displacement are printed; index and scaling are stored
+        # but never rendered here.
+        # NOTE(review): presumably the emitted instructions only take base + immediate - confirm.
+        return f"{self.base.ugly},{self.disp}"
+    
+    def registers(self):
+        return [self.base, self.index]  # index may be None (its default)
+
+def mem(base, offset, index=None, scaling=None):
+    return MemoryAddress_LSX(base, offset, index, scaling)
+
+
+
+
+
+
diff --git a/pspamm/matmul.py b/pspamm/matmul.py
index ade311c..e856ba6 100644
--- a/pspamm/matmul.py
+++ b/pspamm/matmul.py
@@ -140,6 +140,24 @@ def __init__(self,
           # only 128 supported
           v_len_regs = 1
           arch = 'arm'
+        
+        if arch.startswith('lsx'):
+          if len(arch) == 3:
+            v_len_regs = 1
+          else:
+            v_len_bits = int(arch[3:])
+            assert v_len_bits in (128, 256)
+            v_len_regs = v_len_bits // 128
+          arch = 'lsx'
+        
+        if arch.startswith('lasx'):
+          if len(arch) == 4:
+            v_len_regs = 2
+          else:
+            v_len_bits = int(arch[4:])
+            assert v_len_bits in (128, 256)
+            v_len_regs = v_len_bits // 128
+          arch = 'lsx'
 
         self.arch = arch
         assert precision.lower() in ['bf16', 'h', 's', 'd']
diff --git a/tests/runlocal.sh b/tests/runlocal.sh
index bb7998f..05bbd7e 100755
--- a/tests/runlocal.sh
+++ b/tests/runlocal.sh
@@ -34,4 +34,11 @@ elif [[ ${1:0:3} == "knl" ]]; then
     if [[ ${2} != "norun" ]]; then
         qemu-x86_64-static -cpu Skylake-Server build/${1}-test
     fi
+elif [[ ${1:0:3} == "lsx" ]]; then
+    BITLEN=${1:3:6}
+    # TODO: once established, remove the -14
+    loongarch64-linux-gnu-g++-14 -static -mlasx build/${1}_testsuite.cpp -o build/${1}-test
+    if [[ ${2} != "norun" ]]; then
+        qemu-loongarch64-static -cpu max build/${1}-test
+    fi
 fi
diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py
index 2262a1f..45d0376 100755
--- a/tests/testsuite_generator.py
+++ b/tests/testsuite_generator.py
@@ -378,6 +378,10 @@ def make(kernels, arch):
               if not ((bn+bk) * vm <= 32) or not (bn*bk <= 30) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0:
                 print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}')
                 continue
+            elif arch.startswith("lsx") or arch.startswith("lasx"):
+              if not ((bn+bk) * vm + bn * bk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0:
+                print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}')
+                continue
 
             name = f'{kern.name}_{kern.precision}_{bm}_{bn}_{bk}'
 
diff --git a/tests/unit_test.py b/tests/unit_test.py
index bf5e73e..5bf40c6 100644
--- a/tests/unit_test.py
+++ b/tests/unit_test.py
@@ -23,7 +23,8 @@
     "arm_sve": lambda blocksize: [blocksize.Max, blocksize.MaxK, blocksize.Cube],
     "knl": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxBn, blocksize.CubeBn],
     "hsw": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.Cube],
-    "rvv": lambda blocksize: [blocksize.MaxBn, blocksize.CubeBn]
+    "rvv": lambda blocksize: [blocksize.MaxBn, blocksize.CubeBn],
+    "lsx": lambda blocksize: [blocksize.Max]
 }
 
 blocksize_algs = scripts[archname](blocksize) + [blocksize.Default]