From ff143d9dd621bea6bd1bad00be402043b1654158 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 4 Aug 2025 04:04:03 +0000 Subject: [PATCH 1/3] [Cleanup] Remove deprecated modules --- AsmParser/riscv_parser.py | 997 ------- GemminiLowerPass/CMakeLists.txt | 38 - GemminiLowerPass/LowerGemminiPass.cpp | 2566 ----------------- GemminiLowerPass/LowerGemminiPass.h | 37 - PyTorchSimFrontend/extension_codecache.py | 117 - PyTorchSimFrontend/llvm/llvm_autotune.py | 75 - .../llvm/llvm_codegen_backend.py | 1157 -------- PyTorchSimFrontend/llvm/llvm_conv_template.py | 230 -- PyTorchSimFrontend/llvm/llvm_gemm_template.py | 139 - PyTorchSimFrontend/llvm/llvm_lowering.py | 91 - PyTorchSimFrontend/llvm/llvm_template.py | 234 -- 11 files changed, 5681 deletions(-) delete mode 100644 AsmParser/riscv_parser.py delete mode 100644 GemminiLowerPass/CMakeLists.txt delete mode 100644 GemminiLowerPass/LowerGemminiPass.cpp delete mode 100644 GemminiLowerPass/LowerGemminiPass.h delete mode 100644 PyTorchSimFrontend/llvm/llvm_autotune.py delete mode 100644 PyTorchSimFrontend/llvm/llvm_codegen_backend.py delete mode 100644 PyTorchSimFrontend/llvm/llvm_conv_template.py delete mode 100644 PyTorchSimFrontend/llvm/llvm_gemm_template.py delete mode 100644 PyTorchSimFrontend/llvm/llvm_lowering.py delete mode 100644 PyTorchSimFrontend/llvm/llvm_template.py diff --git a/AsmParser/riscv_parser.py b/AsmParser/riscv_parser.py deleted file mode 100644 index 88293395..00000000 --- a/AsmParser/riscv_parser.py +++ /dev/null @@ -1,997 +0,0 @@ -from collections import OrderedDict -from itertools import chain -import onnx -if __name__ == "__main__": - from onnx_utility import loop_index_node, load_node, store_node, compute_node, connect_nodes, dump_onnx_graph -else: - from AsmParser.onnx_utility import loop_index_node, loop_end_node, load_node, store_node, compute_node, connect_nodes, dump_onnx_graph - - -# Operand Attributes -MEM = 0x400 -OFFSET = 0x200 -DEST = 0x100 - -REGISTER = 0x000 -IMMEDIATE = 0x001 
-SPECIAL = 0x002 -LABEL = 0x004 -TYPE_MASK = (IMMEDIATE|SPECIAL|LABEL) - -R_TEMPLATE = [REGISTER|DEST, REGISTER, REGISTER] -I_TEMPLATE = [REGISTER|DEST, REGISTER, IMMEDIATE] -U_TEMPLATE = [REGISTER|DEST, IMMEDIATE] -B_TEMPLATE = [REGISTER, REGISTER, LABEL] - -LOAD_TEMPLATE = [REGISTER|DEST, REGISTER|OFFSET] -STORE_TEMPLATE = [REGISTER, REGISTER|OFFSET] - -NOP_TEMPLATE = [] -RI_TEMPLATE = [REGISTER|DEST, IMMEDIATE] -PSEUDO_B_TEMPLATE = [REGISTER, LABEL] -PSEUDO_J_TEMPLATE = [LABEL] - -CSR_REG_TEMPLATE = [REGISTER|DEST, SPECIAL, REGISTER] -CSR_IMM_TEMPLATE = [REGISTER|DEST, SPECIAL, IMMEDIATE] - -CSR_ALIAS_WRITE_TEMPLATE = [SPECIAL, REGISTER] -CSR_ALIAS_READ_TEMPLATE = [REGISTER|DEST, SPECIAL] -CSR_ALIAS_IMM_TEPLATE = [SPECIAL, IMMEDIATE] - -ATOMIC_TEMPLATE = [REGISTER|DEST, REGISTER, REGISTER|MEM] -ATOMIC_LOAD_TEMPLATE = [REGISTER|DEST, REGISTER|MEM] - -R4_TEMPLATE = [REGISTER|DEST, REGISTER, REGISTER, REGISTER] -R2_TEMPLATE = [REGISTER|DEST, REGISTER] - -VECTOR_LOAD_TEMPLATE = [REGISTER|DEST, REGISTER|MEM] -VECTOR_STORE_TEMPLATE = [REGISTER, REGISTER|MEM] - -VECTOR_STRIDE_LOAD_TEMPLATE = [REGISTER|DEST, REGISTER|MEM, REGISTER] -VECTOR_STRIDE_STORE_TEMPLATE = [REGISTER, REGISTER|MEM, REGISTER] - -VECTOR_VV_TEMPLATE = [REGISTER|DEST, REGISTER, REGISTER] -VECTOR_VX_TEMPLATE = [REGISTER|DEST, REGISTER, REGISTER] -VECTOR_VF_TEMPLATE = [REGISTER|DEST, REGISTER, REGISTER] -VECTOR_VI_TEMPLATE = [REGISTER|DEST, REGISTER, IMMEDIATE] -VECTOR_WV_TEMPLATE = [REGISTER|DEST, REGISTER, REGISTER] -VECTOR_WX_TEMPLATE = [REGISTER|DEST, REGISTER, REGISTER] - -CUSTOM_R_TEMPLATE = [REGISTER|DEST, REGISTER, REGISTER] - -VSETVLI_TEMPLATE = [REGISTER|DEST, REGISTER] -VSETIVLI_TEMPLATE = [REGISTER|DEST, IMMEDIATE] -VSETVL_TEMPLATE = [REGISTER|DEST, REGISTER, REGISTER] - -R32_INSTUCTION_TEMPLATE = { - # RV32 - "addi": I_TEMPLATE, - "slti": I_TEMPLATE, - "sltiu": I_TEMPLATE, - "xori": I_TEMPLATE, - "ori": I_TEMPLATE, - "andi": I_TEMPLATE, - "lui": RI_TEMPLATE, - "slli": I_TEMPLATE, - "srli": 
I_TEMPLATE, - "srai": I_TEMPLATE, - - "add": R_TEMPLATE, - "sub": R_TEMPLATE, - "slt": R_TEMPLATE, - "sltu": R_TEMPLATE, - "xor": R_TEMPLATE, - "or": R_TEMPLATE, - "and": R_TEMPLATE, - "sll": R_TEMPLATE, - "srl": R_TEMPLATE, - "sra": R_TEMPLATE, - - "beq": B_TEMPLATE, - "bne": B_TEMPLATE, - "blt": B_TEMPLATE, - "bge": B_TEMPLATE, - "bltu": B_TEMPLATE, - "bgeu": B_TEMPLATE, - - "lb": LOAD_TEMPLATE, - "lh": LOAD_TEMPLATE, - "lw": LOAD_TEMPLATE, - "lbu": LOAD_TEMPLATE, - "lhu": LOAD_TEMPLATE, - "sb": STORE_TEMPLATE, - "sh": STORE_TEMPLATE, - "sw": STORE_TEMPLATE, - - "csrc": CSR_ALIAS_WRITE_TEMPLATE, - "csrr": CSR_ALIAS_READ_TEMPLATE, - "csrw": CSR_ALIAS_WRITE_TEMPLATE, - "csrci": CSR_ALIAS_IMM_TEPLATE, - "csrsi": CSR_ALIAS_IMM_TEPLATE, - "csrwi": CSR_ALIAS_IMM_TEPLATE, -} - -R64_INSTRUCTION_TEMPLATE = { - # RV64 - "addiw": I_TEMPLATE, - - "slliw": I_TEMPLATE, - "srliw": I_TEMPLATE, - "sraiw": I_TEMPLATE, - - "addw": R_TEMPLATE, - "subw": R_TEMPLATE, - - "sllw": R_TEMPLATE, - "srlw": R_TEMPLATE, - "sraw": R_TEMPLATE, - - "ld": LOAD_TEMPLATE, - "lwu": LOAD_TEMPLATE, - "sd": STORE_TEMPLATE -} - -PSEUDO_INSTRUCTION_TEMPLATE = { - "li": RI_TEMPLATE, - "mv": R2_TEMPLATE, - "not": R2_TEMPLATE, - "neg": R2_TEMPLATE, - "negw": R2_TEMPLATE, - "sext.w": R2_TEMPLATE, - "seqz": R2_TEMPLATE, - "snez": R2_TEMPLATE, - "sltz": R2_TEMPLATE, - "sgtz": R2_TEMPLATE, - "fmv.s": R2_TEMPLATE, - "fabs.s": R2_TEMPLATE, - "fneg.s": R2_TEMPLATE, - "fmv.d": R2_TEMPLATE, - "fabs.d": R2_TEMPLATE, - "fneg.d": R2_TEMPLATE, - - "beqz": PSEUDO_B_TEMPLATE, - "bnez": PSEUDO_B_TEMPLATE, - "blez": PSEUDO_B_TEMPLATE, - "bgez": PSEUDO_B_TEMPLATE, - "bltz": PSEUDO_B_TEMPLATE, - "bgtz": PSEUDO_B_TEMPLATE, - "bgt": B_TEMPLATE, - "ble": B_TEMPLATE, - "bgtu": B_TEMPLATE, - "bleu": B_TEMPLATE, - "ret": NOP_TEMPLATE, - "j": PSEUDO_J_TEMPLATE -} - -CSR_INSTURCTION_TEMPLATE = { - # Zicsr - "csrrw": CSR_REG_TEMPLATE, - "csrrs": CSR_REG_TEMPLATE, - "csrrc": CSR_REG_TEMPLATE, - "csrrwi": CSR_IMM_TEMPLATE, - "csrrsi": 
CSR_IMM_TEMPLATE, - "csrrci": CSR_IMM_TEMPLATE -} - -MUL_INSTRUCTION_TEMPLATE = { - # RV32M/64M - "mul": R_TEMPLATE, - "mulh": R_TEMPLATE, - "mulhsu": R_TEMPLATE, - "mulhu": R_TEMPLATE, - "div": R_TEMPLATE, - "divu": R_TEMPLATE, - "rem": R_TEMPLATE, - "remu": R_TEMPLATE, - - "mulw": R_TEMPLATE, - "divw": R_TEMPLATE, - "divuw": R_TEMPLATE, - "remw": R_TEMPLATE, - "remuw": R_TEMPLATE, -} - -ATOMIC_INSTRUCTION_TEMPLATE = { - "lr.w": ATOMIC_LOAD_TEMPLATE, - "sc.w": ATOMIC_TEMPLATE, - "amoadd.w": ATOMIC_TEMPLATE, - "amoswap.w": ATOMIC_TEMPLATE, - "amoxor.w": ATOMIC_TEMPLATE, - "amoor.w": ATOMIC_TEMPLATE, - "amoand.w": ATOMIC_TEMPLATE, - "amomin.w": ATOMIC_TEMPLATE, - "amomax.w": ATOMIC_TEMPLATE, - "amominu.w": ATOMIC_TEMPLATE, - "amomaxu.w": ATOMIC_TEMPLATE, - - "lr.d": ATOMIC_LOAD_TEMPLATE, - "sc.d": ATOMIC_TEMPLATE, - "amoadd.d": ATOMIC_TEMPLATE, - "amoswap.d": ATOMIC_TEMPLATE, - "amoxor.d": ATOMIC_TEMPLATE, - "amoor.d": ATOMIC_TEMPLATE, - "amoand.d": ATOMIC_TEMPLATE, - "amomin.d": ATOMIC_TEMPLATE, - "amomax.d": ATOMIC_TEMPLATE, - "amominu.d": ATOMIC_TEMPLATE, - "amomaxu.d": ATOMIC_TEMPLATE, -} - -FLOATING_INSTRUCTION_TEMPLATE = { - # RV32F/64F - "flw": LOAD_TEMPLATE, - "fsw": STORE_TEMPLATE, - - "fmadd.s": R4_TEMPLATE, - "fmsub.s": R4_TEMPLATE, - "fnmsub.s": R4_TEMPLATE, - "fnmadd.s": R4_TEMPLATE, - "fadd.s": R_TEMPLATE, - "fsub.s": R_TEMPLATE, - "fmul.s": R_TEMPLATE, - "fdiv.s": R_TEMPLATE, - "fsqrt.s": R2_TEMPLATE, - "fmin.s" : R_TEMPLATE, - "fmax.s" : R_TEMPLATE, - - "fcvt.w.s": R2_TEMPLATE, - "fcvt.wu.s": R2_TEMPLATE, - "fcvt.s.w": R2_TEMPLATE, - "fcvt.s.wu": R2_TEMPLATE, - - "fsgnj.s": R_TEMPLATE, - "fsgnjn.s": R_TEMPLATE, - "fsgnjx.s": R_TEMPLATE, - "fmv.x.w": R2_TEMPLATE, - "fmv.w.x": R2_TEMPLATE, - - "feq.s": R_TEMPLATE, - "flt.s": R_TEMPLATE, - "fle.s": R_TEMPLATE, - "fclass.s": R2_TEMPLATE, - - "fcvt.l.s": R2_TEMPLATE, - "fcvt.lu.s": R2_TEMPLATE, - "fcvt.s.l": R2_TEMPLATE, - "fcvt.s.lu": R2_TEMPLATE, - - "fld": LOAD_TEMPLATE, - "fsd": STORE_TEMPLATE, - - 
"fmadd.d": R4_TEMPLATE, - "fmsub.d": R4_TEMPLATE, - "fnmsub.d": R4_TEMPLATE, - "fnmadd.d": R4_TEMPLATE, - "fadd.d": R_TEMPLATE, - "fsub.d": R_TEMPLATE, - "fmul.d": R_TEMPLATE, - "fdiv.d": R_TEMPLATE, - "fsqrt.d": R2_TEMPLATE, - "fmin.d" : R_TEMPLATE, - "fmax.d" : R_TEMPLATE, - - "fcvt.d.s": R2_TEMPLATE, - "fcvt.s.d": R2_TEMPLATE, - "fcvt.w.d": R2_TEMPLATE, - "fcvt.wu.d": R2_TEMPLATE, - "fcvt.d.w": R2_TEMPLATE, - "fcvt.d.wu": R2_TEMPLATE, - - "fsgnj.d": R_TEMPLATE, - "fsgnjn": R_TEMPLATE, - "fsgnjx": R_TEMPLATE, - - "feq.d": R_TEMPLATE, - "flt.d": R_TEMPLATE, - "fle.d": R_TEMPLATE, - "fclass.d": R2_TEMPLATE, - - "fcvt.l.d": R2_TEMPLATE, - "fcvt.lu.d": R2_TEMPLATE, - "fcvt.d.l": R2_TEMPLATE, - "fcvt.d.lu": R2_TEMPLATE, - - "fmv.x.d": R2_TEMPLATE, - "fmv.d.x": R2_TEMPLATE -} - -VECTOR_INSTRUCTION_TEMPLATE = { - # Vector mask is not supported - "vle8.v": VECTOR_LOAD_TEMPLATE, - "vle16.v": VECTOR_LOAD_TEMPLATE, - "vle32.v": VECTOR_LOAD_TEMPLATE, - "vle64.v": VECTOR_LOAD_TEMPLATE, - "vlm.v": VECTOR_LOAD_TEMPLATE, - "vle8ff.v": VECTOR_LOAD_TEMPLATE, - "vle16ff.v": VECTOR_LOAD_TEMPLATE, - "vle32ff.v": VECTOR_LOAD_TEMPLATE, - "vle64ff.v": VECTOR_LOAD_TEMPLATE, - "vse8.v": VECTOR_STORE_TEMPLATE, - "vse16.v": VECTOR_STORE_TEMPLATE, - "vse32.v": VECTOR_STORE_TEMPLATE, - "vse64.v": VECTOR_STORE_TEMPLATE, - "vsm.v": VECTOR_STORE_TEMPLATE, - "vse8ff.v": VECTOR_STORE_TEMPLATE, - "vse16ff.v": VECTOR_STORE_TEMPLATE, - "vse32ff.v": VECTOR_STORE_TEMPLATE, - "vse64ff.v": VECTOR_STORE_TEMPLATE, - - "vlse8.v": VECTOR_STRIDE_LOAD_TEMPLATE, - "vlse16.v": VECTOR_STRIDE_LOAD_TEMPLATE, - "vlse32.v": VECTOR_STRIDE_LOAD_TEMPLATE, - "vlse64.v": VECTOR_STRIDE_LOAD_TEMPLATE, - "vsse8.v": VECTOR_STRIDE_STORE_TEMPLATE, - "vsse16.v": VECTOR_STRIDE_STORE_TEMPLATE, - "vsse32.v": VECTOR_STRIDE_STORE_TEMPLATE, - "vsse64.v": VECTOR_STRIDE_STORE_TEMPLATE, - - "vluxei8.v": VECTOR_STRIDE_LOAD_TEMPLATE, - "vluxei16.v": VECTOR_STRIDE_LOAD_TEMPLATE, - "vluxei32.v": VECTOR_STRIDE_LOAD_TEMPLATE, - "vluxei64.v": 
VECTOR_STRIDE_LOAD_TEMPLATE, - - "vloxei8.v": VECTOR_STRIDE_LOAD_TEMPLATE, - "vloxei16.v": VECTOR_STRIDE_LOAD_TEMPLATE, - "vloxei32.v": VECTOR_STRIDE_LOAD_TEMPLATE, - "vloxei64.v": VECTOR_STRIDE_LOAD_TEMPLATE, - - "vsuxei8.v": VECTOR_STRIDE_STORE_TEMPLATE, - "vsuxei16.v": VECTOR_STRIDE_STORE_TEMPLATE, - "vsuxei32.v": VECTOR_STRIDE_STORE_TEMPLATE, - "vsuxei64.v": VECTOR_STRIDE_STORE_TEMPLATE, - - "vsoxei8.v": VECTOR_STRIDE_STORE_TEMPLATE, - "vsoxei16.v": VECTOR_STRIDE_STORE_TEMPLATE, - "vsoxei32.v": VECTOR_STRIDE_STORE_TEMPLATE, - "vsoxei64.v": VECTOR_STRIDE_STORE_TEMPLATE, - - "vl1re8.v": VECTOR_LOAD_TEMPLATE, - "vl1re16.v": VECTOR_LOAD_TEMPLATE, - "vl1re32.v": VECTOR_LOAD_TEMPLATE, - "vl1re64.v": VECTOR_LOAD_TEMPLATE, - - "vl2re8.v": VECTOR_LOAD_TEMPLATE, - "vl2re16.v": VECTOR_LOAD_TEMPLATE, - "vl2re32.v": VECTOR_LOAD_TEMPLATE, - "vl2re64.v": VECTOR_LOAD_TEMPLATE, - - "vl4re8.v": VECTOR_LOAD_TEMPLATE, - "vl4re16.v": VECTOR_LOAD_TEMPLATE, - "vl4re32.v": VECTOR_LOAD_TEMPLATE, - "vl4re64.v": VECTOR_LOAD_TEMPLATE, - - "vl8re8.v": VECTOR_LOAD_TEMPLATE, - "vl8re16.v": VECTOR_LOAD_TEMPLATE, - "vl8re32.v": VECTOR_LOAD_TEMPLATE, - "vl8re64.v": VECTOR_LOAD_TEMPLATE, - - "vs1re8.v": VECTOR_STORE_TEMPLATE, - "vs1re16.v": VECTOR_STORE_TEMPLATE, - "vs1re32.v": VECTOR_STORE_TEMPLATE, - "vs1re64.v": VECTOR_STORE_TEMPLATE, - - "vs2re8.v": VECTOR_STORE_TEMPLATE, - "vs2re16.v": VECTOR_STORE_TEMPLATE, - "vs2re32.v": VECTOR_STORE_TEMPLATE, - "vs2re64.v": VECTOR_STORE_TEMPLATE, - - "vs4re8.v": VECTOR_STORE_TEMPLATE, - "vs4re16.v": VECTOR_STORE_TEMPLATE, - "vs4re32.v": VECTOR_STORE_TEMPLATE, - "vs4re64.v": VECTOR_STORE_TEMPLATE, - - "vs8re8.v": VECTOR_STORE_TEMPLATE, - "vs8re16.v": VECTOR_STORE_TEMPLATE, - "vs8re32.v": VECTOR_STORE_TEMPLATE, - "vs8re64.v": VECTOR_STORE_TEMPLATE, - - "vl1r.v": VECTOR_LOAD_TEMPLATE, - "vl2r.v": VECTOR_LOAD_TEMPLATE, - "vl4r.v": VECTOR_LOAD_TEMPLATE, - "vl8r.v": VECTOR_LOAD_TEMPLATE, - - "vs1r.v": VECTOR_STORE_TEMPLATE, - "vs2r.v": VECTOR_STORE_TEMPLATE, - 
"vs4r.v": VECTOR_STORE_TEMPLATE, - "vs8r.v": VECTOR_STORE_TEMPLATE, - - "vsetvli": R2_TEMPLATE, - "vsetivli": RI_TEMPLATE, - "vmv" : R2_TEMPLATE, - - # For arithmetic vector instuction - ".vv": VECTOR_VV_TEMPLATE, - ".vx": VECTOR_VX_TEMPLATE, - ".vs": VECTOR_VX_TEMPLATE, - ".vi": VECTOR_VI_TEMPLATE, - ".vf": VECTOR_VF_TEMPLATE, - ".vv": VECTOR_VV_TEMPLATE, - - ".wv": VECTOR_WV_TEMPLATE, - ".wx": VECTOR_WX_TEMPLATE, - - ".v.v": R2_TEMPLATE, - ".v.x": R2_TEMPLATE, - ".v.f": R2_TEMPLATE, - ".v.i": RI_TEMPLATE, - - ".x.s": R2_TEMPLATE, - ".s.x": R2_TEMPLATE, - ".f.s": R2_TEMPLATE, - ".s.f": R2_TEMPLATE -} - -SUPPORTED_CUSTOM_INSTRUCTION = { - "custom_mvin": [43, 2, 4], - "custom_mvout": [43, 3, 4] -} - -SUPPORTED_INSTRUCTION = [ - R32_INSTUCTION_TEMPLATE, - R64_INSTRUCTION_TEMPLATE, - PSEUDO_INSTRUCTION_TEMPLATE, - CSR_INSTURCTION_TEMPLATE, - MUL_INSTRUCTION_TEMPLATE, - ATOMIC_INSTRUCTION_TEMPLATE, - FLOATING_INSTRUCTION_TEMPLATE, - VECTOR_INSTRUCTION_TEMPLATE -] - -ATTRIBUTE_LIST = [ - ".text", ".data", ".rodata", ".bss", ".comm", "common", ".section", ".option", - ".file", ".ident", ".size", ".type", ".globl", ".global", ".local", ".equ", ".align", ".balign", - ".p2align", ".2byte", ".4byte", ".8byte", ".half", ".word", ".dword", ".byte", ".asciz", - ".string", ".incbin", ".zero", ".attribute" -] - -BRANCHES = ["beq", "bne", "blt", "bge", "bltu", "bgeu", - "beqz", "bnez", "blez", "bgez", "bltz", "bgtz", - "bgt", "ble", "bgtu", "bleu", - "j",] - -UNCONDITIONAL_JUMP = ["j", "ret"] - -DRAM_LOAD = ["custom_mvin"] -DRAM_STORE = ["custom_mvout"] - -SRAM_LOAD = [] -SRAM_STORE = [] - -M5OPS_RESET_STAT = "\t.insn r CUSTOM_3, 0, 0x40, x0, x0, x0\n" -M5OPS_DUMP_STAT = "\t.insn r CUSTOM_3, 0, 0x41, x0, x0, x0\n" - -class rv_operand: - def __init__(self, op_type, value) -> None: - self.type = op_type & TYPE_MASK - self.dest = op_type & DEST - self.value = value - self.offset = 0 - - if op_type & MEM: - self.value = value[1:-1] - elif op_type & OFFSET: - pos = value.find("(") - 
self.offset = int(value[0:pos]) - self.value = value[pos+1:-1] - else: - self.value = value - - def is_imm(self): - return self.type == IMMEDIATE - - def is_reg(self): - return self.type == REGISTER - - def is_source(self): - return not self.dest - - def is_destination(self): - return self.dest - - def type_to_str(self): - if self.type == REGISTER: - return "register" - elif self.type == SPECIAL: - return "special" - elif self.type == IMMEDIATE: - return "immediate" - elif self.type == LABEL: - return "label" - return "undefined" - - def __str__(self) -> str: - value = f"Type:{self.type_to_str()}, Value:{self.value}" - return value - -class rv_instruction: - def __init__(self, label, assembly_code:str, is_attribute=False): - self.label = label - self.asm = assembly_code - self.raw_asm = assembly_code - target_list = self.split_assembly(assembly_code) - self.opcode = target_list[0] - self.operands= [] - self.basic_block = None - self.user_insts = [] - self.src_insts = [] - self.is_attribute = is_attribute - - if is_attribute: - return - - for inst_list in SUPPORTED_INSTRUCTION: - if self.opcode not in inst_list: - continue - - if len(inst_list[self.opcode]) != len(target_list[1:]) and "vset" not in self.opcode: - print(f"[Warn] {self.opcode}'s template mismatch in '{assembly_code}'") - - for op_type, value in zip(inst_list[self.opcode], target_list[1:]): - self.operands.append(rv_operand(op_type, value)) - - return - - # For vector extension code - if self.opcode[0] == "v": - for category, template in VECTOR_INSTRUCTION_TEMPLATE.items(): - if category not in self.opcode: - continue - - if len(template) != len(target_list[1:]): - print(f"[Warn] {self.opcode}'s template mismatch in '{assembly_code}'") - - whole_register = 1 - if "2r" in self.opcode: - whole_register = 2 - if "4r" in self.opcode: - whole_register = 4 - if "8r" in self.opcode: - whole_register = 8 - - for op_type, value in zip(template, target_list[1:]): - self.operands.append(rv_operand(op_type, 
value)) - if (REGISTER == TYPE_MASK & op_type and whole_register > 1): - for i in range(1, whole_register): - next_reg = f"{value[0]}{int(value[1:])+i}" - self.operands.append(rv_operand(op_type, next_reg)) - return - - # For custom instruction - if self.opcode == ".insn" and target_list[1] == "r": - format = [int(imm) for imm in target_list[2:5]] - for custom_op, custom_format in SUPPORTED_CUSTOM_INSTRUCTION.items(): - if format == custom_format: - self.opcode = custom_op - self.asm = self.asm.replace(".insn r", custom_op) - for op_type, value in zip(CUSTOM_R_TEMPLATE, target_list[5:]): - self.operands.append(rv_operand(op_type, value)) - return - - print(f"[Warn] Unsupported instruction in '{assembly_code.strip().rstrip()}'") - - def connect_user_inst(self, user_inst): - self.user_insts.append(user_inst) - user_inst.src_insts.append(self) - - def clear_dependency(self): - self.user_insts = [] - self.src_insts = [] - - @classmethod - def split_assembly(cls, assembly_code:str): - target_list = assembly_code.strip().replace(",", " ") - comment_pos = target_list.find("#") - if comment_pos != -1: - target_list = target_list[0: comment_pos] - - target_list = target_list.split() - return target_list - - @classmethod - def is_label(cls, assembly_code:str): - target_list = cls.split_assembly(assembly_code) - - if len(target_list) == 1: - if target_list[0][-1] == ":": - return True - return False - - @classmethod - def is_comment(cls, assembly_code:str): - target_list = cls.split_assembly(assembly_code) - - if not len(target_list): - return True - return False - - @classmethod - def is_attribute(cls, assembly_code:str): - target_list = cls.split_assembly(assembly_code) - - if len(target_list) and target_list[0] in ATTRIBUTE_LIST: - return True - if target_list[0][:4] == ".cfi": - return True - - return False - - def __str__(self) -> str: - if self.label != "": - info = f"{self.label}:\n" - else: - info = "" - return f"{info}{self.raw_asm}" - -class loop: - def 
__init__(self, path) -> None: - self.loop_path = OrderedDict() - for idx, bb in enumerate(path): - self.loop_path[idx] = bb - - def __eq__(self, other) -> bool: - return set(self.loop_path.values()) == set(other.loop_path.values()) - - def __str__(self) -> str: - join_str = "->" - return join_str.join([str(bb.name) for bb in self.loop_path.values()]) - - def iter_insts(self): - return chain.from_iterable([iter(bb) for bb in self.loop_path.values()]) - - -NR_BLOCK = 0 -class basic_block: - def __init__(self, name=""): - self.inputs = [] - self.outputs = [] - self.visited = False - self.cycle_list = [] - self.prefix_node = None - self.suffix_node = None - - if name != "": - self.name = f"{name}" - else: - global NR_BLOCK - self.name = f"BasicBlock{NR_BLOCK}" - NR_BLOCK += 1 - self.insts = [] - - def connect(self, new_block): - self.outputs.append(new_block) - new_block.inputs.append(self) - - def add_inst(self, inst): - self.insts.append(inst) - inst.basic_block = self - - def to_onnx(self): - inputs = [i.name + "_output" for i in self.inputs] - outputs = [self.name + "_output"] - #asm = "\n" - lines = {} - asm = ([i.asm.strip().rstrip() for i in self.insts]) - - inst_list = asm - if len(asm) > 20: - inst_list = asm[:10] + ["..."] + asm[-10:] - - for idx, asm_line in enumerate(inst_list): - lines[f"inst{idx:02d}"] = asm_line - - onnx_node = onnx.helper.make_node(op_type=self.__class__.__name__, - inputs=inputs, - outputs=outputs, - bb_name = self.name, - **lines) - return onnx_node - - def dfs(self, start, path=[]): - if self.visited: - if self == start: - tmp_loop = loop(path) - for bb in path: - is_duplicated = any([tmp_loop == cycle for cycle in bb.cycle_list]) - if not is_duplicated: - bb.cycle_list.append(tmp_loop) - return - - self.visited = True - path.append(self) - for child in self.outputs: - child.dfs(start, path) - path.pop(-1) - self.visited = False - - def __iter__(self): - return iter(self.insts) - -class riscv_parser: - def __init__(self) -> None: - 
self.asm_list = [] - self.inst_list = [] - self.bb_list = [] - self.cycle_list = [] - self.loop_info ={} - self.load_tile_info = {} - self.store_tile_info = {} - - def load_file(self, name, loop_info={}, load_tile_info={}, store_tile_info={}): - with open(name) as file: - asm_lines = file.readlines() - - label = "" - for asm_line in asm_lines: - is_attribute = False - if rv_instruction.is_label(asm_line): - label = rv_instruction.split_assembly(asm_line)[0][:-1] - continue - - if rv_instruction.is_comment(asm_line): - continue - - if rv_instruction.is_attribute(asm_line): - is_attribute = True - - self.asm_list.append(rv_instruction(label, asm_line, is_attribute)) - label = "" - self.inst_list = [inst for inst in self.asm_list if not inst.is_attribute] - - # Load meta data (loop, memory access info) - self.loop_info = loop_info - self.load_tile_info = load_tile_info - self.store_tile_info = store_tile_info - - # Run default analysis pass - self.basic_block_analysis() - self.cycle_detect_analysis() - - def basic_block_analysis(self): - # Construct Basic Block - bb = basic_block() - self.bb_list.append(bb) - for inst in self.inst_list: - if inst.label != "": - bb = basic_block(inst.label) - self.bb_list.append(bb) - bb.add_inst(inst) - - if inst.opcode in BRANCHES: - bb = basic_block() - self.bb_list.append(bb) - - # Trim empty Basic Block - self.bb_list = [bb for bb in self.bb_list if len(bb.insts)] - - # Link Basic Block - prev_inst = self.inst_list[0] - for inst in self.inst_list[1:]: - if prev_inst.basic_block != inst.basic_block and prev_inst.opcode not in UNCONDITIONAL_JUMP: - prev_inst.basic_block.connect(inst.basic_block) - - if inst.opcode in BRANCHES: - labels = [op.value for op in inst.operands if op.type & LABEL] - for label in labels: - for iter_bb in self.bb_list: - if iter_bb.name != label: - continue - inst.basic_block.connect(iter_bb) - - # Update prev inst - prev_inst = inst - - def dump_basic_block_graph(self, name): - # Dump to onnx model - 
onnx_node_list = [] - for bb in self.bb_list: - onnx_node_list.append(bb.to_onnx()) - - graph_def = onnx.helper.make_graph( - inputs=[],#load_tile_name1, load_tile_name2], - outputs=[],#store_tile_name], - nodes=onnx_node_list, - name="Dummy tile graph", - ) - model_def = onnx.helper.make_model(graph_def, producer_name="PyTorchSim") - model_def.opset_import[0].version = 13 - - onnx.save(model_def, name) - - def cycle_detect_analysis(self): - for bb in self.bb_list: - bb.dfs(bb, []) - for bb_cycle in bb.cycle_list: - is_duplicated = any([bb_cycle == cycle for cycle in self.cycle_list]) - if not is_duplicated: - self.cycle_list.append(bb_cycle) - - for cycle in self.cycle_list: - last_key = list(cycle.loop_path)[-1] - # Handle trampoline pattern ex) j label N - if len(cycle.loop_path[last_key].insts) == 1 and \ - cycle.loop_path[last_key].insts[0].opcode == "j": - del cycle.loop_path[last_key] - - def print_cycles(self): - for cycle in self.cycle_list: - print(f"Cycle-path: {cycle}") - - def dump_sampling_code(self, file): - for cycle in self.cycle_list: - bb_name_list = [bb.name for bb in cycle.loop_path.values()] - inst_list = list(cycle.iter_insts()) - - b_insts = [inst for inst in inst_list if inst.opcode in BRANCHES] - branch = b_insts[-1] - branch_label = branch.operands[-1].value - branch_idx = self.asm_list.index(branch) - - if branch_label in bb_name_list: - # Make it to nop for a escape - self.asm_list[branch_idx].raw_asm = "\tnop\n" - else: - # Make it to unconditional jump for a escape - self.asm_list[branch_idx].raw_asm = f"\tj\t{branch_label}\n" - - for bb in self.cycle_list[0].loop_path.values(): - first_inst = bb.insts[0] - last_inst = bb.insts[-1] - - first_idx = self.asm_list.index(first_inst) - - self.asm_list.pop(first_idx) - if first_inst.label != "": - label = first_inst.label - else: - label = "" - reset_insn = rv_instruction(label, M5OPS_RESET_STAT, True) - changed_insn = rv_instruction("", first_inst.raw_asm, True) - new_inst = [reset_insn, 
changed_insn] - - # Update with sampling instruction - self.asm_list[first_idx:first_idx] = new_inst - - last_idx = self.asm_list.index(last_inst) - dump_isn = rv_instruction("", M5OPS_DUMP_STAT, True) - self.asm_list[last_idx:last_idx] = [dump_isn] - - with open(file, "w") as f: - lines = [str(inst) for inst in self.asm_list] - f.writelines(lines) - - - def cycle_analysis(self, *args, **kwargs): - loop_info_list = list(self.loop_info.items())#[::-1] - if len(loop_info_list) != len(self.cycle_list): - print("[Error] Generated code and loop information are not matched...") - exit(1) - - for idx, (cycle, info) in enumerate(zip(self.cycle_list, loop_info_list)): - bb_keys = list(cycle.loop_path) - first_key, last_key = bb_keys[0], bb_keys[-1] - - cycle.loop_path[first_key].prefix_node = loop_index_node(info[0], info[1], node_id=idx) - cycle.loop_path[last_key].suffix_node = loop_end_node(info[0], node_id=idx) - - - for cycle, info in zip(self.cycle_list[:1], loop_info_list[:1]): - # Construct rough instruction dependency - scoreboard = {} - for inst in cycle.iter_insts(): - for op in inst.operands[::-1]: - if op.is_destination() and op.is_reg() and op.value != "zero": - scoreboard[op.value] = inst - elif op.is_reg() and op.value in scoreboard: - scoreboard[op.value].connect_user_inst(inst) - - # Cycle analysis phase start - self.generate_tile_graph(cycle, info, *args, **kwargs) - - # Clear instruction dependency - for inst in cycle.iter_insts(): - inst.clear_dependency() - - def generate_tile_graph(self, cycle, info, name="tile_graph", cycle_list=list): - load_nodes = [] - store_nodes = [] - compute_nodes = [] - inst_to_node = {} - - start_node = [] - last_node = [] - index_node = [] - end_node = [] - # For keep topological order - all_nodes = [] - - for bb in cycle.loop_path.values(): - if bb.prefix_node is not None: - for ln in last_node: - connect_nodes(ln, bb.prefix_node) - last_node = [bb.prefix_node] - index_node.append(bb.prefix_node) - 
all_nodes.append(bb.prefix_node) - - local_load_nodes = [] - local_store_nodes = [] - - # Create compute node for basic block - bb_compute_node = compute_node([], cycle_list.pop(0), len(compute_nodes)) - compute_nodes.append(bb_compute_node) - all_nodes.append(bb_compute_node) - - for inst in bb.insts: - if inst.opcode in DRAM_LOAD: - tmp_node = load_node(self.load_tile_info[f"load{len(load_nodes)}"], [inst.asm], len(load_nodes)+len(local_load_nodes)) - local_load_nodes.append(tmp_node) - all_nodes.append(tmp_node) - inst_to_node[inst] = tmp_node - connect_nodes(tmp_node, bb_compute_node) - elif inst.opcode in DRAM_STORE: - tmp_node = store_node(self.store_tile_info[f"store{len(store_nodes)}"], [inst.asm], len(store_nodes)+len(local_store_nodes)) - local_store_nodes.append(tmp_node) - all_nodes.append(tmp_node) - inst_to_node[inst] = tmp_node - connect_nodes(bb_compute_node, tmp_node) - elif inst.opcode in SRAM_LOAD or inst.opcode[:2] == "vl": - bb_compute_node.inst.append(inst.asm) - inst_to_node[inst] = bb_compute_node - else: - bb_compute_node.inst.append(inst.asm) - inst_to_node[inst] = bb_compute_node - - if len(local_load_nodes): - start_node = local_load_nodes - else: - start_node = [bb_compute_node] - - # Link it! - for sn in start_node: - for ln in last_node: - connect_nodes(ln, sn) - - if len(local_store_nodes): - last_node = local_store_nodes - else: - last_node = [bb_compute_node] - - if bb.suffix_node is not None: - for ln in last_node: - connect_nodes(ln, bb.suffix_node) - last_node = [bb.suffix_node] - end_node.append(bb.suffix_node) - all_nodes.append(bb.suffix_node) - - # Update to global list - load_nodes += local_load_nodes - store_nodes += local_store_nodes - - # NOTE. Since current custom_mvin instruciton has no dependency between following vload instruction. 
- # So, Make dependency forcefully - # Topological sort - graph = {node:node.get_parent() for node in all_nodes} - sorted_list = [] - while (len(graph)): - for node, parents in graph.items(): - if len(parents): - continue - for child in node.get_child(): - graph[child].pop(graph[child].index(node)) - sorted_list.append(node) - graph.pop(node) - break - - onnx_node_list = [node.to_onnx() for node in sorted_list] - if onnx_node_list: - dump_onnx_graph(f"{name}.onnx", onnx_node_list) - return index_node, end_node, onnx_node_list - -if __name__ == "__main__": - # For Test! - parser = riscv_parser() - parser.load_file("vectoradd.s") - - parser.dump_sampling_code("test.s") - parser.dump_basic_block_graph("basic_block.onnx") - parser.print_cycles() - parser.cycle_analysis() \ No newline at end of file diff --git a/GemminiLowerPass/CMakeLists.txt b/GemminiLowerPass/CMakeLists.txt deleted file mode 100644 index 1cc12153..00000000 --- a/GemminiLowerPass/CMakeLists.txt +++ /dev/null @@ -1,38 +0,0 @@ -cmake_minimum_required(VERSION 3.20) -project(LowerGemminiPass) - -#=============================================================================== -# 1. LOAD LLVM CONFIGURATION -#=============================================================================== -# Set this to a valid LLVM installation dir -set(LT_LLVM_INSTALL_DIR "" CACHE PATH "LLVM installation directory") - -# Add the location of LLVMConfig.cmake to CMake search paths (so that -# find_package can locate it) -list(APPEND CMAKE_MODULE_PATH ${LLVM_DIR}) - -find_package(LLVM REQUIRED CONFIG) -message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") -message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") - -#=============================================================================== -# 2. 
LLVM-TUTOR BUILD CONFIGURATION -#=============================================================================== -# Use the same C++ standard as LLVM does -set(CMAKE_CXX_STANDARD 17 CACHE STRING "") - -# LLVM is normally built without RTTI. Be consistent with that. -if(NOT LLVM_ENABLE_RTTI) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti") -endif() - -#=============================================================================== -# 3. ADD THE TARGET -#=============================================================================== -add_library(LowerGemminiPass SHARED LowerGemminiPass.cpp) - -# Allow undefined symbols in shared objects on Darwin (this is the default -# behaviour on Linux) -target_link_libraries(LowerGemminiPass - "$<$:-undefined dynamic_lookup>") -include_directories($ENV{TORCHSIM_LLVM_INCLUDE_PATH}) \ No newline at end of file diff --git a/GemminiLowerPass/LowerGemminiPass.cpp b/GemminiLowerPass/LowerGemminiPass.cpp deleted file mode 100644 index 332db980..00000000 --- a/GemminiLowerPass/LowerGemminiPass.cpp +++ /dev/null @@ -1,2566 +0,0 @@ -//===- LowerGemmini.cpp - Lower matrix intrinsics -----*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Lower matrix intrinsics to vector operations. -// -// TODO: -// * Improve fusion: -// * Support more cases, e.g. multiply-add, multiply-sub, operands/results -// transposed. -// * Improve cost-modeling, e.g. choose different number of rows/columns -// columns for tiles, consider cost of copies on alias. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/DomTreeUpdater.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/VectorUtils.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/MatrixBuilder.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/IR/InlineAsm.h" -#include "llvm/InitializePasses.h" -#include "llvm/Pass.h" -#include "llvm/Support/Alignment.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/MatrixUtils.h" -#include "llvm/Passes/PassBuilder.h" -#include "llvm/Passes/PassPlugin.h" - -#include - -#include "LowerGemminiPass.h" - -using namespace llvm; -using namespace PatternMatch; - -#define DEBUG_TYPE "lower-matrix-intrinsics" - -static cl::opt - FuseMatrix("gemmini-fuse-matrix", cl::init(true), cl::Hidden, - cl::desc("Enable/disable fusing matrix instructions.")); -// TODO: Allow and use non-square tiles. 
-static cl::opt TileSize( - "gemmini-fuse-matrix-tile-size", cl::init(4), cl::Hidden, - cl::desc( - "Tile size for matrix instruction fusion using square-shaped tiles.")); -static cl::opt TileUseLoops("gemmini-fuse-matrix-use-loops", cl::init(false), - cl::Hidden, - cl::desc("Generate loop nest for tiling.")); -static cl::opt ForceFusion( - "gemmini-force-fuse-matrix", cl::init(false), cl::Hidden, - cl::desc("Force matrix instruction fusion even if not profitable.")); -static cl::opt AllowContractEnabled( - "gemmini-matrix-allow-contract", cl::init(false), cl::Hidden, - cl::desc("Allow the use of FMAs if available and profitable. This may " - "result in different results, due to less rounding error.")); - -enum class MatrixLayoutTy { ColumnMajor, RowMajor }; - -static cl::opt MatrixLayout( - "gemmini-matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor), - cl::desc("Sets the default matrix layout"), - cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major", - "Use column-major layout"), - clEnumValN(MatrixLayoutTy::RowMajor, "row-major", - "Use row-major layout"))); - -/// Helper function to either return Scope, if it is a subprogram or the -/// attached subprogram for a local scope. -static DISubprogram *getSubprogram(DIScope *Scope) { - if (auto *Subprogram = dyn_cast(Scope)) - return Subprogram; - return cast(Scope)->getSubprogram(); -} - -/// Erase \p V from \p BB and move \II forward to avoid invalidating -/// iterators. -static void eraseFromParentAndMove(Value *V, BasicBlock::reverse_iterator &II, - BasicBlock &BB) { - auto *Inst = cast(V); - // Still used, don't erase. - if (!Inst->use_empty()) - return; - if (II != BB.rend() && Inst == &*II) - ++II; - Inst->eraseFromParent(); -} - -/// Return true if V is a splat of a value (which is used when multiplying a -/// matrix with a scalar). 
-static bool isSplat(Value *V) { - if (auto *SV = dyn_cast(V)) - return SV->isZeroEltSplat(); - return false; -} - -/// Match any mul operation (fp or integer). -template -auto m_AnyMul(const LTy &L, const RTy &R) { - return m_CombineOr(m_Mul(L, R), m_FMul(L, R)); -} - -/// Match any add operation (fp or integer). -template -auto m_AnyAdd(const LTy &L, const RTy &R) { - return m_CombineOr(m_Add(L, R), m_FAdd(L, R)); -} - -namespace { - -// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute -// the start address of vector \p VecIdx with type (\p EltType x \p NumElements) -// assuming \p Stride elements between start two consecutive vectors. -// \p Stride must be >= \p NumElements. -// For column-major matrixes, the function computes the address of a column -// vectors and \p NumElements must be set to the number of elements in a column -// (= number of rows of the matrix). For row-major matrixes, the function -// computes the address of a row vector and \p NumElements must be set to the -// number of elements in a column (= number of columns of the matrix). -// -// Consider a 4x4 matrix in column-mjaor layout like below -// -// 0 1 2 3 -// 0 v_0_0 v_0_1 v_0_2 v_0_3 -// 1 v_1_0 v_1_1 v_1_2 v_1_3 -// 2 v_2_0 v_2_1 v_2_2 v_2_3 -// 3 v_3_0 v_3_1 v_3_2 v_3_3 - -// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1, -// we need a pointer to the first element of the submatrix as base pointer. -// Then we can use computeVectorAddr to compute the addresses for the columns -// of the sub-matrix. -// -// Column 0: computeVectorAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..) -// -> just returns Base -// Column 1: computeVectorAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..) -// -> returns Base + (1 * 4) -// Column 2: computeVectorAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..) 
-// -> returns Base + (2 * 4) -// -// The graphic below illustrates the number of elements in a column (marked -// with |) and the number of skipped elements (marked with }). -// -// v_0_0 v_0_1 {v_0_2 {v_0_3 -// Base Col 1 Col 2 -// | | | -// v_1_0 |v_1_1 |v_1_2 |v_1_3 -// v_2_0 |v_2_1 |v_2_2 |v_2_3 -// v_3_0 {v_3_1 {v_3_2 v_3_3 -// -Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride, - unsigned NumElements, Type *EltType, - IRBuilder<> &Builder) { - - assert((!isa(Stride) || - cast(Stride)->getZExtValue() >= NumElements) && - "Stride must be >= the number of elements in the result vector."); - unsigned AS = cast(BasePtr->getType())->getAddressSpace(); - - // Compute the start of the vector with index VecIdx as VecIdx * Stride. - Value *VecStart = Builder.CreateMul(VecIdx, Stride, "vec.start"); - - // Get pointer to the start of the selected vector. Skip GEP creation, - // if we select vector 0. - if (isa(VecStart) && cast(VecStart)->isZero()) - VecStart = BasePtr; - else - VecStart = Builder.CreateGEP(EltType, BasePtr, VecStart, "vec.gep"); - - // Cast elementwise vector start pointer to a pointer to a vector - // (EltType x NumElements)*. - auto *VecType = FixedVectorType::get(EltType, NumElements); - Type *VecPtrType = PointerType::get(VecType, AS); - return Builder.CreatePointerCast(VecStart, VecPtrType, "vec.cast"); -} - -/// LowerGemmini contains the methods used to lower matrix intrinsics. -/// -/// Currently, the lowering for each matrix intrinsic is done as follows: -/// 1. Propagate the shape information from intrinsics to connected -/// instructions. -/// 2. Lower instructions with shape information (assuming column-major layout). -/// The lowering works similarly using row-major layout. -/// 2.1. Get column vectors for each argument. If we already lowered the -/// definition of an argument, use the produced column vectors directly. 
-/// If not, split the operand vector containing an embedded matrix into -/// a set of column vectors, -/// 2.2. Lower the instruction in terms of column major operations, which -/// yields a set of column vectors containing result matrix. Note that we -/// lower all instructions that have shape information. Besides the -/// intrinsics, this includes stores for example. -/// 2.3. Update uses of the lowered instruction. If we have shape information -/// for a user, there is nothing to do, as we will look up the result -/// column matrix when lowering the user. For other uses, we embed the -/// result matrix in a flat vector and update the use. -/// 2.4. Cache the result column matrix for the instruction we lowered -/// 3. After we lowered all instructions in a function, remove the now -/// obsolete instructions. -/// -class LowerGemmini { - Function &Func; - const DataLayout &DL; - const TargetTransformInfo &TTI; - AliasAnalysis *AA; - DominatorTree *DT; - LoopInfo *LI; - OptimizationRemarkEmitter *ORE; - - /// Contains estimates of the number of operations (loads, stores, compute) required to lower a matrix operation. - struct OpInfoTy { - /// Number of stores emitted to generate this matrix. - unsigned NumStores = 0; - /// Number of loads emitted to generate this matrix. - unsigned NumLoads = 0; - /// Number of compute operations emitted to generate this matrix. - unsigned NumComputeOps = 0; - /// Most of the time transposes can be fused with matrix multiplies or can - /// be folded away via algebraic simplifications. This is the number of - /// transposes that we failed to make "free" via such optimizations. 
- unsigned NumExposedTransposes = 0; - - OpInfoTy &operator+=(const OpInfoTy &RHS) { - NumStores += RHS.NumStores; - NumLoads += RHS.NumLoads; - NumComputeOps += RHS.NumComputeOps; - NumExposedTransposes += RHS.NumExposedTransposes; - return *this; - } - }; - - /// Wrapper class representing a matrix as a set of vectors, either in row or - /// column major layout. All vectors must have the same vector type. - class MatrixTy { - SmallVector Vectors; - - OpInfoTy OpInfo; - - bool IsColumnMajor = true; - - public: - MatrixTy() : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} - MatrixTy(ArrayRef Vectors) - : Vectors(Vectors.begin(), Vectors.end()), - IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} - MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy) - : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) { - - unsigned D = isColumnMajor() ? NumColumns : NumRows; - for (unsigned J = 0; J < D; ++J) - addVector(UndefValue::get(FixedVectorType::get( - EltTy, isColumnMajor() ? 
NumRows : NumColumns))); - } - - Value *getVector(unsigned i) const { return Vectors[i]; } - Value *getColumn(unsigned i) const { - assert(isColumnMajor() && "only supported for column-major matrixes"); - return Vectors[i]; - } - Value *getRow(unsigned i) const { - assert(!isColumnMajor() && "only supported for row-major matrixes"); - return Vectors[i]; - } - - void setVector(unsigned i, Value *V) { Vectors[i] = V; } - - Type *getElementType() const { return getVectorTy()->getElementType(); } - - unsigned getNumVectors() const { - if (isColumnMajor()) - return getNumColumns(); - return getNumRows(); - } - - unsigned getNumColumns() const { - if (isColumnMajor()) - return Vectors.size(); - else { - assert(Vectors.size() > 0 && "Cannot call getNumRows without columns"); - return cast(Vectors[0]->getType())->getNumElements(); - } - } - unsigned getNumRows() const { - if (isColumnMajor()) { - assert(Vectors.size() > 0 && "Cannot call getNumRows without columns"); - return cast(Vectors[0]->getType())->getNumElements(); - } else - return Vectors.size(); - } - - void addVector(Value *V) { Vectors.push_back(V); } - VectorType *getColumnTy() { - assert(isColumnMajor() && "only supported for column-major matrixes"); - return getVectorTy(); - } - - VectorType *getVectorTy() const { - return cast(Vectors[0]->getType()); - } - - iterator_range::iterator> columns() { - assert(isColumnMajor() && - "columns() only supported for column-major matrixes"); - return make_range(Vectors.begin(), Vectors.end()); - } - - iterator_range::iterator> vectors() { - return make_range(Vectors.begin(), Vectors.end()); - } - - /// Embed the vectors of the matrix into a flat vector by concatenating - /// them. - Value *embedInVector(IRBuilder<> &Builder) const { - return Vectors.size() == 1 ? 
Vectors[0] - : concatenateVectors(Builder, Vectors); - } - - MatrixTy &addNumLoads(unsigned N) { - OpInfo.NumLoads += N; - return *this; - } - - void setNumLoads(unsigned N) { OpInfo.NumLoads = N; } - - MatrixTy &addNumStores(unsigned N) { - OpInfo.NumStores += N; - return *this; - } - - MatrixTy &addNumExposedTransposes(unsigned N) { - OpInfo.NumExposedTransposes += N; - return *this; - } - - MatrixTy &addNumComputeOps(unsigned N) { - OpInfo.NumComputeOps += N; - return *this; - } - - unsigned getNumStores() const { return OpInfo.NumStores; } - unsigned getNumLoads() const { return OpInfo.NumLoads; } - unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; } - - const OpInfoTy &getOpInfo() const { return OpInfo; } - - bool isColumnMajor() const { return IsColumnMajor; } - - unsigned getStride() const { - if (isColumnMajor()) - return getNumRows(); - return getNumColumns(); - } - - /// Extract a vector of \p NumElts starting at index (\p I, \p J). If the - /// matrix is column-major, the result vector is extracted from a column - /// vector, otherwise from a row vector. - Value *extractVector(unsigned I, unsigned J, unsigned NumElts, - IRBuilder<> &Builder) const { - Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I); - assert(cast(Vec->getType())->getNumElements() >= - NumElts && - "Extracted vector will contain poison values"); - return Builder.CreateShuffleVector( - Vec, createSequentialMask(isColumnMajor() ? 
I : J, NumElts, 0), - "block"); - } - }; - - struct ShapeInfo { - unsigned NumRows; - unsigned NumColumns; - - bool IsColumnMajor; - - ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0) - : NumRows(NumRows), NumColumns(NumColumns), - IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {} - - ShapeInfo(Value *NumRows, Value *NumColumns) - : ShapeInfo(cast(NumRows)->getZExtValue(), - cast(NumColumns)->getZExtValue()) {} - - bool operator==(const ShapeInfo &other) { - return NumRows == other.NumRows && NumColumns == other.NumColumns; - } - bool operator!=(const ShapeInfo &other) { return !(*this == other); } - - /// Returns true if shape-information is defined, meaning both dimensions - /// are != 0. - operator bool() const { - assert(NumRows == 0 || NumColumns != 0); - return NumRows != 0; - } - - unsigned getStride() const { - if (IsColumnMajor) - return NumRows; - return NumColumns; - } - - unsigned getNumVectors() const { - if (IsColumnMajor) - return NumColumns; - return NumRows; - } - - /// Returns the transposed shape. - ShapeInfo t() const { return ShapeInfo(NumColumns, NumRows); } - }; - - /// Maps instructions to their shape information. The shape information - /// describes the shape to be used while lowering. This matches the shape of - /// the result value of the instruction, with the only exceptions being store - /// instructions and the matrix_column_major_store intrinsics. For those, the - /// shape information indicates that those instructions should be lowered - /// using shape information as well. A ValueMap is used so that when - /// sub-passes like optimizeTransposes performs RAUW the map stays - /// up-to-date. - ValueMap ShapeMap; - - /// List of instructions to remove. While lowering, we are not replacing all - /// users of a lowered instruction, if shape information is available and - /// those need to be removed after we finished lowering. - SmallVector ToRemove; - - /// Map from instructions to their produced column matrix. 
- MapVector Inst2ColumnMatrix; - -private: - static FastMathFlags getFastMathFlags(Instruction *Inst) { - FastMathFlags FMF; - - if (isa(*Inst)) - FMF = Inst->getFastMathFlags(); - - FMF.setAllowContract(AllowContractEnabled || FMF.allowContract()); - - return FMF; - } - -public: - LowerGemmini(Function &F, TargetTransformInfo &TTI, - AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI, - OptimizationRemarkEmitter *ORE) - : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT), - LI(LI), ORE(ORE) {} - - unsigned getNumOps(Type *VT) { - assert(isa(VT) && "Expected vector type"); - return getNumOps(VT->getScalarType(), - cast(VT)->getNumElements()); - } - - /// Return the estimated number of vector ops required for an operation on - /// \p VT * N. - unsigned getNumOps(Type *ST, unsigned N) { - return std::ceil((ST->getPrimitiveSizeInBits() * N).getFixedValue() / - double(TTI.getRegisterBitWidth( - TargetTransformInfo::RGK_FixedWidthVector) - .getFixedValue())); - } - - /// Return the set of vectors that a matrix value is lowered to. - /// - /// If we lowered \p MatrixVal, just return the cache result matrix. Otherwise - /// split the flat vector \p MatrixVal containing a matrix with shape \p SI - /// into vectors. - MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI, - IRBuilder<> &Builder) { - VectorType *VType = dyn_cast(MatrixVal->getType()); - assert(VType && "MatrixVal must be a vector type"); - assert(cast(VType)->getNumElements() == - SI.NumRows * SI.NumColumns && - "The vector size must match the number of matrix elements"); - - // Check if we lowered MatrixVal using shape information. In that case, - // return the existing matrix, if it matches the requested shape - // information. If there is a mis-match, embed the result in a flat - // vector and split it later. 
- auto Found = Inst2ColumnMatrix.find(MatrixVal); - if (Found != Inst2ColumnMatrix.end()) { - MatrixTy &M = Found->second; - // Return the found matrix, if its shape matches the requested shape - // information - if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns()) - return M; - - MatrixVal = M.embedInVector(Builder); - } - - // Otherwise split MatrixVal. - SmallVector SplitVecs; - for (unsigned MaskStart = 0; - MaskStart < cast(VType)->getNumElements(); - MaskStart += SI.getStride()) { - Value *V = Builder.CreateShuffleVector( - MatrixVal, createSequentialMask(MaskStart, SI.getStride(), 0), - "split"); - SplitVecs.push_back(V); - } - - return {SplitVecs}; - } - - /// If \p V already has a known shape return false. Otherwise set the shape - /// for instructions that support it. - bool setShapeInfo(Value *V, ShapeInfo Shape) { - assert(Shape && "Shape not set"); - if (isa(V) || !supportsShapeInfo(V)) - return false; - - auto SIter = ShapeMap.find(V); - if (SIter != ShapeMap.end()) { - LLVM_DEBUG(dbgs() << " not overriding existing shape: " - << SIter->second.NumRows << " " - << SIter->second.NumColumns << " for " << *V << "\n"); - return false; - } - - ShapeMap.insert({V, Shape}); - LLVM_DEBUG(dbgs() << " " << Shape.NumRows << " x " << Shape.NumColumns - << " for " << *V << "\n"); - return true; - } - - bool isUniformShape(Value *V) { - Instruction *I = dyn_cast(V); - if (!I) - return true; - - switch (I->getOpcode()) { - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: // Scalar multiply. - case Instruction::FNeg: - case Instruction::Add: - case Instruction::Mul: - case Instruction::Sub: - return true; - default: - return false; - } - } - - /// Returns true if shape information can be used for \p V. The supported - /// instructions must match the instructions that can be lowered by this pass. 
- bool supportsShapeInfo(Value *V) { - Instruction *Inst = dyn_cast(V); - if (!Inst) - return false; - - IntrinsicInst *II = dyn_cast(Inst); - if (II) - switch (II->getIntrinsicID()) { - case Intrinsic::matrix_multiply: - case Intrinsic::matrix_transpose: - case Intrinsic::matrix_column_major_load: - case Intrinsic::matrix_column_major_store: - return true; - default: - return false; - } - return isUniformShape(V) || isa(V) || isa(V); - } - - /// Propagate the shape information of instructions to their users. - /// The work list contains instructions for which we can compute the shape, - /// either based on the information provided by matrix intrinsics or known - /// shapes of operands. - SmallVector - propagateShapeForward(SmallVectorImpl &WorkList) { - SmallVector NewWorkList; - // Pop an element for which we guaranteed to have at least one of the - // operand shapes. Add the shape for this and then add users to the work - // list. - LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n"); - while (!WorkList.empty()) { - Instruction *Inst = WorkList.pop_back_val(); - - // New entry, set the value and insert operands - bool Propagate = false; - - Value *MatrixA; - Value *MatrixB; - Value *M; - Value *N; - Value *K; - if (match(Inst, m_Intrinsic( - m_Value(MatrixA), m_Value(MatrixB), m_Value(M), - m_Value(N), m_Value(K)))) { - Propagate = setShapeInfo(Inst, {M, K}); - } else if (match(Inst, m_Intrinsic( - m_Value(MatrixA), m_Value(M), m_Value(N)))) { - // Flip dimensions. 
- Propagate = setShapeInfo(Inst, {N, M}); - } else if (match(Inst, m_Intrinsic( - m_Value(MatrixA), m_Value(), m_Value(), - m_Value(), m_Value(M), m_Value(N)))) { - Propagate = setShapeInfo(Inst, {N, M}); - } else if (match(Inst, m_Intrinsic( - m_Value(), m_Value(), m_Value(), m_Value(M), - m_Value(N)))) { - Propagate = setShapeInfo(Inst, {M, N}); - } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) { - auto OpShape = ShapeMap.find(MatrixA); - if (OpShape != ShapeMap.end()) - setShapeInfo(Inst, OpShape->second); - continue; - } else if (isUniformShape(Inst)) { - // Find the first operand that has a known shape and use that. - for (auto &Op : Inst->operands()) { - auto OpShape = ShapeMap.find(Op.get()); - if (OpShape != ShapeMap.end()) { - Propagate |= setShapeInfo(Inst, OpShape->second); - break; - } - } - } - /**/ - if (Propagate) { - NewWorkList.push_back(Inst); - // for (auto *User : Inst->users()) - // if (ShapeMap.count(User) == 0) - // WorkList.push_back(cast(User)); - } - } - - return NewWorkList; - } - - /// Propagate the shape to operands of instructions with shape information. - /// \p Worklist contains the instruction for which we already know the shape. - SmallVector - propagateShapeBackward(SmallVectorImpl &WorkList) { - SmallVector NewWorkList; - - auto pushInstruction = [](Value *V, - SmallVectorImpl &WorkList) { - Instruction *I = dyn_cast(V); - if (I) - WorkList.push_back(I); - }; - // Pop an element with known shape. Traverse the operands, if their shape - // derives from the result shape and is unknown, add it and add them to the - // worklist. 
- LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n"); - while (!WorkList.empty()) { - Value *V = WorkList.pop_back_val(); - - size_t BeforeProcessingV = WorkList.size(); - if (!isa(V)) - continue; - - Value *MatrixA; - Value *MatrixB; - Value *M; - Value *N; - Value *K; - if (match(V, m_Intrinsic( - m_Value(MatrixA), m_Value(MatrixB), m_Value(M), - m_Value(N), m_Value(K)))) { - if (setShapeInfo(MatrixA, {M, N})) - pushInstruction(MatrixA, WorkList); - - if (setShapeInfo(MatrixB, {N, K})) - pushInstruction(MatrixB, WorkList); - - } else if (match(V, m_Intrinsic( - m_Value(MatrixA), m_Value(M), m_Value(N)))) { - // Flip dimensions. - if (setShapeInfo(MatrixA, {M, N})) - pushInstruction(MatrixA, WorkList); - } else if (match(V, m_Intrinsic( - m_Value(MatrixA), m_Value(), m_Value(), m_Value(), - m_Value(M), m_Value(N)))) { - if (setShapeInfo(MatrixA, {M, N})) { - pushInstruction(MatrixA, WorkList); - } - } else if (isa(V) || - match(V, m_Intrinsic())) { - // Nothing to do, no matrix input. - } else if (isa(V)) { - // Nothing to do. We forward-propagated to this so we would just - // backward propagate to an instruction with an already known shape. - } else if (isUniformShape(V)) { - // Propagate to all operands. - ShapeInfo Shape = ShapeMap[V]; - for (Use &U : cast(V)->operands()) { - if (setShapeInfo(U.get(), Shape)) - pushInstruction(U.get(), WorkList); - } - } - // After we discovered new shape info for new instructions in the - // worklist, we use their users as seeds for the next round of forward - // propagation. - for (size_t I = BeforeProcessingV; I != WorkList.size(); I++) - for (User *U : WorkList[I]->users()) - if (isa(U) && V != U) - NewWorkList.push_back(cast(U)); - } - return NewWorkList; - } - - /// (Op0 op Op1)^T -> Op0^T op Op1^T - /// Transpose \p Op0 and \p Op1 of shape \p Shape0 and \p Shape1, then use - /// them on both sides of \p Operation. 
- Instruction *distributeTransposes( - Value *Op0, ShapeInfo Shape0, Value *Op1, ShapeInfo Shape1, - MatrixBuilder &Builder, - function_ref - Operation) { - Value *T0 = Builder.CreateMatrixTranspose( - Op0, Shape0.NumRows, Shape0.NumColumns, Op0->getName() + "_t"); - // We are being run after shape prop, add shape for newly created - // instructions so that we lower them later. - setShapeInfo(T0, Shape0.t()); - Value *T1 = Builder.CreateMatrixTranspose( - Op1, Shape1.NumRows, Shape1.NumColumns, Op1->getName() + "_t"); - setShapeInfo(T1, Shape1.t()); - return Operation(T0, Shape0.t(), T1, Shape1.t()); - } - - void updateShapeAndReplaceAllUsesWith(Instruction &Old, Value *New) { - // We need to remove Old from the ShapeMap otherwise RAUW will replace it - // with New. We should only add New it it supportsShapeInfo so we insert - // it conditionally instead. - auto S = ShapeMap.find(&Old); - if (S != ShapeMap.end()) { - ShapeMap.erase(S); - if (supportsShapeInfo(New)) - ShapeMap.insert({New, S->second}); - } - Old.replaceAllUsesWith(New); - } - - /// Sink a top-level transpose inside matmuls and adds. - /// This creates and erases instructions as needed, and returns the newly - /// created instruction while updating the iterator to avoid invalidation. If - /// this returns nullptr, no new instruction was created. 
- Instruction *sinkTranspose(Instruction &I, BasicBlock::reverse_iterator &II) { - BasicBlock &BB = *I.getParent(); - IRBuilder<> IB(&I); - MatrixBuilder Builder(IB); - - Value *TA, *TAMA, *TAMB; - ConstantInt *R, *K, *C; - if (!match(&I, m_Intrinsic( - m_Value(TA), m_ConstantInt(R), m_ConstantInt(C)))) - return nullptr; - - // Transpose of a transpose is a nop - Value *TATA; - if (match(TA, m_Intrinsic(m_Value(TATA)))) { - updateShapeAndReplaceAllUsesWith(I, TATA); - eraseFromParentAndMove(&I, II, BB); - eraseFromParentAndMove(TA, II, BB); - return nullptr; - } - - // k^T -> k - if (isSplat(TA)) { - updateShapeAndReplaceAllUsesWith(I, TA); - eraseFromParentAndMove(&I, II, BB); - return nullptr; - } - - // (A * B)^t -> B^t * A^t - // RxK KxC CxK KxR - if (match(TA, m_Intrinsic( - m_Value(TAMA), m_Value(TAMB), m_ConstantInt(R), - m_ConstantInt(K), m_ConstantInt(C)))) { - auto NewInst = distributeTransposes( - TAMB, {K, C}, TAMA, {R, K}, Builder, - [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) { - return Builder.CreateMatrixMultiply(T0, T1, Shape0.NumRows, - Shape0.NumColumns, - Shape1.NumColumns, "mmul"); - }); - updateShapeAndReplaceAllUsesWith(I, NewInst); - eraseFromParentAndMove(&I, II, BB); - eraseFromParentAndMove(TA, II, BB); - return NewInst; - } - - // Same as above, but with a mul, which occurs when multiplied - // with a scalar. - // (A * k)^t -> A^t * k - // R x C RxC - if (match(TA, m_AnyMul(m_Value(TAMA), m_Value(TAMB))) && - (isSplat(TAMA) || isSplat(TAMB))) { - IRBuilder<> LocalBuilder(&I); - // We know that the transposed operand is of shape RxC. - // An when multiplied with a scalar, the shape is preserved. - auto NewInst = distributeTransposes( - TAMA, {R, C}, TAMB, {R, C}, Builder, - [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) { - bool IsFP = I.getType()->isFPOrFPVectorTy(); - auto *Mul = IsFP ? 
LocalBuilder.CreateFMul(T0, T1, "mmul") - : LocalBuilder.CreateMul(T0, T1, "mmul"); - auto *Result = cast(Mul); - setShapeInfo(Result, Shape0); - return Result; - }); - updateShapeAndReplaceAllUsesWith(I, NewInst); - eraseFromParentAndMove(&I, II, BB); - eraseFromParentAndMove(TA, II, BB); - return NewInst; - } - - // (A + B)^t -> A^t + B^t - // RxC RxC CxR CxR - if (match(TA, m_AnyAdd(m_Value(TAMA), m_Value(TAMB)))) { - IRBuilder<> LocalBuilder(&I); - auto NewInst = distributeTransposes( - TAMA, {R, C}, TAMB, {R, C}, Builder, - [&](Value *T0, ShapeInfo Shape0, Value *T1, ShapeInfo Shape1) { - bool IsFP = I.getType()->isFPOrFPVectorTy(); - auto *Add = IsFP ? LocalBuilder.CreateFAdd(T0, T1, "madd") - : LocalBuilder.CreateAdd(T0, T1, "madd"); - - auto *Result = cast(Add); - setShapeInfo(Result, Shape0); - return Result; - }); - updateShapeAndReplaceAllUsesWith(I, NewInst); - eraseFromParentAndMove(&I, II, BB); - eraseFromParentAndMove(TA, II, BB); - return NewInst; - } - - return nullptr; - } - - void liftTranspose(Instruction &I) { - // Erase dead Instructions after lifting transposes from binops. 
- auto CleanupBinOp = [](Instruction &T, Value *A, Value *B) { - if (T.use_empty()) - T.eraseFromParent(); - if (A->use_empty()) - cast(A)->eraseFromParent(); - if (A != B && B->use_empty()) - cast(B)->eraseFromParent(); - }; - - Value *A, *B, *AT, *BT; - ConstantInt *R, *K, *C; - // A^t * B ^t -> (B * A)^t - if (match(&I, m_Intrinsic( - m_Value(A), m_Value(B), m_ConstantInt(R), - m_ConstantInt(K), m_ConstantInt(C))) && - match(A, m_Intrinsic(m_Value(AT))) && - match(B, m_Intrinsic(m_Value((BT))))) { - IRBuilder<> IB(&I); - MatrixBuilder Builder(IB); - Value *M = Builder.CreateMatrixMultiply( - BT, AT, C->getZExtValue(), K->getZExtValue(), R->getZExtValue()); - setShapeInfo(M, {C, R}); - Instruction *NewInst = Builder.CreateMatrixTranspose(M, C->getZExtValue(), - R->getZExtValue()); - updateShapeAndReplaceAllUsesWith(I, NewInst); - CleanupBinOp(I, A, B); - } - // A^t + B ^t -> (A + B)^t - else if (match(&I, m_FAdd(m_Value(A), m_Value(B))) && - match(A, m_Intrinsic( - m_Value(AT), m_ConstantInt(R), m_ConstantInt(C))) && - match(B, m_Intrinsic( - m_Value(BT), m_ConstantInt(R), m_ConstantInt(C)))) { - IRBuilder<> Builder(&I); - Value *Add = cast(Builder.CreateFAdd(AT, BT, "mfadd")); - setShapeInfo(Add, {C, R}); - MatrixBuilder MBuilder(Builder); - Instruction *NewInst = MBuilder.CreateMatrixTranspose( - Add, C->getZExtValue(), R->getZExtValue(), "mfadd_t"); - updateShapeAndReplaceAllUsesWith(I, NewInst); - CleanupBinOp(I, A, B); - } - } - - /// Try moving transposes in order to fold them away or into multiplies. - void optimizeTransposes() { - // First sink all transposes inside matmuls and adds, hoping that we end up - // with NN, NT or TN variants. - for (BasicBlock &BB : reverse(Func)) { - for (auto II = BB.rbegin(); II != BB.rend();) { - Instruction &I = *II; - // We may remove II. By default continue on the next/prev instruction. 
- ++II; - if (Instruction *NewInst = sinkTranspose(I, II)) - II = std::next(BasicBlock::reverse_iterator(NewInst)); - } - } - - // If we have a TT matmul or a TT add, lift the transpose. We may be able - // to fold into consuming multiply or add. - for (BasicBlock &BB : Func) { - for (Instruction &I : llvm::make_early_inc_range(BB)) { - liftTranspose(I); - } - } - } - - bool Visit() { - SmallVector WorkList; - - // Initially only the shape of matrix intrinsics is known. - // Initialize the work list with ops carrying shape information. - for (BasicBlock &BB : Func) - for (Instruction &Inst : BB) { - IntrinsicInst *II = dyn_cast(&Inst); - if (!II) - continue; - - switch (II->getIntrinsicID()) { - case Intrinsic::matrix_multiply: - case Intrinsic::matrix_transpose: - case Intrinsic::matrix_column_major_load: - case Intrinsic::matrix_column_major_store: - WorkList.push_back(&Inst); - break; - default: - break; - } - } - - // Avoid unnecessary work if there are no matrix intrinsics in the function. - if (WorkList.empty()) - return false; - - // Propagate shapes until nothing changes any longer. - while (!WorkList.empty()) { - WorkList = propagateShapeForward(WorkList); - // WorkList = propagateShapeBackward(WorkList); - } - - optimizeTransposes(); - - bool Changed = false; - SmallVector MaybeFusableInsts; - SmallVector MatrixInsts; - - // First, collect all instructions with shape information and candidates for - // fusion (currently only matrix multiplies). - ReversePostOrderTraversal RPOT(&Func); - for (auto *BB : RPOT) - for (Instruction &I : *BB) { - if (ShapeMap.find(&I) == ShapeMap.end()) - continue; - if (match(&I, m_Intrinsic())) - MaybeFusableInsts.push_back(cast(&I)); - MatrixInsts.push_back(&I); - } - - // Second, try to lower any dot products - SmallPtrSet FusedInsts; - for (CallInst *CI : MaybeFusableInsts) - lowerDotProduct(CI, FusedInsts, getFastMathFlags(CI)); - - // Third, try to fuse candidates. 
- for (CallInst *CI : MaybeFusableInsts) - LowerMatrixMultiplyFused(CI, FusedInsts); - - Changed = !FusedInsts.empty(); - - // Fourth, lower remaining instructions with shape information. - for (Instruction *Inst : MatrixInsts) { - if (FusedInsts.count(Inst)) - continue; - - IRBuilder<> Builder(Inst); - - if (CallInst *CInst = dyn_cast(Inst)) - Changed |= VisitCallInst(CInst); - - Value *Op1; - Value *Op2; - if (auto *BinOp = dyn_cast(Inst)) - Changed |= VisitBinaryOperator(BinOp); - if (auto *UnOp = dyn_cast(Inst)) - Changed |= VisitUnaryOperator(UnOp); - if (match(Inst, m_Load(m_Value(Op1)))) - Changed |= VisitLoad(cast(Inst), Op1, Builder); - else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2)))) - Changed |= VisitStore(cast(Inst), Op1, Op2, Builder); - } - - if (ORE) { - RemarkGenerator RemarkGen(Inst2ColumnMatrix, *ORE, Func); - RemarkGen.emitRemarks(); - } - - // Delete the instructions backwards, as it has a reduced likelihood of - // having to update as many def-use and use-def chains. - // - // Because we add to ToRemove during fusion we can't guarantee that defs - // are before uses. Change uses to poison temporarily as these should get - // removed as well. - // - // For verification, we keep track of where we changed uses to poison in - // PoisonedInsts and then check that we in fact remove them. - SmallSet PoisonedInsts; - for (auto *Inst : reverse(ToRemove)) { - for (Use &U : llvm::make_early_inc_range(Inst->uses())) { - if (auto *Poisoned = dyn_cast(U.getUser())) - PoisonedInsts.insert(Poisoned); - U.set(PoisonValue::get(Inst->getType())); - } - Inst->eraseFromParent(); - PoisonedInsts.erase(Inst); - } - if (!PoisonedInsts.empty()) { - // If we didn't remove all poisoned instructions, it's a hard error. 
- dbgs() << "Poisoned but present instructions:\n"; - for (auto *I : PoisonedInsts) - dbgs() << *I << "\n"; - llvm_unreachable("Poisoned but instruction not removed"); - } - - return Changed; - } - - /// Turns \p BasePtr into an elementwise pointer to \p EltType. - Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) { - unsigned AS = cast(BasePtr->getType())->getAddressSpace(); - Type *EltPtrType = PointerType::get(EltType, AS); - return Builder.CreatePointerCast(BasePtr, EltPtrType); - } - - /// Replace intrinsic calls - bool VisitCallInst(CallInst *Inst) { - if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic()) - return false; - - switch (Inst->getCalledFunction()->getIntrinsicID()) { - case Intrinsic::matrix_multiply: - LowerMultiply(Inst); - break; - case Intrinsic::matrix_transpose: - LowerTranspose(Inst); - break; - case Intrinsic::matrix_column_major_load: - LowerColumnMajorLoad(Inst); - break; - case Intrinsic::matrix_column_major_store: - LowerColumnMajorStore(Inst); - break; - default: - return false; - } - return true; - } - - /// Compute the alignment for a column/row \p Idx with \p Stride between them. - /// The address at \p Idx == 0 has alignment \p A. If \p Stride is a - /// ConstantInt, reduce the initial alignment based on the byte offset. For - /// non-ConstantInt strides, return the common alignment of the initial - /// alignment and the element size in bytes. 
- Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy, - MaybeAlign A) const { - Align InitialAlign = DL.getValueOrABITypeAlignment(A, ElementTy); - if (Idx == 0) - return InitialAlign; - - TypeSize ElementSizeInBits = DL.getTypeSizeInBits(ElementTy); - if (auto *ConstStride = dyn_cast(Stride)) { - uint64_t StrideInBytes = - ConstStride->getZExtValue() * ElementSizeInBits / 8; - return commonAlignment(InitialAlign, Idx * StrideInBytes); - } - return commonAlignment(InitialAlign, ElementSizeInBits / 8); - } - - CallInst* gemmini_extended_mv(IRBuilder<> &Builder, bool is_mvin, Value* dram_addr, Value* spad_addr, uint64_t cols, uint64_t rows) { - StringRef asmString; - if (is_mvin) - asmString = ".insn r CUSTOM_1, " STR(k_MVIN) ", 4, x0, $0, $1"; - else - asmString = ".insn r CUSTOM_1, " STR(k_MVOUT) ", 4, x0, $0, $1"; - StringRef constraints = "r,r,~{dirflag},~{fpsr},~{flags}"; - SmallVector args; - SmallVector ty_args; - - /* Sanity Check */ - assert(~(rows & ~ROW_MASK || cols & ~COL_MASK)); - Value* arg2 = Builder.CreateOr(spad_addr, (uint64_t)(rows << (ADDR_LEN + COL_LEN)) | (cols << ADDR_LEN)); - args.push_back(dram_addr); - args.push_back(arg2); - ty_args.push_back(dram_addr->getType()); - ty_args.push_back(arg2->getType()); - FunctionType *AsmFty = FunctionType::get(Builder.getVoidTy(), ty_args, false); - InlineAsm *ia = InlineAsm::get(AsmFty, asmString, constraints, true); - CallInst *ptr = Builder.CreateCall(ia, args); - return ptr; - } - - /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between - /// vectors. 
- MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride, - bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) { - auto *VType = cast(Ty); - Type *EltTy = VType->getElementType(); - Type *VecTy = FixedVectorType::get(EltTy, Shape.getStride()); - Value *EltPtr = createElementPtr(Ptr, EltTy, Builder); - MatrixTy Result; - Value *dram_addr = Builder.CreatePtrToInt(EltPtr, Builder.getInt64Ty()); - gemmini_extended_mv(Builder, true, dram_addr, dram_addr, Shape.getStride(), Shape.getNumVectors()); - for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) { - Value *GEP = computeVectorAddr( - EltPtr, Builder.getIntN(Stride->getType()->getScalarSizeInBits(), I), - Stride, Shape.getStride(), EltTy, Builder); - Value *Vector = Builder.CreateAlignedLoad( - VecTy, GEP, getAlignForIndex(I, Stride, EltTy, MAlign), - IsVolatile, "col.load"); - - Result.addVector(Vector); - } - return Result.addNumLoads(getNumOps(Result.getVectorTy()) * - Result.getNumVectors()); - } - - /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix, - /// starting at \p MatrixPtr[I][J]. 
- MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile, - ShapeInfo MatrixShape, Value *I, Value *J, - ShapeInfo ResultShape, Type *EltTy, - IRBuilder<> &Builder) { - - Value *Offset = Builder.CreateAdd( - Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I); - - unsigned AS = cast(MatrixPtr->getType())->getAddressSpace(); - Value *EltPtr = - Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS)); - Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset); - auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows * - ResultShape.NumColumns); - Type *TilePtrTy = PointerType::get(TileTy, AS); - Value *TilePtr = - Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast"); - - return loadMatrix(TileTy, TilePtr, Align, - Builder.getInt64(MatrixShape.getStride()), IsVolatile, - ResultShape, Builder); - } - - /// Lower a load instruction with shape information. - void LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align, Value *Stride, - bool IsVolatile, ShapeInfo Shape) { - IRBuilder<> Builder(Inst); - finalizeLowering(Inst, - loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile, - Shape, Builder), - Builder); - } - - /// Lowers llvm.matrix.column.major.load. - /// - /// The intrinsic loads a matrix from memory using a stride between columns. - void LowerColumnMajorLoad(CallInst *Inst) { - assert(MatrixLayout == MatrixLayoutTy::ColumnMajor && - "Intrinsic only supports column-major layout!"); - Value *Ptr = Inst->getArgOperand(0); - Value *Stride = Inst->getArgOperand(1); - LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride, - cast(Inst->getArgOperand(2))->isOne(), - {Inst->getArgOperand(3), Inst->getArgOperand(4)}); - } - - /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p - /// MatrixPtr[I][J]. 
- void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr, - MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape, - Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) { - Value *Offset = Builder.CreateAdd( - Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I); - - unsigned AS = cast(MatrixPtr->getType())->getAddressSpace(); - Value *EltPtr = - Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS)); - Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset); - auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() * - StoreVal.getNumColumns()); - Type *TilePtrTy = PointerType::get(TileTy, AS); - Value *TilePtr = - Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast"); - - storeMatrix(TileTy, StoreVal, TilePtr, MAlign, - Builder.getInt64(MatrixShape.getStride()), IsVolatile, MatrixShape, Builder); - } - - /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between - /// vectors. - MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr, - MaybeAlign MAlign, Value *Stride, bool IsVolatile, - ShapeInfo Shape, IRBuilder<> &Builder) { - auto VType = cast(Ty); - Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); - Value *dram_addr = Builder.CreatePtrToInt(EltPtr, Builder.getInt64Ty()); - for (auto Vec : enumerate(StoreVal.vectors())) { - Value *GEP = computeVectorAddr( - EltPtr, - Builder.getIntN(Stride->getType()->getScalarSizeInBits(), - Vec.index()), - Stride, StoreVal.getStride(), VType->getElementType(), Builder); - Builder.CreateAlignedStore(Vec.value(), GEP, - getAlignForIndex(Vec.index(), Stride, - VType->getElementType(), - MAlign), - IsVolatile); - } - gemmini_extended_mv(Builder, false, dram_addr, dram_addr, Shape.getStride(), Shape.getNumVectors()); - return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) * - StoreVal.getNumVectors()); - } - - /// Lower a store instruction with shape information. 
- void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, MaybeAlign A, - Value *Stride, bool IsVolatile, ShapeInfo Shape) { - IRBuilder<> Builder(Inst); - auto StoreVal = getMatrix(Matrix, Shape, Builder); - finalizeLowering(Inst, - storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride, - IsVolatile, Shape, Builder), - Builder); - } - - /// Lowers llvm.matrix.column.major.store. - /// - /// The intrinsic store a matrix back memory using a stride between columns. - void LowerColumnMajorStore(CallInst *Inst) { - assert(MatrixLayout == MatrixLayoutTy::ColumnMajor && - "Intrinsic only supports column-major layout!"); - Value *Matrix = Inst->getArgOperand(0); - Value *Ptr = Inst->getArgOperand(1); - Value *Stride = Inst->getArgOperand(2); - LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride, - cast(Inst->getArgOperand(3))->isOne(), - {Inst->getArgOperand(4), Inst->getArgOperand(5)}); - } - - // Set elements I..I+NumElts-1 to Block - Value *insertVector(Value *Col, unsigned I, Value *Block, - IRBuilder<> &Builder) { - - // First, bring Block to the same size as Col - unsigned BlockNumElts = - cast(Block->getType())->getNumElements(); - unsigned NumElts = cast(Col->getType())->getNumElements(); - assert(NumElts >= BlockNumElts && "Too few elements for current block"); - - Block = Builder.CreateShuffleVector( - Block, createSequentialMask(0, BlockNumElts, NumElts - BlockNumElts)); - - // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7, - // 8, 4, 5, 6 - SmallVector Mask; - unsigned i; - for (i = 0; i < I; i++) - Mask.push_back(i); - - unsigned VecNumElts = - cast(Col->getType())->getNumElements(); - for (; i < I + BlockNumElts; i++) - Mask.push_back(i - I + VecNumElts); - - for (; i < VecNumElts; i++) - Mask.push_back(i); - - return Builder.CreateShuffleVector(Col, Block, Mask); - } - - Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp, - IRBuilder<> &Builder, bool AllowContraction, - unsigned &NumComputeOps) { - 
NumComputeOps += getNumOps(A->getType()); - if (!Sum) - return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B); - - if (UseFPOp) { - if (AllowContraction) { - // Use fmuladd for floating point operations and let the backend decide - // if that's profitable. - Function *FMulAdd = Intrinsic::getDeclaration( - Func.getParent(), Intrinsic::fmuladd, A->getType()); - return Builder.CreateCall(FMulAdd, {A, B, Sum}); - } - NumComputeOps += getNumOps(A->getType()); - Value *Mul = Builder.CreateFMul(A, B); - return Builder.CreateFAdd(Sum, Mul); - } - - NumComputeOps += getNumOps(A->getType()); - Value *Mul = Builder.CreateMul(A, B); - return Builder.CreateAdd(Sum, Mul); - } - - /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For - /// users with shape information, there's nothing to do: they will use the - /// cached value when they are lowered. For other users, \p Matrix is - /// flattened and the uses are updated to use it. Also marks \p Inst for - /// deletion. - void finalizeLowering(Instruction *Inst, MatrixTy Matrix, - IRBuilder<> &Builder) { - auto inserted = Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix)); - (void)inserted; - assert(inserted.second && "multiple matrix lowering mapping"); - - ToRemove.push_back(Inst); - Value *Flattened = nullptr; - for (Use &U : llvm::make_early_inc_range(Inst->uses())) { - if (ShapeMap.find(U.getUser()) == ShapeMap.end()) { - if (!Flattened) - Flattened = Matrix.embedInVector(Builder); - U.set(Flattened); - } - } - } - - /// Special case for MatMul lowering. Prevents scalar loads of row-major - /// vectors Lowers to vector reduction add instead of sequential add if - /// reassocation is enabled. 
- void lowerDotProduct(CallInst *MatMul, - SmallPtrSet &FusedInsts, - FastMathFlags FMF) { - if (FusedInsts.contains(MatMul) || - MatrixLayout != MatrixLayoutTy::ColumnMajor) - return; - ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3)); - ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4)); - - if (LShape.NumRows != 1 || RShape.NumColumns != 1) // not a dot product - return; - - Value *LHS = MatMul->getArgOperand(0); - Value *RHS = MatMul->getArgOperand(1); - - Type *ElementType = cast(LHS->getType())->getElementType(); - bool IsIntVec = ElementType->isIntegerTy(); - - // Floating point reductions require reassocation. - if (!IsIntVec && !FMF.allowReassoc()) - return; - - auto CanBeFlattened = [](Value *Op) { - return match(Op, m_OneUse(m_CombineOr( - m_Load(m_Value()), - m_Intrinsic( - m_Value(), m_SpecificInt(1))))); - }; - // Returns the cost benefit of using \p Op with the dot product lowering. If - // the returned cost is < 0, the argument is cheaper to use in the - // dot-product lowering. - auto GetCostForArg = [this, &CanBeFlattened](Value *Op, unsigned N) { - if (!isa(Op)) - return InstructionCost(0); - - FixedVectorType *VecTy = cast(Op->getType()); - Type *EltTy = VecTy->getElementType(); - - if (CanBeFlattened(Op)) { - if (N == 1) - return InstructionCost(0); - - return TTI.getMemoryOpCost(Instruction::Load, VecTy, Align(1), 0) - - N * TTI.getMemoryOpCost(Instruction::Load, EltTy, Align(1), 0); - } - - InstructionCost EmbedCost(0); - // Roughly estimate the cost for embedding the columns into a vector. - for (unsigned I = 1; I < N; ++I) - EmbedCost += - TTI.getShuffleCost(TTI::SK_Splice, FixedVectorType::get(EltTy, 1), - std::nullopt, TTI::TCK_RecipThroughput); - return EmbedCost; - }; - auto LHSCost = GetCostForArg(LHS, LShape.NumColumns); - - // We compare the costs of a vector.reduce.add to sequential add. - int AddOpCode = IsIntVec ? Instruction::Add : Instruction::FAdd; - int MulOpCode = IsIntVec ? 
Instruction::Mul : Instruction::FMul; - InstructionCost ReductionCost = - TTI.getArithmeticReductionCost( - AddOpCode, cast(LHS->getType()), - IsIntVec ? std::nullopt : std::optional(FMF)) + - TTI.getArithmeticInstrCost(MulOpCode, LHS->getType()); - InstructionCost SequentialAddCost = - TTI.getArithmeticInstrCost(AddOpCode, ElementType) * - (LShape.NumColumns - 1) + - TTI.getArithmeticInstrCost(MulOpCode, ElementType) * - (LShape.NumColumns); - if ((LHSCost + ReductionCost - SequentialAddCost) > InstructionCost(0)) - return; - - FusedInsts.insert(MatMul); - IRBuilder<> Builder(MatMul); - auto FlattenArg = [&Builder, &FusedInsts, - &CanBeFlattened](Value *Op) -> Value * { - // Matmul must be the only user of loads because we don't use LowerLoad - // for row vectors (LowerLoad results in scalar loads and shufflevectors - // instead of single vector load). - if (!CanBeFlattened(Op)) - return Op; - - FusedInsts.insert(cast(Op)); - // If vector uses the builtin load, lower to a LoadInst - Value *Ptr; - if (match(Op, m_Intrinsic( - m_Value(Ptr)))) { - auto *NewLoad = Builder.CreateLoad(Op->getType(), Ptr); - Op->replaceAllUsesWith(NewLoad); - cast(Op)->eraseFromParent(); - return NewLoad; - } - return Op; - }; - LHS = FlattenArg(LHS); - - // Insert mul/fmul and llvm.vector.reduce.fadd - Value *Mul = - IsIntVec ? Builder.CreateMul(LHS, RHS) : Builder.CreateFMul(LHS, RHS); - - Value *Result; - if (IsIntVec) - Result = Builder.CreateAddReduce(Mul); - else { - Result = Builder.CreateFAddReduce( - ConstantFP::get(cast(LHS->getType())->getElementType(), - 0.0), - Mul); - cast(Result)->setFastMathFlags(FMF); - } - - // pack scalar back into a matrix and then replace matmul inst - Result = Builder.CreateInsertElement(PoisonValue::get(MatMul->getType()), - Result, uint64_t(0)); - MatMul->replaceAllUsesWith(Result); - FusedInsts.insert(MatMul); - ToRemove.push_back(MatMul); - } - - /// Compute \p Result += \p A * \p B for input matrices with left-associating - /// addition. 
- /// - /// We can fold a transpose into the operand that is used to extract scalars. - /// This is the first operands with row-major and the second with - /// column-major. If \p IsScalarMatrixTransposed we assume the appropriate - /// operand is transposed. - void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A, - const MatrixTy &B, IRBuilder<> &Builder, bool IsTiled, - bool IsScalarMatrixTransposed, FastMathFlags FMF) { - const unsigned VF = std::max( - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedValue() / - Result.getElementType()->getPrimitiveSizeInBits().getFixedValue(), - 1U); - unsigned R = Result.getNumRows(); - unsigned C = Result.getNumColumns(); - unsigned M = A.getNumColumns(); - - bool IsFP = Result.getElementType()->isFloatingPointTy(); - assert(A.isColumnMajor() == B.isColumnMajor() && - Result.isColumnMajor() == A.isColumnMajor() && - "operands must agree on matrix layout"); - unsigned NumComputeOps = 0; - - Builder.setFastMathFlags(FMF); - - if (A.isColumnMajor()) { - // Multiply columns from the first operand with scalars from the second - // operand. Then move along the K axes and accumulate the columns. With - // this the adds can be vectorized without reassociation. - for (unsigned J = 0; J < C; ++J) { - unsigned BlockSize = VF; - // If Result is zero, we don't need to accumulate in the K==0 iteration. - bool isSumZero = isa(Result.getColumn(J)); - - for (unsigned I = 0; I < R; I += BlockSize) { - // Gradually lower the vectorization factor to cover the remainder. - while (I + BlockSize > R) - BlockSize /= 2; - - Value *Sum = IsTiled ? Result.extractVector(I, J, BlockSize, Builder) - : nullptr; - for (unsigned K = 0; K < M; ++K) { - Value *L = A.extractVector(I, K, BlockSize, Builder); - Value *RH = Builder.CreateExtractElement( - B.getColumn(IsScalarMatrixTransposed ? K : J), - IsScalarMatrixTransposed ? 
J : K); - Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat"); - Sum = - createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat, - IsFP, Builder, FMF.allowContract(), NumComputeOps); - } - Result.setVector(J, - insertVector(Result.getVector(J), I, Sum, Builder)); - } - } - } else { - // Multiply rows from the second operand with scalars from the first - // operand. Then move along the K axes and accumulate the rows. With this - // the adds can be vectorized without reassociation. - for (unsigned I = 0; I < R; ++I) { - unsigned BlockSize = VF; - bool isSumZero = isa(Result.getRow(I)); - for (unsigned J = 0; J < C; J += BlockSize) { - // Gradually lower the vectorization factor to cover the remainder. - while (J + BlockSize > C) - BlockSize /= 2; - - Value *Sum = nullptr; - for (unsigned K = 0; K < M; ++K) { - Value *R = B.extractVector(K, J, BlockSize, Builder); - Value *LH = Builder.CreateExtractElement( - A.getVector(IsScalarMatrixTransposed ? K : I), - IsScalarMatrixTransposed ? I : K); - Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat"); - Sum = - createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R, - IsFP, Builder, FMF.allowContract(), NumComputeOps); - } - Result.setVector(I, - insertVector(Result.getVector(I), J, Sum, Builder)); - } - } - } - Result.addNumComputeOps(NumComputeOps); - } - - /// Ensure that the memory in \p Load does not alias \p Store by potentially - /// copying it to a new location. This new or otherwise the original location - /// is returned. - Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store, - CallInst *MatMul) { - MemoryLocation StoreLoc = MemoryLocation::get(Store); - MemoryLocation LoadLoc = MemoryLocation::get(Load); - - // If we can statically determine noalias we're good. 
- if (AA->isNoAlias(LoadLoc, StoreLoc)) - return Load->getPointerOperand(); - - // Create code to check if the memory locations of the Load and Store - // overlap and if they do, copy Load's operand to a new buffer. - - // First, create new blocks for 2n part of the check and the copy. - BasicBlock *Check0 = MatMul->getParent(); - // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a - // DT. Manually collect dominator tree updates, to avoid unnecessary work, - // as we adjust Check0 and Check1's branches. - SmallVector DTUpdates; - for (BasicBlock *Succ : successors(Check0)) - DTUpdates.push_back({DT->Delete, Check0, Succ}); - - BasicBlock *Check1 = - SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI, - nullptr, "alias_cont"); - BasicBlock *Copy = - SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI, - nullptr, "copy"); - BasicBlock *Fusion = - SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI, - nullptr, "no_alias"); - - // Check if the loaded memory location begins before the end of the store - // location. If the condition holds, they might overlap, otherwise they are - // guaranteed to not overlap. - IRBuilder<> Builder(MatMul); - Check0->getTerminator()->eraseFromParent(); - Builder.SetInsertPoint(Check0); - Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout()); - Value *StoreBegin = Builder.CreatePtrToInt( - const_cast(StoreLoc.Ptr), IntPtrTy, "store.begin"); - Value *StoreEnd = Builder.CreateAdd( - StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()), - "store.end", true, true); - Value *LoadBegin = Builder.CreatePtrToInt(const_cast(LoadLoc.Ptr), - IntPtrTy, "load.begin"); - Builder.CreateCondBr(Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1, - Fusion); - - // Check if the store begins before the end of the load location. If the - // condition holds, they alias, otherwise they are guaranteed to not - // overlap. 
- Check1->getTerminator()->eraseFromParent(); - Builder.SetInsertPoint(Check1, Check1->begin()); - Value *LoadEnd = Builder.CreateAdd( - LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()), - "load.end", true, true); - Builder.CreateCondBr(Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy, - Fusion); - - // Copy load operand to new alloca. - Builder.SetInsertPoint(Copy, Copy->begin()); - auto *VT = cast(Load->getType()); - // Use an array type for the alloca, to avoid potentially huge alignment - // requirements for large vector types. - auto *ArrayTy = ArrayType::get(VT->getElementType(), VT->getNumElements()); - AllocaInst *Alloca = - Builder.CreateAlloca(ArrayTy, Load->getPointerAddressSpace()); - Value *BC = Builder.CreateBitCast(Alloca, VT->getPointerTo()); - - Builder.CreateMemCpy(BC, Alloca->getAlign(), Load->getPointerOperand(), - Load->getAlign(), LoadLoc.Size.getValue()); - Builder.SetInsertPoint(Fusion, Fusion->begin()); - PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3); - PHI->addIncoming(Load->getPointerOperand(), Check0); - PHI->addIncoming(Load->getPointerOperand(), Check1); - PHI->addIncoming(BC, Copy); - - // Adjust DT. 
- DTUpdates.push_back({DT->Insert, Check0, Check1}); - DTUpdates.push_back({DT->Insert, Check0, Fusion}); - DTUpdates.push_back({DT->Insert, Check1, Copy}); - DTUpdates.push_back({DT->Insert, Check1, Fusion}); - DT->applyUpdates(DTUpdates); - return PHI; - } - - bool isFusionProfitable(CallInst *MatMul) { - if (ForceFusion) - return true; - - ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3)); - ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4)); - - const unsigned R = LShape.NumRows; - const unsigned C = RShape.NumColumns; - const unsigned M = LShape.NumColumns; - auto *EltType = cast(MatMul->getType())->getElementType(); - - const unsigned VF = std::max( - TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedValue() / - EltType->getPrimitiveSizeInBits().getFixedValue(), - 1U); - - // Cost model for tiling - // - // For tiling to be beneficial, we need reuse either along the R or - // the C axis. We vectorize along the R axis so that means at least - // 3 elements. - // TODO: Also consider cost of copying if operands alias. - if (R <= VF && C == 1) - return false; - // Then we need enough elements to exceed the number of vector - // registers we have. Note that this is an oversimplification since - // fusing also takes some extra loads which may exceed the number of - // reloads necessary. 
- unsigned Op0Regs = (R + VF - 1) / VF * M; - unsigned Op1Regs = (M + VF - 1) / VF * C; - return Op0Regs + Op1Regs > - TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true)); - } - - MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) { - MatrixTy Res; - auto *ColumType = FixedVectorType::get(EltType, R); - for (unsigned I = 0; I < C; ++I) - Res.addVector(ConstantAggregateZero::get(ColumType)); - return Res; - } - - void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape, - Value *RPtr, ShapeInfo RShape, StoreInst *Store) { - auto *EltType = cast(MatMul->getType())->getElementType(); - - // Create the main tiling loop nest. - TileInfo TI(LShape.NumRows, RShape.NumColumns, LShape.NumColumns, TileSize); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - Instruction *InsertI = cast(MatMul); - BasicBlock *Start = InsertI->getParent(); - BasicBlock *End = - SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue"); - IRBuilder<> Builder(MatMul); - BasicBlock *InnerBody = TI.CreateTiledLoops(Start, End, Builder, DTU, *LI); - - Type *TileVecTy = - FixedVectorType::get(MatMul->getType()->getScalarType(), TileSize); - MatrixTy TileResult; - // Insert in the inner loop header. - Builder.SetInsertPoint(TI.KLoop.Header->getTerminator()); - // Create PHI nodes for the result columns to accumulate across iterations. - SmallVector ColumnPhis; - for (unsigned I = 0; I < TileSize; I++) { - auto *Phi = Builder.CreatePHI(TileVecTy, 2, "result.vec." + Twine(I)); - Phi->addIncoming(ConstantAggregateZero::get(TileVecTy), - TI.RowLoop.Header->getSingleSuccessor()); - TileResult.addVector(Phi); - ColumnPhis.push_back(Phi); - } - - // Insert in the inner loop body, which computes - // Res += Load(CurrentRow, K) * Load(K, CurrentColumn) - Builder.SetInsertPoint(InnerBody->getTerminator()); - // Load tiles of the operands. 
- MatrixTy A = - loadMatrix(LPtr, {}, false, LShape, TI.RowLoop.Index, TI.KLoop.Index, - {TileSize, TileSize}, EltType, Builder); - MatrixTy B = - loadMatrix(RPtr, {}, false, RShape, TI.KLoop.Index, TI.ColumnLoop.Index, - {TileSize, TileSize}, EltType, Builder); - emitMatrixMultiply(TileResult, A, B, Builder, true, false, - getFastMathFlags(MatMul)); - // Store result after the inner loop is done. - Builder.SetInsertPoint(TI.RowLoop.Latch->getTerminator()); - storeMatrix(TileResult, Store->getPointerOperand(), Store->getAlign(), - Store->isVolatile(), {LShape.NumRows, RShape.NumColumns}, - TI.RowLoop.Index, TI.ColumnLoop.Index, EltType, Builder); - - for (unsigned I = 0; I < TileResult.getNumVectors(); I++) - ColumnPhis[I]->addIncoming(TileResult.getVector(I), TI.KLoop.Latch); - - // Force unrolling of a few iterations of the inner loop, to make sure there - // is enough work per iteration. - // FIXME: The unroller should make this decision directly instead, but - // currently the cost-model is not up to the task. 
- unsigned InnerLoopUnrollCount = std::min(10u, LShape.NumColumns / TileSize); - addStringMetadataToLoop(LI->getLoopFor(TI.KLoop.Header), - "llvm.loop.unroll.count", InnerLoopUnrollCount); - } - - void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1, - StoreInst *Store, - SmallPtrSetImpl &FusedInsts) { - assert(MatrixLayout == MatrixLayoutTy::ColumnMajor && - "Tiling only supported for column-major matrixes at the moment!"); - if (!isFusionProfitable(MatMul)) - return; - - ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3)); - ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4)); - - const unsigned R = LShape.NumRows; - const unsigned C = RShape.NumColumns; - const unsigned M = LShape.NumColumns; - auto *EltType = cast(MatMul->getType())->getElementType(); - - Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul); - Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul); - Value *CPtr = Store->getPointerOperand(); - - if (TileUseLoops && (R % TileSize == 0 && C % TileSize == 0)) - createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store); - else { - IRBuilder<> Builder(Store); - for (unsigned J = 0; J < C; J += TileSize) - for (unsigned I = 0; I < R; I += TileSize) { - const unsigned TileR = std::min(R - I, unsigned(TileSize)); - const unsigned TileC = std::min(C - J, unsigned(TileSize)); - MatrixTy Res = getZeroMatrix(EltType, TileR, TileC); - - for (unsigned K = 0; K < M; K += TileSize) { - const unsigned TileM = std::min(M - K, unsigned(TileSize)); - MatrixTy A = - loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(), - LShape, Builder.getInt64(I), Builder.getInt64(K), - {TileR, TileM}, EltType, Builder); - MatrixTy B = - loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(), - RShape, Builder.getInt64(K), Builder.getInt64(J), - {TileM, TileC}, EltType, Builder); - emitMatrixMultiply(Res, A, B, Builder, true, false, - getFastMathFlags(MatMul)); - } - storeMatrix(Res, CPtr, 
Store->getAlign(), Store->isVolatile(), {R, M}, - Builder.getInt64(I), Builder.getInt64(J), EltType, - Builder); - } - } - - // Mark eliminated instructions as fused and remove them. - FusedInsts.insert(Store); - FusedInsts.insert(MatMul); - Store->eraseFromParent(); - MatMul->eraseFromParent(); - if (LoadOp0->hasNUses(0)) { - FusedInsts.insert(LoadOp0); - LoadOp0->eraseFromParent(); - } - if (LoadOp1 != LoadOp0 && LoadOp1->hasNUses(0)) { - FusedInsts.insert(LoadOp1); - LoadOp1->eraseFromParent(); - } - } - - /// Try to lower matrix multiply chains by fusing operations. - /// - /// Call finalizeLowering on lowered instructions. Instructions that are - /// completely eliminated by fusion are added to \p FusedInsts. - void LowerMatrixMultiplyFused(CallInst *MatMul, - SmallPtrSetImpl &FusedInsts) { - if (!FuseMatrix || !DT) - return; - - assert(AA && LI && "Analyses should be available"); - - Value *A = MatMul->getArgOperand(0); - Value *B = MatMul->getArgOperand(1); - - // We can fold the transpose into the operand that is used to fetch scalars. - Value *T; - if (MatrixLayout == MatrixLayoutTy::ColumnMajor - ? 
match(B, m_Intrinsic(m_Value(T))) - : match(A, m_Intrinsic(m_Value(T)))) { - IRBuilder<> Builder(MatMul); - auto *EltType = cast(MatMul->getType())->getElementType(); - ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3)); - ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4)); - const unsigned R = LShape.NumRows; - const unsigned M = LShape.NumColumns; - const unsigned C = RShape.NumColumns; - - MatrixTy MA; - MatrixTy MB; - - Value *Transpose; - if (MatrixLayout == MatrixLayoutTy::ColumnMajor) { - MA = getMatrix(A, ShapeInfo(R, M), Builder); - MB = getMatrix(T, ShapeInfo(C, M), Builder); - Transpose = B; - } else { - MA = getMatrix(T, ShapeInfo(R, M), Builder); - MB = getMatrix(B, ShapeInfo(C, M), Builder); - Transpose = A; - } - - // Initialize the output - MatrixTy Result(R, C, EltType); - - emitMatrixMultiply(Result, MA, MB, Builder, false, true, - getFastMathFlags(MatMul)); - - FusedInsts.insert(MatMul); - if (Transpose->hasOneUse()) { - FusedInsts.insert(cast(Transpose)); - ToRemove.push_back(cast(Transpose)); - // TODO: add a fake entry for the folded instruction so that this is - // included in the expression in the remark. - Inst2ColumnMatrix[Transpose] = MatrixTy(M, C, EltType); - } - finalizeLowering(MatMul, Result, Builder); - return; - } - - if (!MatMul->hasOneUse() || MatrixLayout != MatrixLayoutTy::ColumnMajor) - return; - - // Lower {ld, ld} -> matmul -> st chains. No need to call finalizeLowering - // since the single store user will be lowered as part of this. - auto *LoadOp0 = dyn_cast(A); - auto *LoadOp1 = dyn_cast(B); - auto *Store = dyn_cast(*MatMul->user_begin()); - if (LoadOp0 && LoadOp1 && Store) { - // The store address must dominate the MatMul instruction, otherwise - // we create invalid IR. 
- SetVector WorkList; - WorkList.insert(Store->getOperand(1)); - SmallVector ToHoist; - for (unsigned I = 0; I != WorkList.size(); ++I) { - Value *Current = WorkList[I]; - auto *CurrI = dyn_cast(Current); - if (!CurrI) - continue; - if (isa(CurrI)) - return; - if (DT->dominates(CurrI, MatMul)) - continue; - if (CurrI->mayHaveSideEffects() || CurrI->mayReadFromMemory()) - return; - ToHoist.push_back(CurrI); - WorkList.insert(CurrI->op_begin(), CurrI->op_end()); - } - - sort(ToHoist, [this](Instruction *A, Instruction *B) { - return DT->dominates(A, B); - }); - for (Instruction *I : ToHoist) - I->moveBefore(MatMul); - - emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts); - return; - } - } - - /// Lowers llvm.matrix.multiply. - void LowerMultiply(CallInst *MatMul) { - IRBuilder<> Builder(MatMul); - auto *EltType = cast(MatMul->getType())->getElementType(); - ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3)); - ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4)); - - const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder); - const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder); - assert(Lhs.getElementType() == Rhs.getElementType() && - "Matrix multiply argument element types do not match."); - - const unsigned R = LShape.NumRows; - const unsigned C = RShape.NumColumns; - assert(LShape.NumColumns == RShape.NumRows); - - // Initialize the output - MatrixTy Result(R, C, EltType); - assert(Lhs.getElementType() == Result.getElementType() && - "Matrix multiply result element type does not match arguments."); - - emitMatrixMultiply(Result, Lhs, Rhs, Builder, false, false, - getFastMathFlags(MatMul)); - finalizeLowering(MatMul, Result, Builder); - } - - /// Lowers llvm.matrix.transpose. 
- void LowerTranspose(CallInst *Inst) { - MatrixTy Result; - IRBuilder<> Builder(Inst); - Value *InputVal = Inst->getArgOperand(0); - VectorType *VectorTy = cast(InputVal->getType()); - ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2)); - MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder); - - const unsigned NewNumVecs = - InputMatrix.isColumnMajor() ? ArgShape.NumRows : ArgShape.NumColumns; - const unsigned NewNumElts = - InputMatrix.isColumnMajor() ? ArgShape.NumColumns : ArgShape.NumRows; - - for (unsigned I = 0; I < NewNumVecs; ++I) { - // Build a single result vector. First initialize it. - Value *ResultVector = PoisonValue::get( - FixedVectorType::get(VectorTy->getElementType(), NewNumElts)); - // Go through the old elements and insert it into the resulting vector. - for (auto J : enumerate(InputMatrix.vectors())) { - Value *Elt = Builder.CreateExtractElement(J.value(), I); - // Row and column indices are transposed. - ResultVector = - Builder.CreateInsertElement(ResultVector, Elt, J.index()); - } - Result.addVector(ResultVector); - } - - // TODO: Improve estimate of operations needed for transposes. Currently we - // just count the insertelement/extractelement instructions, but do not - // account for later simplifications/combines. - finalizeLowering( - Inst, - Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns) - .addNumExposedTransposes(1), - Builder); - } - - /// Lower load instructions, if shape information is available. 
- bool VisitLoad(LoadInst *Inst, Value *Ptr, IRBuilder<> &Builder) { - auto I = ShapeMap.find(Inst); - if (I == ShapeMap.end()) - return false; - - LowerLoad(Inst, Ptr, Inst->getAlign(), - Builder.getInt64(I->second.getStride()), Inst->isVolatile(), - I->second); - return true; - } - - bool VisitStore(StoreInst *Inst, Value *StoredVal, Value *Ptr, - IRBuilder<> &Builder) { - auto I = ShapeMap.find(StoredVal); - if (I == ShapeMap.end()) - return false; - - LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(), - Builder.getInt64(I->second.getStride()), Inst->isVolatile(), - I->second); - return true; - } - - /// Lower binary operators, if shape information is available. - bool VisitBinaryOperator(BinaryOperator *Inst) { - auto I = ShapeMap.find(Inst); - if (I == ShapeMap.end()) - return false; - - Value *Lhs = Inst->getOperand(0); - Value *Rhs = Inst->getOperand(1); - - IRBuilder<> Builder(Inst); - ShapeInfo &Shape = I->second; - - MatrixTy Result; - MatrixTy A = getMatrix(Lhs, Shape, Builder); - MatrixTy B = getMatrix(Rhs, Shape, Builder); - assert(A.isColumnMajor() == B.isColumnMajor() && - Result.isColumnMajor() == A.isColumnMajor() && - "operands must agree on matrix layout"); - - Builder.setFastMathFlags(getFastMathFlags(Inst)); - - // Helper to perform binary op on vectors. 
- auto BuildVectorOp = [&Builder, Inst](Value *LHS, Value *RHS) { - switch (Inst->getOpcode()) { - case Instruction::Add: - return Builder.CreateAdd(LHS, RHS); - case Instruction::Mul: - return Builder.CreateMul(LHS, RHS); - case Instruction::Sub: - return Builder.CreateSub(LHS, RHS); - case Instruction::FAdd: - return Builder.CreateFAdd(LHS, RHS); - case Instruction::FMul: - return Builder.CreateFMul(LHS, RHS); - case Instruction::FSub: - return Builder.CreateFSub(LHS, RHS); - default: - llvm_unreachable("Unsupported binary operator for matrix"); - } - }; - - for (unsigned I = 0; I < Shape.getNumVectors(); ++I) - Result.addVector(BuildVectorOp(A.getVector(I), B.getVector(I))); - - finalizeLowering(Inst, - Result.addNumComputeOps(getNumOps(Result.getVectorTy()) * - Result.getNumVectors()), - Builder); - return true; - } - - /// Lower unary operators, if shape information is available. - bool VisitUnaryOperator(UnaryOperator *Inst) { - auto I = ShapeMap.find(Inst); - if (I == ShapeMap.end()) - return false; - - Value *Op = Inst->getOperand(0); - - IRBuilder<> Builder(Inst); - ShapeInfo &Shape = I->second; - - MatrixTy Result; - MatrixTy M = getMatrix(Op, Shape, Builder); - - Builder.setFastMathFlags(getFastMathFlags(Inst)); - - // Helper to perform unary op on vectors. - auto BuildVectorOp = [&Builder, Inst](Value *Op) { - switch (Inst->getOpcode()) { - case Instruction::FNeg: - return Builder.CreateFNeg(Op); - default: - llvm_unreachable("Unsupported unary operator for matrix"); - } - }; - - for (unsigned I = 0; I < Shape.getNumVectors(); ++I) - Result.addVector(BuildVectorOp(M.getVector(I))); - - finalizeLowering(Inst, - Result.addNumComputeOps(getNumOps(Result.getVectorTy()) * - Result.getNumVectors()), - Builder); - return true; - } - - /// Helper to linearize a matrix expression tree into a string. Currently - /// matrix expressions are linarized by starting at an expression leaf and - /// linearizing bottom up. 
- struct ExprLinearizer { - unsigned LengthToBreak = 100; - std::string Str; - raw_string_ostream Stream; - unsigned LineLength = 0; - const DataLayout &DL; - - /// Mapping from instructions to matrixes. It is used to identify - /// matrix instructions. - const MapVector &Inst2Matrix; - - /// Mapping from values to the leaves of all expressions that the value is - /// part of. - const DenseMap> &Shared; - - /// Set of matrix expressions in the scope of a given DISubprogram. - const SmallSetVector &ExprsInSubprogram; - - /// Leaf node of the expression to linearize. - Value *Leaf; - - /// Used to keep track of sub-expressions that get reused while linearizing - /// the expression. Re-used sub-expressions are marked as (reused). - SmallPtrSet ReusedExprs; - - ExprLinearizer(const DataLayout &DL, - const MapVector &Inst2Matrix, - const DenseMap> &Shared, - const SmallSetVector &ExprsInSubprogram, - Value *Leaf) - : Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared), - ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {} - - void indent(unsigned N) { - LineLength += N; - for (unsigned i = 0; i < N; i++) - Stream << " "; - } - - void lineBreak() { - Stream << "\n"; - LineLength = 0; - } - - void maybeIndent(unsigned Indent) { - if (LineLength >= LengthToBreak) - lineBreak(); - - if (LineLength == 0) - indent(Indent); - } - - void write(StringRef S) { - LineLength += S.size(); - Stream << S; - } - - Value *getUnderlyingObjectThroughLoads(Value *V) { - if (Value *Ptr = getPointerOperand(V)) - return getUnderlyingObjectThroughLoads(Ptr); - else if (V->getType()->isPointerTy()) - return getUnderlyingObject(V); - return V; - } - - /// Returns true if \p V is a matrix value in the given subprogram. - bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); } - - /// If \p V is a matrix value, print its shape as as NumRows x NumColumns to - /// \p SS. 
- void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) { - auto M = Inst2Matrix.find(V); - if (M == Inst2Matrix.end()) - SS << "unknown"; - else { - SS << M->second.getNumRows(); - SS << "x"; - SS << M->second.getNumColumns(); - } - } - - /// Write the called function name. Handles calls to llvm.matrix.* - /// specially: we write the name, followed by the dimensions of the input - /// matrixes, followed by the scalar type name. - void writeFnName(CallInst *CI) { - if (!CI->getCalledFunction()) - write(""); - else { - StringRef Name = CI->getCalledFunction()->getName(); - if (!Name.startswith("llvm.matrix")) { - write(Name); - return; - } - auto *II = cast(CI); - write(Intrinsic::getBaseName(II->getIntrinsicID()) - .drop_front(StringRef("llvm.matrix.").size())); - write("."); - std::string Tmp; - raw_string_ostream SS(Tmp); - - switch (II->getIntrinsicID()) { - case Intrinsic::matrix_multiply: - prettyPrintMatrixType(II->getOperand(0), SS); - SS << "."; - prettyPrintMatrixType(II->getOperand(1), SS); - SS << "." << *II->getType()->getScalarType(); - break; - case Intrinsic::matrix_transpose: - prettyPrintMatrixType(II->getOperand(0), SS); - SS << "." << *II->getType()->getScalarType(); - break; - case Intrinsic::matrix_column_major_load: - prettyPrintMatrixType(II, SS); - SS << "." << *II->getType()->getScalarType(); - break; - case Intrinsic::matrix_column_major_store: - prettyPrintMatrixType(II->getOperand(0), SS); - SS << "." 
<< *II->getOperand(0)->getType()->getScalarType(); - break; - default: - llvm_unreachable("Unhandled case"); - } - SS.flush(); - write(Tmp); - } - } - - unsigned getNumShapeArgs(CallInst *CI) const { - if (IntrinsicInst *II = dyn_cast(CI)) { - switch (II->getIntrinsicID()) { - case Intrinsic::matrix_multiply: - return 3; - case Intrinsic::matrix_transpose: - return 2; - case Intrinsic::matrix_column_major_load: - case Intrinsic::matrix_column_major_store: - return 3; - default: - return 0; - } - } - return 0; - } - - /// Special printing for values: for pointers, we print if they refer to an - /// (function) external address or a stack address, for other values we - /// either print the constant or "scalar"/"matrix" for other values. - void write(Value *V) { - V = getUnderlyingObjectThroughLoads(V); - if (V->getType()->isPointerTy()) { - if (isa(V)) { - Stream << "stack addr"; - LineLength += StringRef("stack addr").size(); - } else { - Stream << "addr"; - LineLength += StringRef("addr").size(); - } - if (!V->getName().empty()) { - Stream << " %" << V->getName() << ""; - LineLength += V->getName().size() + 2; - } - return; - } - - std::string Tmp; - raw_string_ostream TmpStream(Tmp); - - if (auto *CI = dyn_cast(V)) - TmpStream << CI->getValue(); - else if (isa(V)) - TmpStream << "constant"; - else { - if (isMatrix(V)) - TmpStream << "matrix"; - else - TmpStream << "scalar"; - } - TmpStream.flush(); - Tmp = std::string(StringRef(Tmp).trim()); - LineLength += Tmp.size(); - Stream << Tmp; - } - - /// Linearize expression \p Expr starting at an indentation of \p Indent. - /// Expressions that are re-used multiple times are prefixed with (reused) - /// at the re-used root instruction. - void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused, - bool ParentShared) { - auto *I = cast(Expr); - maybeIndent(Indent); - SmallVector Ops; - - // Is Expr shared with other expression leaves? - bool ExprShared = false; - - // Deal with shared subtrees. 
Mark them as shared, if required. - if (!ParentShared) { - auto SI = Shared.find(Expr); - assert(SI != Shared.end() && SI->second.count(Leaf)); - - for (Value *S : SI->second) { - if (S == Leaf) - continue; - DebugLoc DL = cast(S)->getDebugLoc(); - write("shared with remark at line " + std::to_string(DL.getLine()) + - " column " + std::to_string(DL.getCol()) + " ("); - } - ExprShared = SI->second.size() > 1; - } - - bool Reused = !ReusedExprs.insert(Expr).second; - if (Reused && !ParentReused) - write("(reused) "); - - if (auto *CI = dyn_cast(I)) { - writeFnName(CI); - - Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI)); - } else if (isa(Expr)) { - // Special case bitcasts, which are used to materialize matrixes from - // non-matrix ops. - write("matrix"); - return; - } else { - Ops.append(I->value_op_begin(), I->value_op_end()); - write(std::string(I->getOpcodeName())); - } - - write(std::string("(")); - - unsigned NumOpsToBreak = 1; - if (match(Expr, m_Intrinsic())) - NumOpsToBreak = 2; - - for (Value *Op : Ops) { - if (Ops.size() > NumOpsToBreak) - lineBreak(); - - maybeIndent(Indent + 1); - if (isMatrix(Op)) - linearizeExpr(Op, Indent + 1, Reused, ExprShared); - else - write(Op); - if (Op != Ops.back()) - write(", "); - } - - write(")"); - } - - const std::string &getResult() { - Stream.flush(); - return Str; - } - }; - - /// Generate remarks for matrix operations in a function. To generate remarks - /// for matrix expressions, the following approach is used: - /// 1. Use the inlined-at debug information to group matrix operations to the - /// DISubprograms they are contained in. - /// 2. Collect leaves of matrix expressions (done in - /// RemarkGenerator::getExpressionLeaves) for each subprogram - expression - // mapping. Leaves are lowered matrix instructions without other matrix - // users (like stores) in the current subprogram. - /// 3. For each leaf, create a remark containing a linearizied version of the - /// matrix expression. 
The expression is linearized by a recursive - /// bottom-up traversal of the matrix operands, starting at a leaf. Note - /// that multiple leaves can share sub-expressions. Shared subexpressions - /// are explicitly marked as shared(). - struct RemarkGenerator { - const MapVector &Inst2Matrix; - OptimizationRemarkEmitter &ORE; - Function &Func; - const DataLayout &DL; - - RemarkGenerator(const MapVector &Inst2Matrix, - OptimizationRemarkEmitter &ORE, Function &Func) - : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func), - DL(Func.getParent()->getDataLayout()) {} - - /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are - /// instructions in Inst2Matrix returning void or without any users in - /// \p ExprsInSubprogram. Currently that should only include stores. - SmallVector - getExpressionLeaves(const SmallSetVector &ExprsInSubprogram) { - SmallVector Leaves; - for (auto *Expr : ExprsInSubprogram) - if (Expr->getType()->isVoidTy() || - !any_of(Expr->users(), [&ExprsInSubprogram](User *U) { - return ExprsInSubprogram.count(U); - })) - Leaves.push_back(Expr); - return Leaves; - } - - /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf - /// to all visited expressions in \p Shared. Limit the matrix operations to - /// the ones in \p ExprsInSubprogram. - void collectSharedInfo(Value *Leaf, Value *V, - const SmallSetVector &ExprsInSubprogram, - DenseMap> &Shared) { - - if (!ExprsInSubprogram.count(V)) - return; - - auto I = Shared.insert({V, {}}); - I.first->second.insert(Leaf); - - for (Value *Op : cast(V)->operand_values()) - collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared); - } - - /// Calculate the number of exclusive and shared op counts for expression - /// starting at \p V. Expressions used multiple times are counted once. - /// Limit the matrix operations to the ones in \p ExprsInSubprogram. 
- std::pair - sumOpInfos(Value *Root, SmallPtrSetImpl &ReusedExprs, - const SmallSetVector &ExprsInSubprogram, - DenseMap> &Shared) const { - if (!ExprsInSubprogram.count(Root)) - return {}; - - // Already counted this expression. Stop. - if (!ReusedExprs.insert(Root).second) - return {}; - - OpInfoTy SharedCount; - OpInfoTy Count; - - auto I = Shared.find(Root); - auto CM = Inst2Matrix.find(Root); - if (I->second.size() == 1) - Count = CM->second.getOpInfo(); - else - SharedCount = CM->second.getOpInfo(); - - for (Value *Op : cast(Root)->operand_values()) { - auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared); - Count += C.first; - SharedCount += C.second; - } - return {Count, SharedCount}; - } - - void emitRemarks() { - if (!ORE.allowExtraAnalysis(DEBUG_TYPE)) - return; - - // Map matrix operations to their containting subprograms, by traversing - // the inlinedAt chain. If the function does not have a DISubprogram, we - // only map them to the containing function. - MapVector> Subprog2Exprs; - for (const auto &KV : Inst2Matrix) { - if (Func.getSubprogram()) { - auto *I = cast(KV.first); - DILocation *Context = I->getDebugLoc(); - while (Context) { - auto I = - Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}}); - I.first->second.push_back(KV.first); - Context = DebugLoc(Context).getInlinedAt(); - } - } else { - auto I = Subprog2Exprs.insert({nullptr, {}}); - I.first->second.push_back(KV.first); - } - } - for (auto &KV : Subprog2Exprs) { - SmallSetVector ExprsInSubprogram(KV.second.begin(), - KV.second.end()); - auto Leaves = getExpressionLeaves(ExprsInSubprogram); - - DenseMap> Shared; - for (Value *Leaf : Leaves) - collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared); - - // Generate remarks for each leaf. 
- for (auto *L : Leaves) { - - DebugLoc Loc = cast(L)->getDebugLoc(); - DILocation *Context = cast(L)->getDebugLoc(); - while (Context) { - if (getSubprogram(Context->getScope()) == KV.first) { - Loc = Context; - break; - } - Context = DebugLoc(Context).getInlinedAt(); - } - - SmallPtrSet ReusedExprs; - OpInfoTy Counts, SharedCounts; - std::tie(Counts, SharedCounts) = - sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared); - - OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc, - cast(L)->getParent()); - - Rem << "Lowered with "; - Rem << ore::NV("NumStores", Counts.NumStores) << " stores, " - << ore::NV("NumLoads", Counts.NumLoads) << " loads, " - << ore::NV("NumComputeOps", Counts.NumComputeOps) - << " compute ops, " - << ore::NV("NumExposedTransposes", Counts.NumExposedTransposes) - << " exposed transposes"; - - if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 || - SharedCounts.NumComputeOps > 0) { - Rem << ",\nadditionally " - << ore::NV("NumStores", SharedCounts.NumStores) << " stores, " - << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, " - << ore::NV("NumFPOps", SharedCounts.NumComputeOps) - << " compute ops" - << " are shared with other expressions"; - } - - Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL)); - ORE.emit(Rem); - } - } - } - - std::string - linearize(Value *L, - const DenseMap> &Shared, - const SmallSetVector &ExprsInSubprogram, - const DataLayout &DL) { - ExprLinearizer Lin(DL, Inst2Matrix, Shared, ExprsInSubprogram, L); - Lin.linearizeExpr(L, 0, false, false); - return Lin.getResult(); - } - }; -}; -} // namespace - -PreservedAnalyses LowerGemminiPass::run(Function &F, - FunctionAnalysisManager &AM) { - auto &TTI = AM.getResult(F); - OptimizationRemarkEmitter *ORE = nullptr; - AAResults *AA = nullptr; - DominatorTree *DT = nullptr; - LoopInfo *LI = nullptr; - - ORE = &AM.getResult(F); - AA = &AM.getResult(F); - DT = &AM.getResult(F); - LI = &AM.getResult(F); - - LowerGemmini LMT(F, TTI, AA, DT, LI, ORE); 
- if (LMT.Visit()) { - PreservedAnalyses PA; - PA.preserve(); - PA.preserve(); - return PA; - } - return PreservedAnalyses::all(); -} - -void LowerGemminiPass::printPipeline( - raw_ostream &OS, function_ref MapClassName2PassName) { - static_cast *>(this)->printPipeline( - OS, MapClassName2PassName); - OS << '<'; - OS << '>'; -} - -llvm::PassPluginLibraryInfo getLowerGemminiPassPluginInfo() { - return {LLVM_PLUGIN_API_VERSION, "LowerGemminiPass", LLVM_VERSION_STRING, - [](PassBuilder &PB) { - PB.registerPipelineParsingCallback( - [](StringRef Name, FunctionPassManager &FPM, - ArrayRef) { - if (Name == "LowerGemminiPass") { - FPM.addPass(LowerGemminiPass()); - return true; - } - return false; - }); - }}; -} - -extern "C" LLVM_ATTRIBUTE_WEAK ::llvm::PassPluginLibraryInfo -llvmGetPassPluginInfo() { - return getLowerGemminiPassPluginInfo(); -} diff --git a/GemminiLowerPass/LowerGemminiPass.h b/GemminiLowerPass/LowerGemminiPass.h deleted file mode 100644 index 4eef7fee..00000000 --- a/GemminiLowerPass/LowerGemminiPass.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef LOWERGEMMINIPASS_H -#define LOWERGEMMINIPASS_H - -namespace llvm { -class LowerGemminiPass - : public PassInfoMixin { - -public: - LowerGemminiPass() {} - PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); - void printPipeline(raw_ostream &OS, - function_ref MapClassName2PassName); - static bool isRequired() { return true; } -}; -} // namespace llvm - -/* GEMMINI DEFINITION */ -#define ADDR_LEN 32 -#define ROW_LEN 16 -#define COL_LEN 16 -#define BITMASK(x) ((1 << x) - 1) -#define ADDR_MASK BITMASK(ADDR_LEN) -#define ROW_MASK BITMASK(ROW_LEN) -#define COL_MASK BITMASK(COL_LEN) - -#define k_CONFIG 0 -#define k_MVIN2 1 -#define k_MVIN 2 -#define k_MVOUT 3 -#define k_COMPUTE_PRELOADED 4 -#define k_COMPUTE_ACCUMULATE 5 -#define k_PRELOAD 6 -#define k_FLUSH 7 - -#define STR_HELPER(x) #x -#define STR(x) STR_HELPER(x) -#endif diff --git a/PyTorchSimFrontend/extension_codecache.py 
b/PyTorchSimFrontend/extension_codecache.py index 83561bd4..20152e9f 100644 --- a/PyTorchSimFrontend/extension_codecache.py +++ b/PyTorchSimFrontend/extension_codecache.py @@ -5,8 +5,6 @@ from torch._inductor.codecache import AsyncCompile, get_lock_dir, get_hash, write from AsmParser.tog_generator import tog_generator -from AsmParser.riscv_parser import riscv_parser -from PyTorchSimFrontend.llvm.llvm_caller_codegen import LLVMKernelCallerCodeGen from PyTorchSimFrontend.mlir.mlir_caller_codegen import MLIRKernelCallerCodeGen from PyTorchSimFrontend import extension_config from Simulator.simulator import FunctionalSimulator, CycleSimulator, BackendSimulator @@ -259,75 +257,6 @@ def load(cls, source_code, ) return key -class LLVMCodeCache: - cache = dict() - clear = staticmethod(cache.clear) # Todo: Cache - - @staticmethod - def _load_library(path): - pass - - @classmethod - def load(cls, source_code, - validation_wrapper_name="validation_wrapper", - validation_binary_name="validation_bin", - cycle_wrapper_name="cycle_wrapper", - cycle_binary_name="cycle_bin", - arg_attributes=[], loop_info={}, - load_tile_info={}, store_tile_info={}, **kwargs): - write_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(get_hash(source_code.strip()))) - key, input_path = write(source_code, "ll", specified_dir=write_path) - output_path = input_path[:-2] + "s" - - cmds = llvm_compile_command(input_path, output_path) - opt_cmd = shlex.split(cmds[0]) - llc_cmd = shlex.split(cmds[1]) - - from filelock import FileLock - lock_dir = get_lock_dir() - lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) - with lock: - # if not os.path.exists(output_path): - try: - subprocess.check_call(opt_cmd) - subprocess.check_call(llc_cmd) - except subprocess.CalledProcessError as e: - print("Command failed with exit code", e.returncode) - print("Error output:", e.output) - assert(0) # Todo: make LLVMCompileError - - # Launch tile graph generator - 
tile_graph_generator = riscv_parser() - tile_graph_generator.load_file(output_path, - loop_info=loop_info, - load_tile_info=load_tile_info, - store_tile_info=store_tile_info) - # Create code for sampling - tile_graph_generator.dump_sampling_code(output_path[:-2] + "_sample.s") - - # Generate LLVM kernel calller and binary for validation - if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: - val_llvm_caller = LLVMKernelCallerCodeGen(extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, arg_attributes) - val_llvm_caller.generate_wrapper_file(write_path, validation_wrapper_name) - val_llvm_caller.compile_wih_kernel(write_path, key, validation_wrapper_name, validation_binary_name) - - # Generate LLVM kernel calller and binary for cycle calculation - cycle_llvm_caller = LLVMKernelCallerCodeGen(False, arg_attributes) - cycle_llvm_caller.generate_wrapper_file(write_path, cycle_wrapper_name) - cycle_llvm_caller.compile_wih_kernel(write_path, key + "_sample", cycle_wrapper_name, cycle_binary_name) - array_size = [] - for (arg_name, arg_attribute) in arg_attributes: - array_size.append(str(arg_attribute[2])) - - # Run cyclesim - cyclesim = CycleSimulator() - cycle_list = cyclesim.compile_and_simulate(os.path.join(write_path, cycle_binary_name), " ".join(array_size), vectorlane_size) - - if extension_config.CONFIG_TORCHSIM_DUMP_FILE: - tile_graph_generator.dump_basic_block_graph(os.path.join(write_path, "basic_block.onnx")) - tile_graph_generator.cycle_analysis(cycle_list=cycle_list, name=os.path.join(write_path, "tile_graph")) - return key - class CustomAsyncCompile(AsyncCompile): def __init__(self): self.validation_wrapper_name = "validation_wrapper" @@ -417,49 +346,3 @@ def dryrun_simulator(*args, **kwargs): target_simulator.arg_attributes = arg_attributes target_simulator.future = future return target_simulator - - def llvm(self, source_code, arg_attributes=[], **kwargs): - def task(): - key = LLVMCodeCache.load(source_code, - 
valdiation_wrapper_name=self.validation_binary_name, - validation_binary_name=self.validation_binary_name, - arg_attributes=arg_attributes, **kwargs) - return key - future = self.submit(task) - - def dummy_simulator(*args, **kwargs): - # Wait for compilation - key = future.result() - - # Run simulator pass - result_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key)) - print("Running dummy simulator!") - print("OUTPUT PATH > ", result_path) - - # Dump arguments and meta data - dump_metadata(args, arg_attributes, result_path) - if extension_config.CONFIG_TORCHSIM_VALIDATION_MODE: - funcsim = FunctionalSimulator(result_path, key) - funcsim.run_spike(args, arg_attributes, - os.path.join(result_path, self.validation_binary_name), - kwargs['intermediate_op'] if 'intermediate_op' in kwargs else None) - - assembly_path = os.path.join(result_path, f'{key}.s') - try: - with open(assembly_path, 'r') as file: - file_contents = file.read() - print("Assembly > \n", file_contents) - except FileNotFoundError: - print(f'{assembly_path} not found.') - except Exception as e: - print(f"Error while reading.") - - onnx_path = os.path.join(result_path, "tile_graph.onnx") - attribute_path = os.path.join(extension_config.CONFIG_TORCHSIM_DUMP_PATH, "tmp", hash_prefix(key), "attribute") - backend_path = os.path.join(extension_config.CONFIG_TORCHSIM_DIR, "PyTorchSimBackend") - backsim = BackendSimulator(backend_path, extension_config.CONFIG_TORCHSIM_BACKEND_CONFIG) - attribute_path = backsim.create_attribute_file(attribute_path, args) - result_path = backsim.simulation(onnx_path, attribute_path) - result = BackendSimulator.get_result_from_file(result_path) - return result - return dummy_simulator diff --git a/PyTorchSimFrontend/llvm/llvm_autotune.py b/PyTorchSimFrontend/llvm/llvm_autotune.py deleted file mode 100644 index 3c61961d..00000000 --- a/PyTorchSimFrontend/llvm/llvm_autotune.py +++ /dev/null @@ -1,75 +0,0 @@ -import functools -import torch -from 
torch._inductor.autotune_process import BenchmarkRequest -from torch._inductor.autotune_process import TensorMeta -from torch._inductor.codecache import CUDACodeCache - -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - Optional, - Sequence, - TYPE_CHECKING, - Union, -) - -class LLVMBenchmarkRequest(BenchmarkRequest): - def __init__( - self, - kernel_name: str, - input_tensor_meta: Union[TensorMeta, List[TensorMeta]], - output_tensor_meta: Union[TensorMeta, List[TensorMeta]], - extra_args: Iterable[Any], - source_code: str, - ): - super().__init__(kernel_name, input_tensor_meta, output_tensor_meta, extra_args) - self.source_code = source_code - self.workspace_size: int = 0 - self.workspace: Optional[torch.Tensor] = None - self.hash_key: str = "" - self.source_file: str = "" - #self.hash_key, self.source_file = CUDACodeCache.write(self.source_code, "so") - - def make_run_fn( - self, *input_tensors: torch.Tensor, output_tensor: torch.Tensor - ) -> Callable[[], None]: - self.DLL, self.hash_key, self.source_file = CUDACodeCache.load( - self.source_code, "so" - ) - - args = [ - tensor.data_ptr() - for tensor in list(input_tensors) + [output_tensor] - ] - - print( - "make_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, args=%s, self.extra_args=%s", - self.kernel_name, - self.source_file, - self.hash_key, - args, - self.extra_args, - ) - - run_method = getattr(self.DLL, self.kernel_name) - - # Retrieve workspace_size and initialize workspace. - run_method( - *args, # input ptrs and output ptrs - *self.extra_args, - ) - - # Generate partial function. 
- return functools.partial( - run_method, - *args, - *self.extra_args, - None, # null workspace size ptr - None, # set workspace ptr, TODO: update it to a real ptr if workspace_size > 0 - ) - - def __str__(self) -> str: - return f"{self.kernel_name=}, {self.source_file=}, {self.hash_key=}" \ No newline at end of file diff --git a/PyTorchSimFrontend/llvm/llvm_codegen_backend.py b/PyTorchSimFrontend/llvm/llvm_codegen_backend.py deleted file mode 100644 index 6951b5bd..00000000 --- a/PyTorchSimFrontend/llvm/llvm_codegen_backend.py +++ /dev/null @@ -1,1157 +0,0 @@ -import dataclasses -import contextlib -import sympy -import itertools -import re -from functools import reduce -from operator import mul -from typing import List -from typing import Dict -import torch -from torch._inductor import dependencies -from torch._inductor.codegen import cpp, wrapper, common -from torch._inductor.scheduler import BaseScheduling -from torch._inductor.virtualized import V, _ops as ops -from torch._inductor.utils import IndentedBuffer -import PyTorchSimFrontend.extension_codecache as extension_codecache - -from . import llvm_common -from . import llvm_lowering - -def reduction_alloc(code, stack, vars): - # FIXME. USE VARIABLES' TYPE... - REDUCTION_TYPE = "float" - REDUCTION_SIZE = 4 - for var in vars: - line = f"%{var} = alloca {REDUCTION_TYPE}, align {REDUCTION_SIZE}" - code.writeline(line) - -def matrix_reduction_alloc(code, stack, vars, tile_row): - # FIXME. USE VARIABLES' TYPE... 
- REDUCTION_TYPE = "float" - REDUCTION_SIZE = 4 - for var in vars: - line = f"%{var} = alloca {REDUCTION_TYPE}, i32 {tile_row}, align {REDUCTION_SIZE}" - code.writeline(line) - -def reduction_init(reduction_type, dtype): - if dtype in cpp.DTYPE_LOWP_FP: - # Since load promotes all half-precision inputs to float, the initial - # constant for reduction must be promoted as well - dtype = torch.float32 - if reduction_type in ("xor_sum", "sum", "any"): - return "0.0" - if reduction_type == "prod": - return "1.0" - if reduction_type in {"max", "argmax"}: - return "0.0" - if reduction_type in {"min", "argmin"}: - return "0.0" - raise AssertionError(reduction_type) - -def reduction_combine(reduction_type, var, next_value): - if reduction_type == "sum": - return f"fadd float %{var}, %{next_value}" - if reduction_type == "prod": - return f"fmul float %{var}, %{next_value}" - if reduction_type == "xor_sum": - raise NotImplementedError() # TODO: implement - if reduction_type == "any": - raise NotImplementedError() - if reduction_type in ("min", "max"): - raise NotImplementedError() - if reduction_type == "welford_reduce": - raise NotImplementedError() - if reduction_type == "welford_combine": - raise NotImplementedError() - raise AssertionError(reduction_type) - -def vector_reduction_combine(reduction_type, start_value, vector_value): - if reduction_type == "sum": - return f"tail call float @llvm.vector.reduce.fadd.nxv2f32(float %{start_value}, %{vector_value})" - if reduction_type == "prod": - return f"tail call float @llvm.vector.reduce.fmul.nxv2f32(float %{start_value}, %{vector_value})" - if reduction_type == "xor_sum": - raise NotImplementedError() # TODO: implement - if reduction_type == "any": - raise NotImplementedError() - if reduction_type in ("min", "max"): - raise NotImplementedError() - if reduction_type == "welford_reduce": - raise NotImplementedError() - if reduction_type == "welford_combine": - raise NotImplementedError() - raise AssertionError(reduction_type) 
- -def matrix_reduction_combine(reduction_type, start_value, vector_value, tile_row=64): - if reduction_type == "sum": - return f"fadd <{tile_row} x float> %{start_value}, %{vector_value}" - if reduction_type == "prod": - return f"fmul <{tile_row} x float> %{start_value}, %{vector_value}" - if reduction_type == "xor_sum": - raise NotImplementedError() # TODO: implement - if reduction_type == "any": - raise NotImplementedError() - if reduction_type in ("min", "max"): - raise NotImplementedError() - if reduction_type == "welford_reduce": - raise NotImplementedError() - if reduction_type == "welford_combine": - raise NotImplementedError() - raise AssertionError(reduction_type) - -def matrix_partial_reduction_combine(reduction_type, vector_value, tile_row=64): - if reduction_type == "sum": - return f"tail call float @llvm.vector.reduce.fadd.nxv2f32(float 0.0, <{tile_row} x float> %{vector_value})" - if reduction_type == "prod": - return f"tail call float @llvm.vector.reduce.fmul.nxv2f32(float 1.0, <{tile_row} x float> %{vector_value})" - if reduction_type in ("min", "max"): - return f"tail call float @llvm.vector.reduce.f{reduction_type}.nxv2f32(<{tile_row} x float> %{vector_value})" - raise AssertionError(reduction_type) - -class ExtensionWrapperCodegen(wrapper.WrapperCodeGen): - def __init__(self): - super().__init__() - - def write_header(self): - self.header.splice( - f""" - from ctypes import c_void_p, c_long - import torch - import math - import random - import os - import tempfile - from math import inf, nan - from torch._inductor.hooks import run_intermediate_hooks - from torch._inductor.utils import maybe_profile - from torch._inductor.codegen.memory_planning import _align as align - - from torch import device, empty, empty_strided - from {extension_codecache.__name__} import CustomAsyncCompile - from torch._inductor.select_algorithm import extern_kernels - - aten = torch.ops.aten - inductor_ops = torch.ops.inductor - assert_size_stride = 
torch._C._dynamo.guards.assert_size_stride - alloc_from_pool = torch.ops.inductor._alloc_from_pool - reinterpret_tensor = torch.ops.aten._reinterpret_tensor - async_compile = CustomAsyncCompile() - - """ - ) - -class ExtensionOverrides(common.OpOverrides): - """Map element-wise ops to LLVM IR""" - - @staticmethod - def add(operand1, operand2, **kwargs): - return f'fadd float %{operand1}, %{operand2}' # TODO: separate float and integer - - @staticmethod - def sub(operand1, operand2, **kwargs): - return f'fsub float %{operand1}, %{operand2}' - - @staticmethod - def mul(operand1, operand2, **kwargs): - return f'fmul float %{operand1}, %{operand2}' - - @staticmethod - def div(operand1, operand2, **kwargs): - return f'fdiv float %{operand1}, %{operand2}' - -class VectorOverrides(ExtensionOverrides): - @staticmethod - def vector_add(operand1, operand2, **kwargs): - return f'fadd %{operand1}, %{operand2}' - - @staticmethod - def vector_sub(operand1, operand2, **kwargs): - return f'fsub %{operand1}, %{operand2}' - - @staticmethod - def vector_mul(operand1, operand2, **kwargs): - return f'fmul %{operand1}, %{operand2}' - - @staticmethod - def vector_div(operand1, operand2, **kwargs): - return f'fdiv %{operand1}, %{operand2}' - -class MatrixOverrides(ExtensionOverrides): - @staticmethod - def add(operand1, operand2, tile_size=16): - return f'fadd <{tile_size} x float> %{operand1}, %{operand2}' - - @staticmethod - def sub(operand1, operand2, tile_size=4): - return f'fsub <{tile_size} x float> %{operand1}, %{operand2}' - - @staticmethod - def mul(operand1, operand2, tile_size=4): - return f'fmul <{tile_size} x float> %{operand1}, %{operand2}' - - @staticmethod - def div(operand1, operand2, tile_size=4): - return f'fdiv <{tile_size} x float> %{operand1}, %{operand2}' - - @staticmethod - def truediv(operand1, operand2, tile_size=4): - return f'fdiv <{tile_size} x float> %{operand1}, %{operand2}' - - @staticmethod - def constant(value, dtype, tile_size=4): - return 
f'insertelement <1 x float> undef, float {value}, i32 0' - - @staticmethod - def exp(operand, tile_size=4): - return f'tail call <{tile_size} x float> @llvm.exp.f32(<{tile_size} x float> %{operand})' - - @staticmethod - def maximum(operand1, operand2, tile_size=4): - return f'tail call <{tile_size} x float> @llvm.maximum.f32(<{tile_size} x float> %{operand1}, <{tile_size} x float >%{operand2})' - - @staticmethod - def relu(x, tile_size=4): - return ops.maximum(x, ops.constant(0.0, "f32")) - -SYMPY_TO_LLVM = { - sympy.core.mul.Mul: "mul", - sympy.core.add.Add: "add", -} - -class LLVMKernel(llvm_common.BaseLLVMKernel): - overrides = ExtensionOverrides - newvar_prefix = "%" - - def __init__(self, args=None): - super().__init__(llvm_common.LLVMKernelArgs()) - self.call_ranges = None - self.ranges = None - self.itervars = None - self.reduction_depth = None - self.reduction_prefix = IndentedBuffer() - self.reduction_suffix = IndentedBuffer() - self.reduction_vars = {} - self.reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc") - self.index_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_idx") - self.loop_info = {} - self.load_desc = {} - self.store_desc = {} - self.tiling_indices = [0, 1] - - def get_constant_vector(self, expr): - constant_vector = [int(expr.coeff(var)) for var in self.itervars] - return constant_vector - - def add_desc(self, is_load, base_addr, element_size, stride_list, tile_size): - if is_load: - key = f"load{len(self.load_desc)}" - self.load_desc[key] = { - "base_addr": base_addr, - "element_size": element_size, - "stride_list": stride_list, - "tile_size": tile_size, - "tile_stride": stride_list[-2:] - } - else: - key = f"store{len(self.store_desc)}" - self.store_desc[key] = { - "base_addr": base_addr, - "element_size": element_size, - "stride_list": stride_list, - "tile_size": tile_size, - "tile_stride": stride_list[-2:] - } - - def depth_first_traverse(self, expr, buffer, cse): - child_var = [] - for 
arg in expr.args: - child_var.append(self.depth_first_traverse(arg, buffer, cse)) - - while len(child_var) >= 3: - first = child_var.pop(0) - second = child_var.pop(0) - first_prefix = "" if first.is_number else "%" - second_prefix = "" if second.is_number else "%" - - line = f"{SYMPY_TO_LLVM[expr.func]} nsw i64 {first_prefix}{first}, {second_prefix}{second}" - var = cse.generate(buffer, line) - var = sympy.symbols(f"{var}") - child_var.append(var) - - if len(expr.args) == 0: - return expr - - elif len(child_var) == 2: - first = child_var[1] - second = child_var[0] - first_prefix = "" if first.is_number else "%" - second_prefix = "" if second.is_number else "%" - line = f"{SYMPY_TO_LLVM[expr.func]} nsw i64 {first_prefix}{first}, {second_prefix}{second}" - var = cse.generate(buffer, line) - var = sympy.symbols(f"{var}") - return var - else: - raise Exception() - - def codegen_nodes(self, nodes, kernel_name): - _, (group, reduction_group) = max( - nodes, key=lambda x: int(x.is_reduction()) - ).group - - def select_tiling_indices(): - all_index = [] - for node in nodes: - rw = dependencies.extract_read_writes(node._body, *node._sizes) - all_index += [dep.index for dep in itertools.chain(rw.reads, rw.writes)] - contig_vars = set() - contig_vars_list = [] - non_contig_stride_const = set() - non_contig_stride_other = set() - for index in all_index: - for var in index.free_symbols: - if not re.search(r"^d\d+$", var.name): - continue - stride = cpp.stride_at(var, index) - if stride == 1: - contig_vars.add(int(var.name[1:])) - contig_vars_list.append(int(var.name[1:])) - elif all(s.name.startswith("s") for s in stride.free_symbols): - non_contig_stride_const.add(int(var.name[1:])) - else: - non_contig_stride_other.add(int(var.name[1:])) - contig_only = ( - contig_vars - non_contig_stride_const - non_contig_stride_other - ) - if len(contig_vars) == 0: - # no contiguous vars - return [len(self.itervars) - 1] - if contig_only: - return sorted(contig_only)[-1:] - 
contig_and_const_stride = ( - contig_vars & non_contig_stride_const - ) - non_contig_stride_other - contig_vars_sorted = sorted(contig_vars) - if ( - len(contig_vars_sorted) == 2 - and contig_vars_sorted[-1] in contig_and_const_stride - and contig_vars_sorted[-1] == len(self.itervars) - 1 - ): - return contig_vars_sorted - return sorted(contig_vars_sorted, key=contig_vars_list.count)[-1:] - - self.set_ranges(group, reduction_group) - self.tiling_indices = select_tiling_indices() - - with self as kernel: - for node in nodes: - vars, reduction_vars = kernel.set_ranges(group, reduction_group) - node.run(vars, reduction_vars) - - src_code = self.codegen_kernel(kernel_name=kernel_name) - self.meta_kernel() - return src_code - - def load(self, name: str, index: sympy.Expr): - index = self.rename_indexing(index) - index = self.depth_first_traverse(index, self.loads, self.index_cse) - var = self.args.input(name) - dtype = V.graph.get_dtype(name) - type_name = llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - line = f"getelementptr inbounds {type_name}, ptr %{var}, i64 %{index}" - var = self.cse.generate(self.loads, line) - line = f"load {type_name}, ptr %{var}, align {align}" - return self.cse.generate(self.loads, line) - - def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): - index = self.rename_indexing(index) - index = self.depth_first_traverse(index, self.stores, self.index_cse) - var = self.args.output(name) - dtype = V.graph.get_dtype(name) - type_name = llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - line = f"getelementptr inbounds {type_name}, ptr %{var}, i64 %{index}" - var = self.cse.generate(self.stores, line) - if (isinstance(value, list)): - value = value[1] - line = f"store {type_name} %{value}, ptr %{var}, align {align}" - self.cse.generate(self.stores, line, assignment = False) - - def reduction(self, dtype, src_dtype, reduction_type, value): - argmax_or_argmin = reduction_type in 
{"argmax", "argmin"} - if argmax_or_argmin: - raise NotImplementedError() #TODO: argmin, argmax - else: - reduction_key = src_dtype, reduction_type, value - acc = self.reduction_cse.generate( - self.loads, f"reduction {reduction_key}", write=False - ) - self.reduction_vars[acc] = reduction_type - type_name = llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - self.reduction_prefix.writeline(f"store {type_name} {reduction_init(reduction_type, dtype)}, ptr %{acc}, align {align}") - line = f"load {type_name}, ptr %{acc}, align {align}" - - # NOTE. To keep below line be under the compute, used store buffers - temp = self.cse.generate(self.stores, line) - output = self.cse.generate(self.stores, reduction_combine(reduction_type, temp, value)) - line = f"store {type_name} %{output}, ptr %{acc}, align {align}" - self.cse.generate(self.stores, line, assignment = False) - self.reduction_cse.reduction_cache[reduction_key] = acc - return acc - - def store_reduction(self, name, index, value): - index = self.rename_indexing(index) - index = self.depth_first_traverse(index, self.reduction_suffix, self.index_cse) - var = self.args.output(name) - dtype = V.graph.get_dtype(name) - type_name = llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - line = f"load {type_name}, ptr %{value}, align {align}" - value = self.reduction_cse.generate(self.reductions_suffix, line) - line = f"getelementptr inbounds {type_name}, ptr %{var}, i64 %{index}" - var = self.cse.generate(self.reductions_suffix, line) - line = f"store {type_name} %{value}, ptr %{var}, align {align}" - self.cse.generate(self.reductions_suffix, line, assignment = False) - - def codegen_loops(self): - code = common.BracesBuffer() - # Loop body part - loops = [LoopLevel(var, size, idx) for idx, (var, size) in enumerate(zip(self.itervars, self.ranges))] - loops, reductions = [LoopNest(loops[: self.reduction_depth]), - LoopNest(loops[self.reduction_depth :])] - 
reductions.mark_reduction(self.reduction_vars) - - with contextlib.ExitStack() as stack: - if self.reduction_vars: - reduction_alloc(code, stack, self.reduction_vars) - loops.codegen(code, stack) - with contextlib.ExitStack() as stack_outer: - code.splice(self.reduction_prefix) - with contextlib.ExitStack() as stack: - reductions.codegen(code, stack) - code.splice(self.loads) - code.splice(self.compute) - code.splice(self.stores) - code.splice(self.reductions_suffix) - code.writeline(f"ret void") - return code - - def codegen_kernel(self, kernel_name): - wrapper = V.graph.wrapper_code - arg_defs, _, _ = self.args.llvm_argdefs() - code = self._codegen_kernel(arg_defs, kernel_name) - return code.getvalue() - - def meta_kernel(self): - wrapper = V.graph.wrapper_code - _, _, arg_attributes = self.args.llvm_argdefs() - wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')') - wrapper.add_import_once(f'\nfrom extension_codecache import CustomAsyncCompile') - wrapper.add_import_once(f'\ncustom_async_compile = CustomAsyncCompile()') - # Dump loop and load/store information - wrapper.add_import_once(f"loop_info = {self.loop_info}") - wrapper.add_import_once(f"load_tile_info = {self.load_desc}") - wrapper.add_import_once(f"store_tile_info = {self.store_desc}") - wrapper.add_import_once(f"arg_attributes = {arg_attributes}") - - - def call_kernel(self, kernel_name): - wrapper = V.graph.wrapper_code - _, call_args, arg_attributes = self.args.llvm_argdefs() - # generate the code to call this - wrapper.generate_kernel_call(kernel_name, call_args, cuda=False) - - def _codegen_kernel(self, arg_defs, kernel_name): - arg_defs = ",\n".ljust(25).join(arg_defs) - code = common.BracesBuffer() - - # Todo. 
kernel name custom - kernel_decl_name = kernel_name if V.graph.cpp_wrapper else "kernel" - code.writeline(f'define void @{kernel_decl_name}({arg_defs})') - with code.indent(): - for old, new in self.args.aliases(): - code.writeline(f"auto {old} = {new};") - # Loop body part - code.splice(self.codegen_loops()) - code.writeline(f'declare i64 @llvm.umax.i64(i64, i64) #1') - code.writeline(f'declare i32 @llvm.umax.i32(i32, i32) #1') - return code - - - def set_ranges(self, lengths, reduction_lengths): - if self.call_ranges: - assert self.call_ranges == tuple(lengths) + tuple( - reduction_lengths - ), f"{self.call_ranges} == {tuple(lengths)} + {tuple(reduction_lengths)}" - assert self.reduction_depth == len(lengths) - else: - self.call_ranges = tuple(lengths) + tuple(reduction_lengths) - self.ranges = [self.rename_indexing(x) for x in self.call_ranges] - self.itervars = [sympy.Symbol(f"index{n}") for n in range(len(self.ranges))] - self.reduction_depth = len(lengths) - return ( - self.itervars[: self.reduction_depth], - self.itervars[self.reduction_depth :], - ) - -class VectorizedLLVMKernel(LLVMKernel): - overrides = VectorOverrides - - def __init__(self): - super().__init__() - self.vector_loads = IndentedBuffer() - self.vector_stores = IndentedBuffer() - self.vector_index_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_vec_idx") - self.vector_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="vector_body") - self.vector_reduction_cse = common.CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc") - - def load(self, name: str, index: sympy.Expr): - scalar_var = super().load(name, index) - index = self.rename_indexing(index) - index = index.replace(sympy.symbols(f"index{len(self.itervars)-1}"), sympy.symbols(f"vector.index{len(self.itervars)-1}")) - index = self.depth_first_traverse(index, self.vector_loads, self.vector_index_cse) - var = self.args.input(name) - dtype = V.graph.get_dtype(name) - type_name = 
llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - line = f"getelementptr inbounds {type_name}, ptr %{var}, i64 %{index}" - var = self.vector_cse.generate(self.vector_loads, line) - - # NOTE. Since clang 16.0 always used this constant, 2 is hard coded - line = f"load , ptr %{var}, align {align}" - return [self.vector_cse.generate(self.vector_loads, line), scalar_var] - - def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): - super().store(name, index, value, *args, **kwargs) - index = self.rename_indexing(index) - index = index.replace(sympy.symbols(f"index{len(self.itervars)-1}"), sympy.symbols(f"vector.index{len(self.itervars)-1}")) - index = self.depth_first_traverse(index, self.vector_stores, self.vector_index_cse) - var = self.args.output(name) - dtype = V.graph.get_dtype(name) - type_name = llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - line = f"getelementptr inbounds {type_name}, ptr %{var}, i64 %{index}" - var = self.vector_cse.generate(self.vector_stores, line) - - # NOTE. Since clang 16.0 always used this constant, 2 is hard coded - if (isinstance(value, list)): - value = value[0] - line = f"store %{value}, ptr %{var}, align {align}" - self.vector_cse.generate(self.vector_stores, line, assignment = False) - - def reduction(self, dtype, src_dtype, reduction_type, value): - super().reduction(dtype, src_dtype, reduction_type, value[1]) - argmax_or_argmin = reduction_type in {"argmax", "argmin"} - if argmax_or_argmin: - raise NotImplementedError() #TODO: argmin, argmax - else: - reduction_key = src_dtype, reduction_type, value - acc = self.vector_reduction_cse.generate( - self.loads, f"reduction {reduction_key}", write=False - ) - type_name = llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - line = f"load {type_name}, ptr %{acc}, align {align}" - - # NOTE. 
To keep lines below under the compute lines, used store buffers - temp = self.vector_cse.generate(self.vector_stores, line) - output = self.vector_cse.generate(self.vector_stores, vector_reduction_combine(reduction_type, temp, value[0])) - line = f"store {type_name} %{output}, ptr %{acc}, align {align}" - self.vector_cse.generate(self.vector_stores, line, assignment = False) - return acc - - def codegen_loops(self): - code = common.BracesBuffer() - # Loop arguments - loops_args = [[var, size, idx] for idx, (var, size) in enumerate(zip(self.itervars, self.ranges))] - - # Loop initialize - loops_list = [LoopLevel(*args) for args in loops_args[:-1]] - vector_loops_list = [VectorLoopLevel(*loops_args[-1])] - scalar_loop = [LoopLevel(loops_args[-1][0], loops_args[-1][1], loops_args[-1][2], f"%scalar.index.ph")] - - loops = LoopNest(loops_list[: self.reduction_depth]) - reductions = LoopNest(loops_list[self.reduction_depth :]) - inner_most = LoopNest(vector_loops_list) - inner_most_scalar = LoopNest(scalar_loop) - - reductions.mark_reduction(self.reduction_vars) - - with contextlib.ExitStack() as stack: - if self.reduction_vars: - reduction_alloc(code, stack, self.reduction_vars) - loops.codegen(code, stack) - code.splice(self.reduction_prefix) - with contextlib.ExitStack() as stack: - reductions.codegen(code, stack) - with contextlib.ExitStack() as stack_inner: - inner_most.codegen(code,stack_inner) - code.splice(self.vector_loads) - code.splice(self.vector_compute) - code.splice(self.vector_stores) - - with contextlib.ExitStack() as stack_inner: - inner_most_scalar.codegen(code, stack_inner) - code.splice(self.loads) - code.splice(self.compute) - code.splice(self.stores) - code.splice(self.reductions_suffix) - code.writeline(f"ret void") - return code - - def _codegen_kernel(self, arg_defs, kernel_name): - code = super()._codegen_kernel(arg_defs, kernel_name) - # Add vector llvm intrinsics definition - code.writeline(f'declare i64 @llvm.vscale.i64() #2') - 
code.writeline(f'declare i32 @llvm.vscale.i32() #2') - return code - -class MatrixLLVMKernel(LLVMKernel): - overrides = MatrixOverrides - - def __init__(self): - super().__init__() - # Defaulat tile setting - self.tile_row = 64 - self.tile_col = 64 - self.tile_size = self.tile_row * self.tile_col - - def get_load_info(self, vec_len, cv): - tile_row = self.tile_row - tile_col = self.tile_col - if vec_len > 1: - stride = reduce(mul, cv, 1) - if len(cv) == 2 and cv[1] == 0: # if the tile is row major vector - vec_len, tile_col, stride = self.tile_row, 1, self.tile_row - elif len(cv) == 2 and cv[0] == 0: # if the tile is colum major vector - vec_len, tile_row, stride = self.tile_col, 1, 1 - elif len(cv) == 1: # if the tile is vector - stride = tile_row - elif vec_len == 1: # scalar - tile_row, tile_col, stride = 1, 1, 1 - return tile_row, tile_col, stride, vec_len - - def need_vec_transpose(self, index): - return ( - len(self.itervars) > 1 - and self.outer_idx is not None - and cpp.stride_at(self.itervars[self.outer_idx], index) == 1 - and index.has(self.itervars[self.tiling_idx]) - and not cpp.stride_at(self.itervars[self.tiling_idx], index).has( - self.itervars[self.tiling_idx] - ) - and not cpp.stride_at(self.itervars[self.tiling_idx], index).has( - self.itervars[self.outer_idx] - ) - ) - - def load(self, name: str, index: sympy.Expr): - var = self.args.input(name) - dtype = V.graph.get_dtype(name) - type_name = llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - - index = self.rename_indexing(index) - cv = self.get_constant_vector(index) - self.add_desc(True, name, align, cv, [self.tile_col, self.tile_row]) - new_index = self.depth_first_traverse(index, self.loads, self.index_cse) - vec_len = self.tile_size if not new_index.is_number else 1 - new_index = f"%{new_index}" if not new_index.is_number else new_index - line = f"getelementptr inbounds {type_name}, ptr %{var}, i64 {new_index}" - var = self.cse.generate(self.loads, line) - tile_row, 
tile_col, stride, vec_len = self.get_load_info(vec_len, cv) - line = f"call <{vec_len} x {type_name}> @llvm.matrix.column.major.load.v{vec_len}f32.p0f32(ptr %{var}, i64 {stride}, i1 0, i32 {tile_row}, i32 {tile_col})" - out_var = self.cse.generate(self.loads, line) - if self.need_vec_transpose(index): - line = f"call <{vec_len} x {type_name}> @llvm.matrix.transpose.v{vec_len}f32.p0f32(<{vec_len} x {type_name}> %{out_var}, i32 {tile_row}, i32 {tile_col})" - out_var = self.cse.generate(self.loads, line) - self.tile_shape[out_var] = [tile_row, tile_col] - return out_var - - def store(self, name: str, index: sympy.Expr, value, *args, **kwargs): - var = self.args.output(name) - dtype = V.graph.get_dtype(name) - type_name = llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - - index = self.rename_indexing(index) - cv = self.get_constant_vector(index) - self.add_desc(False, name, align, cv, [self.tile_col, self.tile_row]) - new_index = self.depth_first_traverse(index, self.stores, self.index_cse) - vec_len = self.tile_size if not new_index.is_number else 1 - new_index = f"%{new_index}" if not new_index.is_number else new_index - line = f"getelementptr inbounds {type_name}, ptr %{var}, i64 {new_index}" - var = self.cse.generate(self.stores, line) - if (isinstance(value, list)): - value = value[0] - if self.need_vec_transpose(index): - line = f"call <{vec_len} x {type_name}> @llvm.matrix.transpose.v{vec_len}f32.p0f32(<{vec_len} x {type_name}> %{value}, i32 {self.tile_row}, i32 {self.tile_col})" - value = self.cse.generate(self.stores, line) - if vec_len > 1: - stride = self.ranges[-1] - line = f"call void @llvm.matrix.column.major.store.v{self.tile_size}f32.p0f32(<{self.tile_size} x {type_name}> %{value}, ptr %{var}, i64 {stride}, i1 0, i32 {self.tile_row}, i32 {self.tile_col})" - elif vec_len == 1: - line = f"call void @llvm.matrix.column.major.store.v1f32.p0f32(<1 x {type_name}> %{value}, ptr %{var}, i64 1, i1 0, i32 1, i32 1)" - 
self.cse.generate(self.stores, line, assignment = False) - - def reduction(self, dtype, src_dtype, reduction_type, value): - argmax_or_argmin = reduction_type in {"argmax", "argmin"} - if argmax_or_argmin: - raise NotImplementedError() #TODO: argmin, argmax - else: - reduction_key = src_dtype, reduction_type, value - acc = self.reduction_cse.generate( - self.loads, f"reduction {reduction_key}", write=False - ) - self.reduction_vars[acc] = reduction_type - type_name = llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - comma = ", " - if (self.tile_col == 1): # if the tile is vector - self.reduction_prefix.writeline(f"store {type_name} {reduction_init(reduction_type, dtype)}, ptr %{acc}, align {align}") - line = f"load <{self.tile_col} x {type_name}>, ptr %{acc}, align {align}" - else: - array_line = [f"{type_name} {reduction_init(reduction_type, dtype)}" for _ in range(self.tile_row)] - self.reduction_prefix.writeline(f"store <{self.tile_row} x {type_name}> <{comma.join(array_line)}>, ptr %{acc}, align {align}") - line = f"load <{self.tile_row} x {type_name}>, ptr %{acc}, align {align}" - - # NOTE. 
To keep below line be under the compute, used store buffers - temp = self.cse.generate(self.stores, line) - if self.tiling_idx >= self.reduction_depth: # horizontal reduction - output = [] - for i in range(self.tile_col): - if self.tile_col != 1: - indexes = [f"i32 {i*self.tile_row+j}" for j in range(self.tile_row)] - line = f"shufflevector <{self.tile_size} x {type_name}> %{value}, <{self.tile_size} x {type_name}> undef, <{self.tile_row} x i32> <{comma.join(indexes)}>" - split_vector = self.cse.generate(self.stores, line) - reduced_vector = self.cse.generate(self.stores, matrix_partial_reduction_combine(reduction_type, split_vector, self.tile_row)) - else: - reduced_vector = self.cse.generate(self.stores, matrix_partial_reduction_combine(reduction_type, value, self.tile_row)) - output.append(reduced_vector) - length = len(output) - size = 1 - while(len(output) > 1): - op1 = output.pop(0) - op2 = output.pop(0) - if size == 1: - line = f"insertelement <2 x {type_name}> undef, {type_name} %{op1}, i32 0" - temp_vec = self.cse.generate(self.stores, line) - line = f"insertelement <2 x {type_name}> %{temp_vec}, {type_name} %{op2}, i32 1" - else: - indexes = [f"i32 {j}" for j in range(size * 2)] - line = f"shufflevector <{size} x {type_name}> %{op1}, <{size} x {type_name}> %{op2}, <{size * 2} x i32> <{comma.join(indexes)}>" - out = self.cse.generate(self.stores, line) - output.append(out) - if (len(output) == length / 2): - size *= 2 - length = len(output) - if (self.tile_col == 1): - line = f"insertelement <{self.tile_col} x {type_name}> undef, {type_name} %{output[0]}, i32 0" - cast_vec = self.cse.generate(self.stores, line) - line = f"fadd <{self.tile_col} x {type_name}> %{temp}, %{cast_vec}" - else: - line = f"fadd <{self.tile_row} x {type_name}> %{temp}, %{output[0]}" - stored_vector = self.cse.generate(self.stores, line) - else: - partial_value = temp - for i in range(self.tile_col): - if self.tile_col != 1: - indexes = [f"i32 {i*self.tile_row+j}" for j in 
range(self.tile_row)] - line = f"shufflevector <{self.tile_size} x {type_name}> %{value}, <{self.tile_size} x {type_name}> undef, <{self.tile_row} x i32> <{comma.join(indexes)}>" - split_vector = self.cse.generate(self.stores, line) - partial_value = self.cse.generate(self.stores, matrix_reduction_combine(reduction_type, partial_value, split_vector, self.tile_row)) - else: - partial_value = self.cse.generate(self.stores, matrix_reduction_combine(reduction_type, partial_value, value, self.tile_row)) - stored_vector = partial_value - if (self.tile_col == 1): - line = f"store <{self.tile_col} x {type_name}> %{stored_vector}, ptr %{acc}, align {align}" - else: - line = f"store <{self.tile_row} x {type_name}> %{stored_vector}, ptr %{acc}, align {align}" - self.cse.generate(self.stores, line, assignment = False) - self.reduction_cse.reduction_cache[reduction_key] = acc - return acc - - def store_reduction(self, name, index, value): - var = self.args.output(name) - dtype = V.graph.get_dtype(name) - type_name = llvm_common.DTYPE_TO_LLVM[dtype] - align = llvm_common.DTYPE_SIZE[dtype] - - index = self.rename_indexing(index) - cv = self.get_constant_vector(index) - self.add_desc(False, name, align, cv, [1, self.tile_row]) - index = self.depth_first_traverse(index, self.reduction_suffix, self.index_cse) - if (self.tile_col == 1): - line = f"load <{self.tile_col} x {type_name}>, ptr %{value}, align {align}" - else: - line = f"load <{self.tile_row} x {type_name}>, ptr %{value}, align {align}" - value = self.reduction_cse.generate(self.reductions_suffix, line) - if (self.tile_col == 1): - line = f"getelementptr inbounds {type_name}, ptr %{var}, i64 0" - else: - line = f"getelementptr inbounds {type_name}, ptr %{var}, i64 %{index}" - var = self.cse.generate(self.reductions_suffix, line) - stride = self.ranges[-1] - if (self.tile_col == 1): - line = f"call void @llvm.matrix.column.major.store.v{self.tile_col}f32.p0f32(<{self.tile_col} x {type_name}> %{value}, ptr %{var}, i64 1, i1 
0, i32 {self.tile_col}, i32 1)" - else: - line = f"call void @llvm.matrix.column.major.store.v{self.tile_row}f32.p0f32(<{self.tile_row} x {type_name}> %{value}, ptr %{var}, i64 {stride}, i1 0, i32 {self.tile_row}, i32 1)" - self.cse.generate(self.reductions_suffix, line, assignment = False) - - def codegen_loops(self): - code = common.BracesBuffer() - self.loop_info = {} - # Loop body part - loops_args = [[var, size, idx] for idx, (var, size) in enumerate(zip(self.itervars, self.ranges))] - outer_loops = [LoopLevel(var, size, idx) for var, size, idx in loops_args[:-2]] - loops = [MatrixLoopLevel(var, size, idx, tile_row=self.tile_row) for var, size, idx in loops_args[-2:]] - loops = outer_loops + loops - loops, reductions = [LoopNest(loops[: self.reduction_depth]), - LoopNest(loops[self.reduction_depth :])] - reductions.mark_reduction(self.reduction_vars) - - with contextlib.ExitStack() as stack: - if self.reduction_vars: - matrix_reduction_alloc(code, stack, self.reduction_vars, self.tile_row) - self.loop_info.update(loops.codegen(code, stack)) - with contextlib.ExitStack() as stack_outer: - code.splice(self.reduction_prefix) - with contextlib.ExitStack() as stack: - self.loop_info.update(reductions.codegen(code, stack)) - code.splice(self.loads) - code.splice(self.compute) - code.splice(self.stores) - code.splice(self.reductions_suffix) - code.writeline(f"ret void") - return code - - def set_ranges(self, lengths, reduction_lengths): - ret = super().set_ranges(lengths, reduction_lengths) - # do vertical reduction as the tail loop - if len(self.itervars) > 1: - if len(self.tiling_indices) == 1: - self.tiling_idx = self.tiling_indices[0] - self.outer_idx = None - else: - self.outer_idx, self.tiling_idx = ( - self.tiling_indices - if self.tiling_indices[1] < self.reduction_depth - else reversed(self.tiling_indices) - ) - else: - self.tiling_idx = self.tiling_indices[0] - self.outer_idx = None - - # FIXME. this doesn't look pretty... 
- # We have to change this logic to configurable tile_size - if len(self.itervars) == 1: - self.tile_row = min(self.tile_size, self.ranges[0]) - self.tile_col = 1 - elif len(self.itervars) > 1: - self.tile_row = min(self.tile_row, self.ranges[0]) - self.tile_col = min(self.tile_col, self.ranges[1]) - self.tile_size = self.tile_row * self.tile_col - return ret - - def _codegen_kernel(self, arg_defs, kernel_name): - code = super()._codegen_kernel(arg_defs, kernel_name) - # Add llvm matrix intrinsics definition - code.writeline(f'declare <{self.tile_size} x float> @llvm.matrix.column.major.load.v{self.tile_size}f32.p0f32(ptr , i64, i1, i32, i32) #2') - if self.tile_size != self.tile_row: - code.writeline(f'declare <{self.tile_row} x float> @llvm.matrix.column.major.load.v{self.tile_row}f32.p0f32(ptr , i64, i1, i32, i32) #2') - code.writeline(f'declare <1 x float> @llvm.matrix.column.major.load.v1f32.p0f32(ptr , i64, i1, i32, i32) #2') - code.writeline(f'declare void @llvm.matrix.column.major.store.v1f32.p0f32(<1 x float>, ptr , i64, i1, i32, i32) #3') - code.writeline(f'declare <{self.tile_size} x float> @llvm.matrix.multiply.v{self.tile_size}f32.v16f32.v16f32(<16 x float>, <16 x float>, i32, i32, i32) #1') - code.writeline(f'declare void @llvm.matrix.column.major.store.v{self.tile_size}f32.p0f32(<{self.tile_size} x float>, ptr , i64, i1, i32, i32) #3') - if self.tile_size != self.tile_row: - code.writeline(f'declare void @llvm.matrix.column.major.store.v{self.tile_row}f32.p0f32(<{self.tile_row} x float>, ptr , i64, i1, i32, i32) #3') - code.writeline(f'declare <{self.tile_size} x float> @llvm.matrix.transpose.v{self.tile_size}f32.p0f32(<{self.tile_size} x float>, i32, i32) #2') - code.writeline(f'declare float @llvm.vector.reduce.fadd.nxv2f32(float, <{self.tile_row} x float>)') - code.writeline(f'declare float @llvm.vector.reduce.fmax.nxv2f32(<{self.tile_row} x float>)') - code.writeline(f'declare <{self.tile_size} x float> @llvm.exp.f32(<{self.tile_size} x float>) 
#1') - code.writeline(f'declare <{self.tile_size} x float> @llvm.maximum.f32(<{self.tile_size} x float>, <{self.tile_size} x float>) #1') - return code - - -@dataclasses.dataclass -class LoopLevel: - var: sympy.Expr - size: sympy.Expr - idx: int - start: int = 0 - reduction_vars: Dict[str, str] = None - - # Todo. Type change for reduction - INDEX_TYPE = "i64" - INDEX_SIZE = 8 - - def lines(self, line, stride=1): - loop_index = self.idx - self.stride = stride - @contextlib.contextmanager - def ctx(): - entry_label = f"entry{loop_index}" - for_body_label = f"for.body{loop_index}" - for_inc_label = f"for.inc{loop_index}" - for_end_label = f"for.end{loop_index}" - - index = f"%index{loop_index}" - index_next = f"%index.next{loop_index}" - cmp_var = f"%cmp{loop_index}" - - line.writeline(f"br label %{entry_label}") - line.writeline(f"\n{entry_label}:") - line.writeline(f"br label %{for_body_label}") - - line.writeline(f"\n{for_body_label}:") - line.writeline(f"{index} = phi {self.INDEX_TYPE} [ {self.start}, %{entry_label} ], [ {index_next}, %{for_inc_label} ]") - yield - line.writeline(f"br label %{for_inc_label}") - line.writeline(f"\n{for_inc_label}:") - line.writeline(f"{index_next} = add nsw {self.INDEX_TYPE} {index}, {stride}") - line.writeline(f"{cmp_var} = icmp eq {self.INDEX_TYPE} {index_next}, {self.size}") - line.writeline(f"br i1 {cmp_var}, label %{for_end_label}, label %{for_body_label}") - - line.writeline(f"\n{for_end_label}:") - return ctx() - -@dataclasses.dataclass -class VectorLoopLevel(LoopLevel): - var: sympy.Expr - size: sympy.Expr - idx: int - reduction_vars: Dict[str, str] = None - - DATA_TYPE = "i32" - DATA_SIZE = 4 - def lines(self, line, stride=1): - loop_index = self.idx - self.stride = stride # FIXME. vector type can't be determined in this time... 
- @contextlib.contextmanager - def ctx(): - # Label definition - entry_label = f"vector.entry{loop_index}" - vector_body_label = f"vector.body{loop_index}" - ph_label = f"vector.ph{loop_index}" - middle_label = f"middle.block{loop_index}" - preheader_label = f"vector.for.body.preheader{loop_index}" - for_body_label = f"vector.for.body{loop_index}" - func_ret_label = f"for.end{loop_index}" - - # Variable definition - entry_var0 = f"%entry_var.0" - entry_var1 = f"%entry_var.1" - entry_var2 = f"%entry_var.2" - min_iter_check = f"%min.iters.check" - ph_var0 = f"%ph_var.0" - ph_var1 = f"%ph_var.1" - ph_mod = f"%n_mod.vf" - ph_vec = f"%n.vec" - ph_stride0 = f"%ph_stride.0" - ph_stride1 = f"%ph_stride.1" - ph_stride2 = f"%ph_stride.2" - min_iter_check = f"%min.iters.check{loop_index}" - idx = f"%vector.index{loop_index}" - idx_next = f"%vector.index.next{loop_index}" - loop_condition = f"%vector.condition{loop_index}" - scalar_condition = f"%scalar.condition" - scalar_index_ph = f"%scalar.index.ph" - - line.writeline(f"br label %{entry_label}") - line.writeline(f"\n{entry_label}:") - line.writeline(f"{entry_var0} = tail call {self.INDEX_TYPE} @llvm.vscale.{self.INDEX_TYPE}()") - line.writeline(f"{entry_var1} = shl nuw nsw {self.INDEX_TYPE} {entry_var0}, 2") - line.writeline(f"{entry_var2} = tail call {self.INDEX_TYPE} @llvm.umax.{self.INDEX_TYPE}({self.INDEX_TYPE} {entry_var1}, {self.INDEX_TYPE} 16)") - line.writeline(f"{min_iter_check} = icmp ugt {self.INDEX_TYPE} {entry_var2}, {self.size}") - line.writeline(f"br i1 {min_iter_check}, label %{preheader_label}, label %{ph_label}") - - # Vector loop body part - line.writeline(f"\n{ph_label}:") - line.writeline(f"{ph_var0} = tail call {self.INDEX_TYPE} @llvm.vscale.{self.INDEX_TYPE}()") - line.writeline(f"{ph_var1} = shl nuw nsw {self.INDEX_TYPE} {ph_var0}, 2") # FIXME. 2 is hardcoded... 
- line.writeline(f"{ph_mod} = urem {self.INDEX_TYPE} {self.size}, {ph_var1}") - line.writeline(f"{ph_vec} = sub nuw nsw {self.INDEX_TYPE} {self.size}, {ph_mod}") - line.writeline(f"{ph_stride0} = tail call {self.DATA_TYPE} @llvm.vscale.{self.DATA_TYPE}()") - line.writeline(f"{ph_stride1} = shl nuw nsw {self.DATA_TYPE} {ph_stride0}, 1") - line.writeline(f"{ph_stride2} = zext {self.DATA_TYPE} {ph_stride1} to {self.INDEX_TYPE}") - - line.writeline(f"br label %{vector_body_label}") - line.writeline(f"\n{vector_body_label}:") - line.writeline(f"{idx} = phi {self.INDEX_TYPE} [ 0, %{ph_label} ], [ {idx_next}, %{vector_body_label} ]") - yield - - # Increment & condition check part - line.writeline(f"{idx_next} = add nuw {self.INDEX_TYPE} {idx}, {ph_stride2}") - line.writeline(f"{loop_condition} = icmp eq {self.INDEX_TYPE} {idx_next}, {ph_vec}") - line.writeline(f"br i1 {loop_condition}, label %{middle_label}, label %{vector_body_label}") - - line.writeline(f"\n{middle_label}:") - line.writeline(f"{scalar_condition} = icmp eq {self.INDEX_TYPE} {ph_mod}, 0") - line.writeline(f"br i1 {scalar_condition}, label %{func_ret_label}, label %{preheader_label}") - - line.writeline(f"\n{preheader_label}:") - line.writeline(f"{scalar_index_ph} = phi {self.INDEX_TYPE} [ 0, %{entry_label} ], [ {ph_vec}, %{middle_label} ]") - return ctx() - -@dataclasses.dataclass -class MatrixLoopLevel(LoopLevel): - var: sympy.Expr - size: sympy.Expr - idx: int - start: int = 0 - tile_row: int = 4 - reduction_vars: Dict[str, str] = None - - # Todo. 
Type change for reduction - INDEX_TYPE = "i64" - INDEX_SIZE = 8 - - def lines(self, line, stride=1): - loop_index = self.idx - self.stride = stride * self.tile_row # FIXME: this stride is not correct (it should be stride of tile, not stride of scalar) - @contextlib.contextmanager - def ctx(): - entry_label = f"entry{loop_index}" - for_body_label = f"for.body{loop_index}" - for_inc_label = f"for.inc{loop_index}" - for_end_label = f"for.end{loop_index}" - - index = f"%index{loop_index}" - index_next = f"%index.next{loop_index}" - cmp_var = f"%cmp{loop_index}" - - line.writeline(f"br label %{entry_label}") - line.writeline(f"\n{entry_label}:") - line.writeline(f"br label %{for_body_label}") - - line.writeline(f"\n{for_body_label}:") - line.writeline(f"{index} = phi {self.INDEX_TYPE} [ {self.start}, %{entry_label} ], [ {index_next}, %{for_inc_label} ]") - yield - line.writeline(f"br label %{for_inc_label}") - line.writeline(f"\n{for_inc_label}:") - line.writeline(f"{index_next} = add nsw {self.INDEX_TYPE} {index}, {self.tile_row}") - line.writeline(f"{cmp_var} = icmp eq {self.INDEX_TYPE} {index_next}, {self.size}") - line.writeline(f"br i1 {cmp_var}, label %{for_end_label}, label %{for_body_label}") - - line.writeline(f"\n{for_end_label}:") - return ctx() - -@dataclasses.dataclass -class LoopNest: - loops: List[LoopLevel] - - def __bool__(self): - return bool(self.loops) - - def mark_reduction(self, reduction_vars): - for loop in self.loops: - loop.reduction_vars = reduction_vars - - def mark_parallel(self, par_depth): - loops = self.loops - loops[0].parallel = par_depth - for i in range(1, par_depth): - loops[i].collapsed = True - loops[0].simd = loops[par_depth - 1].simd - - def codegen(self, code, stack): - size_list = [] - stride_list = [] - loop_info = {} - for loop in self.loops: - stride_list.append(loop.size) - stride_list.append(1) - - var = 1 - for sz in size_list[::-1]: - var = var * sz - stride_list.append(var) - stride_list = stride_list[::-1] - - for 
loop, stride in zip(self.loops, stride_list): - stack.enter_context(loop.lines(code, stride=stride)) - loop_info[str(loop.var)] = [loop.start, loop.size, loop.stride] - return loop_info - -class LLVMScheduling(BaseScheduling): - count = 0 - target_kernel = LLVMKernel - def __init__(self, scheduler): - self.scheduler = scheduler - self._scheduling = cpp.CppScheduling(scheduler) - - def can_fuse_vertical(self, node1, node2): - return False - - def can_fuse_horizontal(self, node1, node2): - return False - - def group_fn(self, sizes): - return tuple(tuple(map(V.graph.sizevars.simplify, s)) for s in sizes) - - def codegen_nodes(self, nodes): - _, (group, reduction_group) = max( - nodes, key=lambda x: int(x.is_reduction()) - ).group - ex_kernel = self.target_kernel() - - kernel_name = f"extension_kernel_{self.count}" - self.count += 1 - src_code = ex_kernel.codegen_nodes(nodes, kernel_name) - self.define_kernel(src_code, kernel_name) - ex_kernel.call_kernel(kernel_name) - - def codegen_sync(self): - pass - - def flush(self): - self._scheduling.flush() - - def define_function(self, kernel): - code = kernel.def_function() - if code is not None: - wrapper = V.graph.wrapper_code - wrapper.header.writeline(code) - - def define_kernel(self, src_code, kernel_name): - wrapper = V.graph.wrapper_code - if src_code in wrapper.src_to_kernel: - kernel_name = wrapper.src_to_kernel[src_code] - else: - wrapper.src_to_kernel[src_code] = kernel_name - - codecache_def = IndentedBuffer() - codecache_def.writeline("custom_async_compile.llvm('''") - codecache_def.splice(src_code) - codecache_def.writeline("''', ") - codecache_def.writeline("loop_info=loop_info,") - codecache_def.writeline("load_tile_info=load_tile_info,") - codecache_def.writeline("store_tile_info=store_tile_info,") - codecache_def.writeline("arg_attributes=arg_attributes)") - - wrapper.define_kernel(kernel_name, codecache_def.getvalue(), cuda=False) - return kernel_name - -class VectorizedLLVMScheduling(LLVMScheduling): - 
target_kernel = VectorizedLLVMKernel - -class MatrixLLVMScheduling(LLVMScheduling): - target_kernel = MatrixLLVMKernel - def codegen_template(self, template_node, epilogue_nodes): - _, (numel, rnumel) = template_node.group - - template_buffer = template_node.node - kernel, render = template_buffer.make_kernel_render(template_buffer, epilogue_nodes=epilogue_nodes) - with kernel: - for node in [template_node, *epilogue_nodes]: - node.mark_run() - src_code = render() - - with V.set_kernel_handler(kernel): - node_schedule = [template_node, *epilogue_nodes] - kernel.meta_kernel() - kernel_name = self.define_kernel(src_code, kernel.kernel_name) - self.define_function(kernel) - kernel.call_kernel(kernel_name) \ No newline at end of file diff --git a/PyTorchSimFrontend/llvm/llvm_conv_template.py b/PyTorchSimFrontend/llvm/llvm_conv_template.py deleted file mode 100644 index 139298f0..00000000 --- a/PyTorchSimFrontend/llvm/llvm_conv_template.py +++ /dev/null @@ -1,230 +0,0 @@ -import os, math -from typing import List, Optional, cast -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs -from PyTorchSimFrontend.llvm.llvm_template import LLVMTemplate -from PyTorchSimFrontend.llvm.llvm_template import LLVMTemplateKernel -from torch._inductor.ir import Buffer -from torch._inductor.ir import IRNode -from torch._inductor.codecache import get_hash -from PyTorchSimFrontend import extension_config - -CONV2D_TEMPLATE = r""" -@sram_accum = dso_local global [{{ TILE_M * TILE_N }} x {{ DATA_TYPE }}] zeroinitializer, align 64 - -define dso_local void @{{ KERNEL_NAME }}(ptr %X, ptr %Y, ptr %B, ptr %W) { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv59 = phi i64 [ 0, %entry ], [ %indvars.iv.next60, %for.cond.cleanup3 ] - %0 = mul nuw nsw i64 %indvars.iv59, {{ N }} - %add.ptr = getelementptr inbounds {{ DATA_TYPE }}, ptr %B, i64 %0 - %1 = mul nuw nsw i64 %indvars.iv59, {{ K }} - %add.ptr13 = getelementptr inbounds {{ DATA_TYPE }}, ptr %X, i64 %1 - 
%add.ptr27 = getelementptr inbounds {{ DATA_TYPE }}, ptr %W, i64 %0 - br label %for.body4 - -for.cond.cleanup: - ret void - -for.cond.cleanup3: - %indvars.iv.next60 = add nuw nsw i64 %indvars.iv59, {{ TILE_M }} - %cmp = icmp ult i64 %indvars.iv59, {{ M - TILE_M }} - br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup - -for.body4: - %indvars.iv57 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next58, %for.cond.cleanup9 ] - %add.ptr6 = getelementptr inbounds {{ DATA_TYPE }}, ptr %add.ptr, i64 %indvars.iv57 - %call = {{ kernel.load_matrix(TILE_M, TILE_N, N, DATA_TYPE, DATA_STYPE, "%add.ptr6", "B", DATA_SIZE)}} - tail call void @llvm.memset.p0.i64(ptr @sram_accum, i8 0, i64 {{ TILE_M * TILE_N * DATA_SIZE }}, i1 false) - %invariant.gep = getelementptr inbounds {{ DATA_TYPE }}, ptr %Y, i64 %indvars.iv57 - br label %for.body10 - -for.cond.cleanup9: - %call24 = fadd <{{ TILE_M * TILE_N }} x {{ DATA_TYPE }} > %call, %call18 - %add.ptr29 = getelementptr inbounds {{ DATA_TYPE }}, ptr %add.ptr27, i64 %indvars.iv57 - {{ kernel.store_matrix(TILE_M, TILE_N, N, DATA_TYPE, DATA_STYPE, "%add.ptr29", "%call24", "W", DATA_SIZE) }} - %indvars.iv.next58 = add nuw nsw i64 %indvars.iv57, {{ TILE_N }} - %cmp2 = icmp ult i64 %indvars.iv57, {{ N - TILE_N }} - br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 - -for.body10: - %indvars.iv = phi i64 [ 0, %for.body4 ], [ %indvars.iv.next, %for.body10 ] - %add.ptr15 = getelementptr inbounds {{ DATA_TYPE }}, ptr %add.ptr13, i64 %indvars.iv - %call16 = {{ kernel.load_matrix(TILE_M, TILE_K, K, DATA_TYPE, DATA_STYPE, "%add.ptr15", "X", DATA_SIZE)}} - %2 = mul nuw nsw i64 %indvars.iv, {{ N }} - %gep = getelementptr inbounds {{ DATA_TYPE }}, ptr %invariant.gep, i64 %2 - %call22 = {{ kernel.load_matrix(TILE_K, TILE_N, N, DATA_TYPE, DATA_STYPE, "%gep", "Y", DATA_SIZE)}} - %call23 = call <{{ TILE_M * TILE_N }} x {{ DATA_TYPE }}> @llvm.matrix.multiply.v{{ TILE_M*TILE_K }}{{ DATA_STYPE }}.v{{ TILE_K*TILE_N }}{{ DATA_STYPE }}.v{{ 
TILE_M*TILE_N }}{{ DATA_STYPE }}(<{{ TILE_N * TILE_K}} x {{ DATA_TYPE }}> %call22, <{{ TILE_M * TILE_K}} x {{ DATA_TYPE }}> %call16, i32 {{ TILE_M }}, i32 {{ TILE_K }}, i32 {{ TILE_N }}) - - %tmp_acc = load <{{ TILE_M * TILE_N }} x {{ DATA_TYPE }}>, ptr @sram_accum, align 64 - %call18 = fadd <{{ TILE_M * TILE_N }} x {{ DATA_TYPE }} > %call23, %tmp_acc - store <{{ TILE_M * TILE_N }} x {{ DATA_TYPE }}> %call18, ptr @sram_accum, align 64 - - %indvars.iv.next = add nuw nsw i64 %indvars.iv, {{ TILE_K }} - %cmp8 = icmp ult i64 %indvars.iv, {{ K - TILE_K }} - br i1 %cmp8, label %for.body10, label %for.cond.cleanup9 -} - -declare void @llvm.memset.p0.i64(ptr, i8, i64, i1) -{% if TILE_M == TILE_N %} -declare <{{TILE_M * TILE_K}} x float> @llvm.matrix.column.major.load.v{{ TILE_M * TILE_K }}{{ DATA_STYPE }}.p0{{ DATA_STYPE }}(ptr , i64, i1, i32, i32) #2 -{% else %} -declare <{{TILE_M * TILE_K}} x float> @llvm.matrix.column.major.load.v{{ TILE_M * TILE_K }}{{ DATA_STYPE }}.p0{{ DATA_STYPE }}(ptr , i64, i1, i32, i32) #2 -declare <{{TILE_N * TILE_K}} x float> @llvm.matrix.column.major.load.v{{ TILE_N * TILE_K }}{{ DATA_STYPE }}.p0{{ DATA_STYPE }}(ptr , i64, i1, i32, i32) #2 -{% endif %} -declare <{{TILE_M * TILE_N}} x float> @llvm.matrix.multiply.v{{ TILE_M*TILE_K }}{{ DATA_STYPE }}.v{{ TILE_K*TILE_N }}{{ DATA_STYPE }}.v{{ TILE_M*TILE_N }}{{ DATA_STYPE }}(<{{ TILE_M*TILE_K }} x {{ DATA_TYPE }}>, < {{ TILE_N*TILE_K }} x {{ DATA_TYPE }}>, i32, i32, i32) #1 -declare void @llvm.matrix.column.major.store.v{{ TILE_M * TILE_N }}{{ DATA_STYPE }}.p0{{ DATA_STYPE }}(<{{ TILE_M*TILE_N }} x {{ DATA_TYPE }}>, ptr , i64, i1, i32, i32) #3 -""" - -CONV2D_FUNC = r""" -def {{ FUNC_NAME }}({{ INPUT }}, {{ WEIGHT }}, {{ BIAS }}, {{ OUT }}): - {{ INPUT }}_cpu = {{ INPUT }}.cpu() - {{ WEIGHT }}_cpu = {{ WEIGHT }}.cpu() - {{ BIAS }}_cpu = {{ BIAS }}.cpu() - {{ OUT }}_cpu = {{ OUT }}.cpu() - - # Torch support NCHW, so we need to transpose for now - {{ INPUT }}_cpu = {{ INPUT }}_cpu.permute(0, 2, 3, 1) 
- {{ WEIGHT }}_cpu = {{ WEIGHT }}_cpu.permute(0, 2, 3, 1) - {{ OUT }}_cpu = {{ OUT }}_cpu.permute(0, 2, 3, 1) - {{ OUT }}_cpu.zero_() - - input_shape = {{ INPUT }}_cpu.shape - weight_shape = {{ WEIGHT }}_cpu.shape - output_shape = {{ OUT }}_cpu.shape - - input_pad_shape = (input_shape[0], input_shape[1]+2*{{ PADDING_H }}, input_shape[2]+2*{{ PADDING_W }}, input_shape[3]) - input_pad = torch.zeros(input_pad_shape) - - if {{ PADDING_H }} != 0 and {{ PADDING_W }} != 0: - input_pad[:, {{ PADDING_H }}:-{{ PADDING_H }}, {{ PADDING_W }}:-{{ PADDING_W }}, :] = {{ INPUT }}_cpu - elif {{ PADDING_H }} != 0: - input_pad[:, {{ PADDING_H }}:-{{ PADDING_H }}, :, :] = {{ INPUT }}_cpu - elif {{ PADDING_W }} != 0: - input_pad[:,:, {{ PADDING_W }}:-{{ PADDING_W }}, :] = {{ INPUT }}_cpu - else: - input_pad = {{ INPUT }}_cpu - - {% if VALIDATION_MODE %} - {% endif %} - - for kh in range(weight_shape[1]): - for kw in range(weight_shape[2]): - input_tile = input_pad[:, kh:input_pad_shape[1]-(weight_shape[1]-1)+kh, kw:input_pad_shape[2]-(weight_shape[2]-1)+kw, :] - input_tile = input_tile[:,::{{ STRIDE_H }},::{{ STRIDE_W }}, :] - kernel_tile = {{ WEIGHT }}_cpu[:, kh, kw, :].t() - input_tile = input_tile.reshape(-1, input_pad_shape[3]) - {% if VALIDATION_MODE %} - if kh == 0 and kw == 0: - {{ KERNEL_NAME }}(input_tile, kernel_tile, {{ OUT }}_cpu, {{ OUT }}_cpu, intermediate_op=0b01) - elif kh == weight_shape[1]-1 and kw == weight_shape[2]-1: - {{ KERNEL_NAME }}(input_tile, kernel_tile, {{ OUT }}_cpu, {{ OUT }}_cpu, intermediate_op=0b10) - else: - {{ KERNEL_NAME }}(input_tile, kernel_tile, {{ OUT }}_cpu, {{ OUT }}_cpu, intermediate_op=0b11) - {% else %} - {{ KERNEL_NAME }}(input_tile, kernel_tile, {{ OUT }}_cpu, {{ OUT }}_cpu) # input, weight, bias, out - {% endif %} - - {{ OUT }}_cpu = {{ OUT }}_cpu.permute(0, 3, 1, 2) - {{ OUT }}.copy_({{ OUT }}_cpu) -""" - -class LLVMConvTemplate(LLVMTemplate): - def __init__(self, input_nodes, layout, input_reorder=None, **kwargs): - 
super().__init__("kernel", input_nodes, layout, input_reorder) - self.stride = kwargs["stride"] - self.padding = kwargs["padding"] - self.dilation = kwargs["dilation"] - weight_shape = [str(i) for i in input_nodes[1].layout.size] - self.function_name = "Conv2D_" + "_".join(weight_shape) - self.gemm_args = ['input', 'weight', 'bias', 'output'] - - def render(self, - kernel: LLVMTemplateKernel, - template_buffer_node = None, - epilogue_nodes: Optional[List[IRNode]] = None, - **kwargs): - if template_buffer_node is not None: - self.output_node = template_buffer_node - if epilogue_nodes is not None and len(epilogue_nodes) > 0: - self.output_nodes = cast(Buffer, epilogue_nodes[-1]) - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - input_h = X.get_size()[2] - input_w = X.get_size()[3] - kernel_h = W.get_size()[2] - kernel_w = W.get_size()[3] - i_c = X.get_size()[1] - o_c = W.get_size()[0] - - weight_shape = self.input_nodes[1].get_size() - i_tile_h = int((input_h + 2*self.padding[0] - (weight_shape[2]-1) - 1) / self.stride[0]) + 1 - i_tile_w = int((input_w + 2*self.padding[1] - (weight_shape[3]-1) - 1) / self.stride[1]) + 1 - - gemm_m = i_tile_h * i_tile_w - gemm_n = o_c - gemm_k = i_c - - options = dict( - KERNEL_NAME=self.name, - kernel=kernel, - M=gemm_m, - N=gemm_n, - K=gemm_k, - TILE_M=4, - TILE_N=4, - TILE_K=4, - DATA_TYPE="float", - DATA_STYPE="f32", - DATA_SIZE=4, - ) - code = self._template_from_string(CONV2D_TEMPLATE).render(**options) - kernel.add_loop_info([options["M"], options["N"], options["K"]], [options["TILE_M"], options["TILE_N"], options["TILE_K"]]) - kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=self.input_reorder) - - self.hash_value = get_hash(code.strip()) - return code - - def function_render(self, kernel_name, input_args): - - options = dict( - KERNEL_NAME=kernel_name, - FUNC_NAME=self.function_name, - 
INPUT=input_args[0], - WEIGHT=input_args[1], - BIAS=input_args[2], - OUT=input_args[3], - STRIDE_H=self.stride[0], - STRIDE_W=self.stride[1], - PADDING_H=self.padding[0], - PADDING_W=self.padding[1], - DILATION_H=self.dilation[0], - DILATION_W=self.dilation[1], - VALIDATION_MODE=extension_config.CONFIG_TORCHSIM_VALIDATION_MODE, - HASH_VALUE=self.hash_value - ) - code = self._template_from_string(CONV2D_FUNC).render(**options) - return code - - def get_arg_attributes(self): - arg_attributes = {} - - input_shape = self.input_nodes[0].get_size() - weight_shape = self.input_nodes[1].get_size() - gemm_h = int((input_shape[2] + 2*self.padding[0] - (weight_shape[2]-1) - 1) / self.stride[0]) + 1 - gemm_w = int((input_shape[3] + 2*self.padding[1] - (weight_shape[3]-1) - 1) / self.stride[1]) + 1 - - gemm_input_shape = [input_shape[0],input_shape[1],gemm_h, gemm_w] - gemm_weight_shape = [weight_shape[0],weight_shape[1],1,1] - gemm_output_shape = [gemm_input_shape[2]*gemm_input_shape[3], gemm_weight_shape[0]] # Consider Batch size 1 - - arg_attributes[self.gemm_args[0]] = [LLVMKernelArgs.LLVM_ARGS_IN, self.input_nodes[0].layout.dtype, math.prod(gemm_input_shape)] - arg_attributes[self.gemm_args[1]] = [LLVMKernelArgs.LLVM_ARGS_IN, self.input_nodes[1].layout.dtype, math.prod(gemm_weight_shape)] - arg_attributes[self.gemm_args[2]] = [LLVMKernelArgs.LLVM_ARGS_IN, self.input_nodes[0].layout.dtype, math.prod(gemm_output_shape)] - arg_attributes[self.gemm_args[3]] = [LLVMKernelArgs.LLVM_ARGS_OUT, self.input_nodes[0].layout.dtype, math.prod(gemm_output_shape)] - - return arg_attributes \ No newline at end of file diff --git a/PyTorchSimFrontend/llvm/llvm_gemm_template.py b/PyTorchSimFrontend/llvm/llvm_gemm_template.py deleted file mode 100644 index 534b7727..00000000 --- a/PyTorchSimFrontend/llvm/llvm_gemm_template.py +++ /dev/null @@ -1,139 +0,0 @@ -from typing import List, Optional, cast - -from PyTorchSimFrontend.llvm.llvm_template import LLVMTemplate -from 
PyTorchSimFrontend.llvm.llvm_template import LLVMTemplateKernel -from torch._inductor.ir import Buffer -from torch._inductor.ir import IRNode -from torch._inductor.ir import ReinterpretView - -GEMM_TEMPLATE = r""" -@sram_accum = dso_local global [{{ TILE_M * TILE_N }} x {{ DATA_TYPE }}] zeroinitializer, align 4 - -define dso_local void @{{ KERNEL_NAME }}{{kernel.def_kernel(inputs=[X, W, Bias], outputs=[Y], names_str="X, W, Bias, Y", input_reorder=input_reorder)}} { -entry: - br label %for.cond1.preheader - -for.cond1.preheader: - %indvars.iv49 = phi i64 [ 0, %entry ], [ %indvars.iv.next50, %for.cond.cleanup3 ] - {% if X_transposed %}%add.ptr = getelementptr inbounds {{ DATA_TYPE }}, ptr %X, i64 %indvars.iv49{% else %}%0 = mul nuw nsw i64 %indvars.iv49, {{ K }} - %add.ptr = getelementptr inbounds {{ DATA_TYPE }}, ptr %X, i64 %0{% endif %} - {% if not X_transposed %}%1{% else %}%0{% endif %} = mul nuw nsw i64 %indvars.iv49, {{ N }} - %add.ptr20 = getelementptr inbounds {{ DATA_TYPE }}, ptr %Y, i64 {% if not X_transposed %}%1{% else %}%0{% endif %} - br label %for.body4 - -for.cond.cleanup: - ret void - -for.cond.cleanup3: - %indvars.iv.next50 = add nuw nsw i64 %indvars.iv49, {{ TILE_M }} - %cmp = icmp ult i64 %indvars.iv49, {{ M - TILE_M }} - br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup - -for.body4: - %indvars.iv47 = phi i64 [ 0, %for.cond1.preheader ], [ %indvars.iv.next48, %for.cond.cleanup7 ] - tail call void @llvm.memset.p0.i64(ptr @sram_accum, i8 0, i64 {{ TILE_M * TILE_N * DATA_SIZE }}, i1 false) - {% if W_transposed%}{% if X_transposed %}%1{% else %}%2{% endif %} = mul nuw nsw i64 %indvars.iv47, {{ K }} - %invariant.gep = getelementptr inbounds {{ DATA_TYPE }}, ptr %W, i64 {% if X_transposed %}%1{% else %}%2{% endif %}{% else %}%invariant.gep = getelementptr inbounds {{ DATA_TYPE }}, ptr %W, i64 %indvars.iv47{% endif %} - br label %for.body8 - -for.cond.cleanup7: - %add.ptr22 = getelementptr inbounds {{ DATA_TYPE }}, ptr %add.ptr20, i64 
%indvars.iv47 - {{ kernel.store_output(TILE_N, TILE_M, N, DATA_TYPE, DATA_STYPE, "%add.ptr22", "%call18", "Y", DATA_SIZE) }} - %indvars.iv.next48 = add nuw nsw i64 %indvars.iv47, {{ TILE_N }} - %cmp2 = icmp ult i64 %indvars.iv47, {{ N - TILE_N }} - br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 - -for.body8: - %indvars.iv = phi i64 [ 0, %for.body4 ], [ %indvars.iv.next, %for.body8 ] - {% if X_transposed%}{% if W_transposed %}%2{% else %}%1{% endif %} = mul nuw nsw i64 %indvars.iv, {{ M }}{% endif %} - %add.ptr10 = getelementptr inbounds {{ DATA_TYPE }}, ptr %add.ptr, i64 {% if X_transposed %}{% if W_transposed %}%2{% else %}%1{% endif %}{% else %}%indvars.iv{% endif %} - %call = {{ kernel.load_matrix(TILE_K, TILE_M, K, DATA_TYPE, DATA_STYPE, "%add.ptr10", "X", DATA_SIZE)}} - {% if W_transposed %}%gep = getelementptr inbounds {{ DATA_TYPE }}, ptr %invariant.gep, i64 %indvars.iv - %call16 = {{ kernel.load_matrix(TILE_K, TILE_N, K, DATA_TYPE, DATA_STYPE, "%gep", "W", DATA_SIZE)}}{% else %}%2 = mul nuw nsw i64 %indvars.iv, {{ N }} - %gep = getelementptr inbounds {{ DATA_TYPE }}, ptr %invariant.gep, i64 %2 - %call16 = {{ kernel.load_matrix(TILE_N, TILE_K, N, DATA_TYPE, DATA_STYPE, "%gep", "W", DATA_SIZE)}}{% endif %} - {% if W_transposed %}%trans0 = call <{{ TILE_K * TILE_N }} x {{ DATA_TYPE }}> @llvm.matrix.transpose.v{{ TILE_K*TILE_N }}{{ DATA_STYPE }}(<{{ TILE_N * TILE_K }} x {{ DATA_TYPE }}> %call16, i32 {{ TILE_K }}, i32 {{ TILE_N }}){% endif %} - {% if X_transposed %}%trans1 = call <{{ TILE_M * TILE_K }} x {{ DATA_TYPE }}> @llvm.matrix.transpose.v{{ TILE_M*TILE_K }}{{ DATA_STYPE }}(<{{ TILE_K * TILE_M }} x {{ DATA_TYPE }}> %call, i32 {{ TILE_K }}, i32 {{ TILE_M }}){% endif %} - %call17 = call <{{ TILE_M * TILE_N }} x {{ DATA_TYPE }}> @llvm.matrix.multiply.v{{ TILE_M*TILE_K }}{{ DATA_STYPE }}.v{{ TILE_K*TILE_N }}{{ DATA_STYPE }}.v{{ TILE_M*TILE_N }}{{ DATA_STYPE }}(<{{ TILE_K * TILE_N}} x {{ DATA_TYPE }}> {% if W_transposed %}%trans0{% else %}%call16{% 
endif %}, <{{ TILE_M * TILE_K}} x {{ DATA_TYPE }}> {% if X_transposed %}%trans1{% else %}%call{% endif %}, i32 {{ TILE_N }}, i32 {{ TILE_K }}, i32 {{ TILE_M }}) - %tmp_acc = load <{{ TILE_M * TILE_N }} x {{ DATA_TYPE }}>, ptr @sram_accum, align 4 - %call18 = fadd <{{ TILE_M * TILE_N }} x {{ DATA_TYPE }} > %call17, %tmp_acc - store <{{ TILE_M * TILE_N }} x {{ DATA_TYPE }}> %call18, ptr @sram_accum, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, {{ TILE_K }} - %cmp6 = icmp ult i64 %indvars.iv, {{ K - TILE_K }} - br i1 %cmp6, label %for.body8, label %for.cond.cleanup7 -} -declare void @llvm.memset.p0.i64(ptr, i8, i64, i1) -{% if TILE_M == TILE_N %} -declare <{{TILE_M * TILE_K}} x float> @llvm.matrix.column.major.load.v{{ TILE_M * TILE_K }}{{ DATA_STYPE }}.p0{{ DATA_STYPE }}(ptr , i64, i1, i32, i32) #2 -{% else %} -declare <{{TILE_M * TILE_K}} x float> @llvm.matrix.column.major.load.v{{ TILE_M * TILE_K }}{{ DATA_STYPE }}.p0{{ DATA_STYPE }}(ptr , i64, i1, i32, i32) #2 -declare <{{TILE_N * TILE_K}} x float> @llvm.matrix.column.major.load.v{{ TILE_N * TILE_K }}{{ DATA_STYPE }}.p0{{ DATA_STYPE }}(ptr , i64, i1, i32, i32) #2 -{% endif %} -declare <{{TILE_N}} x float> @llvm.matrix.column.major.load.v{{ TILE_N }}{{ DATA_STYPE }}.p0{{ DATA_STYPE }}(ptr , i64, i1, i32, i32) #2 -declare <{{TILE_M * TILE_N}} x float> @llvm.matrix.multiply.v{{ TILE_M*TILE_K }}{{ DATA_STYPE }}.v{{ TILE_K*TILE_N }}{{ DATA_STYPE }}.v{{ TILE_M*TILE_N }}{{ DATA_STYPE }}(<{{ TILE_N*TILE_K }} x {{ DATA_TYPE }}>, <{{ TILE_K*TILE_M }} x {{ DATA_TYPE }}>, i32, i32, i32) #1 -declare void @llvm.matrix.column.major.store.v{{ TILE_M * TILE_N }}{{ DATA_STYPE }}.p0{{ DATA_STYPE }}(<{{ TILE_M*TILE_N }} x {{ DATA_TYPE }}>, ptr , i64, i1, i32, i32) #3 -{% if W_transposed %} -declare <{{TILE_K * TILE_N}} x float> @llvm.matrix.transpose.v{{ TILE_K*TILE_N }}{{ DATA_STYPE }}( <{{ TILE_N*TILE_K }} x {{ DATA_TYPE }}>, i32, i32) #1 -{% endif %} -{% if X_transposed %} -declare <{{TILE_M * TILE_K}} x float> 
@llvm.matrix.transpose.v{{ TILE_M*TILE_K }}{{ DATA_STYPE }}( <{{ TILE_K*TILE_M }} x {{ DATA_TYPE }}>, i32, i32) #1 -{% endif %} -""" - -class LLVMGemmTemplate(LLVMTemplate): - def __init__(self, input_nodes, layout, input_reorder=None): - super().__init__("kernel", input_nodes, layout, input_reorder) - - def is_transposed(self, node): - if isinstance(node, ReinterpretView): - if node.layout.stride != node.data.layout.stride: - if node.layout.stride[-2] == node.data.layout.stride[-1] and node.layout.stride[-1] == node.data.layout.stride[-2]: - return True - else: - raise NotImplementedError("If the stride is not equal to the original stride, it should have been transposed.") - return False - - def render(self, - kernel: LLVMTemplateKernel, - template_buffer_node = None, - epilogue_nodes: Optional[List[IRNode]] = None, - **kwargs): - if template_buffer_node is not None: - self.output_node = template_buffer_node - if epilogue_nodes is not None and len(epilogue_nodes) > 0: - self.output_node = cast(Buffer, epilogue_nodes[-1]) - - X, W = self.input_nodes[0], self.input_nodes[1] - Y = self.output_node - Bias = None if len(self.input_nodes) == 2 else self.input_nodes[2] - - TILE_M = min(16, X.get_size()[0]) # TODO:: This should be determined by the size of the SRAM - TILE_N = min(16, W.get_size()[1]) # FIXME: 16 is hard-coded - TILE_K = min(16, X.get_size()[1]) - - W_transposed = self.is_transposed(W) - X_transposed = self.is_transposed(X) - - options = dict( - KERNEL_NAME=self.name, - kernel=kernel, - M=X.get_size()[0], - N=W.get_size()[1], - K=X.get_size()[1], - TILE_M=TILE_M, - TILE_N=TILE_N, - TILE_K=TILE_K, - DATA_TYPE="float", - DATA_STYPE="f32", - DATA_SIZE=4, - X = X, - W = W, - Y = Y, - Bias = Bias, - W_transposed = W_transposed, - X_transposed = X_transposed, - input_reorder = self.input_reorder - ) - code = self._template_from_string(GEMM_TEMPLATE).render(**options) - kernel.add_loop_info([options["M"], options["N"], options["K"]], [options["TILE_M"], 
options["TILE_N"], options["TILE_K"]]) - return code \ No newline at end of file diff --git a/PyTorchSimFrontend/llvm/llvm_lowering.py b/PyTorchSimFrontend/llvm/llvm_lowering.py deleted file mode 100644 index f33d61a4..00000000 --- a/PyTorchSimFrontend/llvm/llvm_lowering.py +++ /dev/null @@ -1,91 +0,0 @@ -from typing import List, Optional, Sequence - -import torch -from torch._inductor.lowering import lowerings -from torch._inductor.kernel.mm_common import mm_args -from torch._inductor import ir -from torch._inductor.virtualized import V -from torch._inductor.ir import TensorBox -from PyTorchSimFrontend.llvm.llvm_gemm_template import LLVMGemmTemplate -from PyTorchSimFrontend.llvm.llvm_conv_template import LLVMConvTemplate - -aten = torch.ops.aten - -def tuned_mm(mat1, mat2, * ,layout=None): - m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2, layout=layout) - llvm_template = LLVMGemmTemplate([mat1, mat2], layout) - - return llvm_template.generate().output_node() - -def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None): - m, n, k, layout, mat1, mat2, inp_expanded = mm_args(mat1, mat2, inp, layout=layout) - llvm_template = LLVMGemmTemplate([mat1, mat2, inp_expanded], layout) # TODO: add alpha and beta - - return llvm_template.generate().output_node() - -def conv_layout( - x: TensorBox, - weight: TensorBox, - bias: Optional[TensorBox], - stride: Sequence[int], - padding: tuple[int, ...], - dilation: tuple[int, ...], - transposed: bool, - output_padding: tuple[int, ...], - groups: int, -) -> ir.Layout: - """Determine output layout for a convolution""" - with V.graph.fake_mode: - output = torch.ops.aten.convolution( - ir.ir_node_to_tensor(x, guard_shape=True), - ir.ir_node_to_tensor(weight, guard_shape=True), - ir.ir_node_to_tensor(bias, guard_shape=True), - stride, - tuple(V.graph.sizevars.size_hint(p) for p in padding), - dilation, - transposed, - tuple(V.graph.sizevars.size_hint(p) for p in output_padding), - groups, - ) - sizes = 
ir.convert_shape_to_inductor(output.size()) - stride = ir.convert_shape_to_inductor(output.stride()) - - return ir.FixedLayout( - x.get_device(), - x.get_dtype(), - sizes, - stride, - ) - -def convolution( - x: TensorBox, - weight: TensorBox, - bias: TensorBox, - stride: List[int], - padding: List[int], - dilation: List[int], - transposed: bool, - output_padding: List[int], - groups: int, -): - stride = tuple(stride) - padding = tuple(padding) - dilation = tuple(dilation) - output_padding = tuple(output_padding) - - kwargs = { - "stride": stride, - "padding": padding, - "dilation": dilation, - "transposed": transposed, - "output_padding": output_padding, - "groups": groups, - } - - layout = conv_layout(x, weight, None, **kwargs) - llvm_template = LLVMConvTemplate([x, weight, bias], layout, **kwargs) - return llvm_template.generate().output_node() - -lowerings.update({getattr(aten.mm, overload): tuned_mm for overload in aten.mm.overloads()}) -lowerings.update({getattr(aten.addmm, overload): tuned_addmm for overload in aten.addmm.overloads()}) -lowerings.update({getattr(aten.convolution, overload): convolution for overload in aten.convolution.overloads()}) \ No newline at end of file diff --git a/PyTorchSimFrontend/llvm/llvm_template.py b/PyTorchSimFrontend/llvm/llvm_template.py deleted file mode 100644 index 4f85ef00..00000000 --- a/PyTorchSimFrontend/llvm/llvm_template.py +++ /dev/null @@ -1,234 +0,0 @@ -import functools -import itertools -from typing import List, Optional -from unittest.mock import patch - -from torch._inductor.codegen.common import KernelTemplate -from torch._inductor.codegen.common import ChoiceCaller -from torch._inductor.codegen.common import Kernel -from torch._inductor.codegen.common import OpOverrides -from torch._inductor.ir import Buffer -from torch._inductor.ir import IRNode -from torch._inductor.ir import TemplateBuffer -from torch._inductor.codegen.cuda.cuda_kernel import CUDATemplateCaller -from torch._inductor.autotune_process import 
TensorMeta -from torch._inductor.virtualized import V - -from PyTorchSimFrontend.llvm.llvm_autotune import LLVMBenchmarkRequest -from PyTorchSimFrontend.llvm.llvm_common import LLVMKernelArgs - -class LLVMTemplateKernel(Kernel): - overrides = OpOverrides - def __init__(self, kernel_name, - kernel_caller_function=None, - kernel_function_render=None, - kernel_arg_attributes=None) -> None: - super().__init__(LLVMKernelArgs()) - self.kernel_name = kernel_name - self.named_nodes = {} - self.loop_info = {} - self.load_desc = {} - self.store_desc = {} - self.kernel_caller_function = kernel_caller_function - self.kernel_function_render = kernel_function_render - self.kernel_arg_attributes = kernel_arg_attributes - - def load_matrix(self, row, col, stride, dtype, stype, ptr, base_addr, data_size): - suffix = f"v{row*col}{stype}.p0{stype}" - argument = f"(ptr {ptr}, i64 {stride}, i1 0, i32 {row}, i32 {col})" - code = f"<{row*col} x {dtype}> @llvm.matrix.column.major.load.{suffix} {argument}" - - self.add_desc(True, base_addr, data_size, [col, 1], [row, col]) - return f"call {code}" - - def store_matrix(self, row, col, stride, dtype, stype, ptr, vec, base_addr, data_size): - suffix = f"v{row*col}{stype}.p0{stype}" - argument = f"(<{row*col} x {dtype}> {vec}, ptr {ptr}, i64 {stride}, i1 0, i32 {row}, i32 {col})" - code = f"void @llvm.matrix.column.major.store.{suffix} {argument}" - - self.add_desc(False, base_addr, data_size, [col, 1], [row, col]) - return f"call {code}" - - def store_output(self, row, col, stride, dtype, stype, ptr, vec, base_addr, data_size): - code = "" - if len(self.args.input_buffers) > 2: - indexes = [f"i32 {i%row}" for i in range(row * col)] - mask = ", ".join(indexes) - code += f"%add.ptr23 = getelementptr inbounds {dtype}, ptr %Bias, i64 %indvars.iv47\n " - code += f"%call19 = " + self.load_matrix(1, row, 1, dtype, stype, "%add.ptr23", "Bias", data_size) + "\n " #FIXME: Hardcoded %call19 - code += f"%call20 = shufflevector <{row} x {dtype}> %call19, 
<{row} x {dtype}> undef, <{row*col} x i32> <{mask}>\n " - code += f"%call21 = fadd <{row*col} x {dtype}> %call18, %call20\n " - vec = "%call21" - code += self.store_matrix(row, col, stride, dtype, stype, ptr, vec, base_addr, data_size) - return code - - def add_desc(self, is_load, base_addr, element_size, stride_list, tile_size): - if is_load: - key = f"load{len(self.load_desc)}" - self.load_desc[key] = { - "base_addr": base_addr, - "element_size": element_size, - "stride_list": stride_list, - "tile_size": tile_size, - "tile_stride": stride_list[-2:] - } - else: - key = f"store{len(self.store_desc)}" - self.store_desc[key] = { - "base_addr": base_addr, - "element_size": element_size, - "stride_list": stride_list, - "tile_size": tile_size, - "tile_stride": stride_list[-2:] - } - - def add_loop_info(self, mat_size, tile_size): - for idx, (loop_size, stride) in enumerate(zip(mat_size, tile_size)): - self.loop_info[f"index{idx}"] = [0, loop_size, stride] - - def meta_kernel(self): - wrapper = V.graph.wrapper_code - arg_attributes = self.kernel_arg_attributes - if arg_attributes is None: - _, _, arg_attributes = self.args.llvm_argdefs() - wrapper.add_import_once('\nprint(f\'Wrapper Codegen Path = {__file__}\')') - wrapper.add_import_once(f'\nfrom extension_codecache import CustomAsyncCompile') - wrapper.add_import_once(f'\ncustom_async_compile = CustomAsyncCompile()') - # Dump loop and load/store information - wrapper.add_import_once(f"loop_info = {self.loop_info}") - wrapper.add_import_once(f"load_tile_info = {self.load_desc}") - wrapper.add_import_once(f"store_tile_info = {self.store_desc}") - wrapper.add_import_once(f"arg_attributes = {arg_attributes}") - - def call_kernel(self, kernel_name): - """ - Generates code to call the kernel through V.graph.wrapper_code. 
- used from within torch._inductor.wrapper.WrapperCodeGen - """ - wrapper = V.graph.wrapper_code - _, call_args, _ = self.args.python_argdefs() - wrapper.generate_kernel_call( - kernel_name if self.kernel_caller_function is None else self.kernel_caller_function, - call_args, - cuda=False, - ) - - def def_kernel( - self, - inputs: List[IRNode], - outputs: List[IRNode], - names_str: str = "", - input_reorder: Optional[List[int]] = None, - ) -> str: - names = [x.strip() for x in names_str.strip().split(",")] - if len(inputs) + len(outputs) != len(names): - raise RuntimeError( - f"{len(inputs) + len(outputs)=} != {len(names)=}, {inputs=}, {outputs=}, {names=}" - ) - - if input_reorder is not None: - assert len(inputs) == len(input_reorder) - else: - input_reorder = list(range(len(inputs))) - - for idx in input_reorder: - name = names[idx] - node = inputs[idx] - if node is not None: - self.named_nodes[name] = node - self.args.input_buffers[node.get_name()] = name - - for name, node in zip(names[len(inputs) : len(inputs) + len(outputs)], outputs): - if node is not None: - self.named_nodes[name] = node - self.args.output_buffers[node.get_name()] = name - - arg_defs, *_ = self.args.llvm_argdefs(only_args=True) - return f"({', '.join(arg_defs)})" - - def def_function(self): - _, call_args, _ = self.args.python_argdefs() - if self.kernel_function_render is not None: - return self.kernel_function_render(input_args=call_args) - -class LLVMTemplateCaller(CUDATemplateCaller): - def __str__(self): - return f"LLVMTemplateCaller(source_file={self.bmreq.source_file})" - - def call_name(self) -> str: - return f"llvm_template_kernels.{self.name}" - -class LLVMTemplate(KernelTemplate): - index_counter = itertools.count() - - def __init__(self, name, input_nodes, layout, input_reorder = None): - """ - Baseclass for LLVM Templates, derived from KernelTemplate. Not to be instantiated directly. - - Args: - name (str): The name of the CUDATemplate object. 
- input_nodes (List[IRNode]): A list of input IRNodes. - layout (Layout): The layout of the output buffer / tensor. - input_reorder (Optional[List[int]]): An optional list that specifies the order of the input nodes. - - """ - super().__init__(name) - self.input_nodes = input_nodes - self.output_node: Buffer = Buffer("buf_out", layout) - self.input_reorder = input_reorder - self.layout = layout - - def generate(self, **kwargs) -> ChoiceCaller: - kernel_name = f"llvm_{self.name}" - with patch.object(V.graph, "get_dtype", self._fake_get_dtype(self.output_node)): - kernel = LLVMTemplateKernel(kernel_name=kernel_name, - kernel_caller_function=self.function_name if hasattr(self, 'function_name') else None, - kernel_function_render=self.function_render if hasattr(self, 'function_render') else None, - kernel_arg_attributes=self.get_arg_attributes() if hasattr(self, 'get_arg_attributes') else None) - code = self.render(kernel=kernel, **kwargs) - - kernel_hash_name = f"llvm_{self.name}_{next(self.index_counter)}" - extra_args = [] - # create the BenchmarkRequest - bmreq = LLVMBenchmarkRequest( - kernel_name=kernel_name, - input_tensor_meta=TensorMeta.from_irnodes(self.input_nodes), - output_tensor_meta=TensorMeta.from_irnodes(self.output_node), - extra_args=extra_args, - source_code=code, - ) - - def make_kernel_render( - template_node: TemplateBuffer, - epilogue_nodes: Optional[List[IRNode]] = None, - ): - kernel = LLVMTemplateKernel( - kernel_name=kernel_hash_name, - kernel_function_render=functools.partial( - self.function_render, - kernel_name=kernel_hash_name - ) if hasattr(self, 'function_render') else None, - kernel_caller_function=self.function_name if hasattr(self, 'function_name') else None, - kernel_arg_attributes=self.get_arg_attributes() if hasattr(self, 'get_arg_attributes') else None - ) - render = functools.partial( - self.render, - kernel=kernel, - template_buffer_node=template_node, - epilogue_nodes=epilogue_nodes, - **kwargs, # includes "op" argument in 
case of CUTLASSGemmTemplate - ) - return kernel, render - - return LLVMTemplateCaller( - kernel_hash_name, - self.name, - self.input_nodes, - self.output_node.get_layout(), - make_kernel_render, - bmreq, - self, - ) - - def render(self, **kwargs) -> str: - raise NotImplementedError From b87745a8d14eca815789cb0882df7420da9cb8b5 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 4 Aug 2025 05:07:26 +0000 Subject: [PATCH 2/3] [CI] Add CI Tests for public repo --- .github/workflows/pull-request2.yml | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 .github/workflows/pull-request2.yml diff --git a/.github/workflows/pull-request2.yml b/.github/workflows/pull-request2.yml new file mode 100644 index 00000000..30330b4b --- /dev/null +++ b/.github/workflows/pull-request2.yml @@ -0,0 +1,36 @@ +name: PR test CI + +on: + pull_request: + branches: [ "master", "develop" ] + +jobs: + build-and-test: + runs-on: ubuntu-latest + + steps: + # Step 1: Checkout the repository + - name: Checkout Code + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + submodules: recursive + + # Step 2: Set up Docker + - name: Set up Docker + uses: docker/setup-buildx-action@v3 + + # Step 3: Build Docker Image (no push) + - name: Build Docker Image + run: | + docker build \ + --build-arg TORCHSIM_SHA=${{ github.event.pull_request.head.sha }} \ + -t torchsim-ci:${{ github.sha }} . 
+ + # Step 4: Run test_add.py + - name: Run test_add.py + run: | + echo "Running test_add.py" + docker run --rm \ + torchsim-ci:${{ github.sha }} \ + python3 PyTorchSim/tests/test_add.py \ No newline at end of file From eb9db6bb3a150957b70390713b75fa5ff48d5cf8 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Mon, 4 Aug 2025 05:20:56 +0000 Subject: [PATCH 3/3] [CI] CI refactor duplicated CI --- .github/workflows/docker-base-image.yml | 39 +- .github/workflows/docker-image.yml | 629 ++------------------ .github/workflows/pull-request.yml | 671 ---------------------- .github/workflows/pull-request2.yml | 36 -- .github/workflows/pull-request_mobile.yml | 658 --------------------- .github/workflows/pytorchsim_test.yml | 446 ++++++++++++++ .github/workflows/tag_release.yml | 38 +- Dockerfile | 65 --- Dockerfile.base | 42 +- 9 files changed, 565 insertions(+), 2059 deletions(-) delete mode 100644 .github/workflows/pull-request.yml delete mode 100644 .github/workflows/pull-request2.yml delete mode 100644 .github/workflows/pull-request_mobile.yml create mode 100644 .github/workflows/pytorchsim_test.yml diff --git a/.github/workflows/docker-base-image.yml b/.github/workflows/docker-base-image.yml index 5f2005b7..bb79925c 100644 --- a/.github/workflows/docker-base-image.yml +++ b/.github/workflows/docker-base-image.yml @@ -8,13 +8,11 @@ on: jobs: build: - runs-on: self-hosted + runs-on: ubuntu-latest permissions: contents: read packages: write - attestations: write - id-token: write steps: # Step 1: Checkout the repository @@ -27,13 +25,44 @@ jobs: with: registry: ghcr.io username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} + password: ${{ secrets.GITHUB_TOKEN }} - # Step 4: Build and Push Docker Image + # Step 2: Set environemnt + - name: Set environment + env: + GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + echo "IMAGE_TAG=torchsim-ci:${GITHUB_SHA}" >> $GITHUB_ENV + echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" >> $GITHUB_ENV + 
echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" + + gem5_response_file=/tmp/releases-gem5-latest.json + curl -s https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} + GEM5_ASSET_ID=$(jq ".assets[0].id" ${gem5_response_file}) + echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" + echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV + + llvm_response_file=/tmp/releases-gem5-latest.json + curl -s https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} + LLVM_ASSET_ID=$(jq ".assets[0].id" ${llvm_response_file}) + echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" + echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV + + spike_response_file=/tmp/releases-spike-latest.json + curl -s https://api.github.com/repos/PSAL-POSTECH/riscv-isa-sim/releases/latest > ${spike_response_file} + SPIKE_ASSET_ID=$(jq ".assets[0].id" ${spike_response_file}) + echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" + echo "SPIKE_ASSET_ID=$SPIKE_ASSET_ID" >> $GITHUB_ENV + + # Step 3: Build and Push Docker Image - name: Build and Push Docker Image uses: docker/build-push-action@v4 with: context: . 
file: ./Dockerfile.base push: true + build-args: | + GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} + LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} + SPIKE_ASSET_ID=${{ env.SPIKE_ASSET_ID }} tags: ghcr.io/psal-postech/torchsim_base:latest diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index ece4a6d5..61eb96e1 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -1,612 +1,69 @@ -name: Docker Image CI +name: Docker image CI on: - push: - branches: [ "master" ] + pull_request: + branches: [ "master", "develop" ] jobs: - build: - runs-on: [self-hosted, Linux] + build-and-test: + runs-on: ubuntu-latest permissions: contents: read packages: write - attestations: write - id-token: write steps: # Step 1: Checkout the repository - name: Checkout Code uses: actions/checkout@v4 with: - repository: PSAL-POSTECH/PyTorchSim - ref: ${{ env.GITHUB_SHA }} + ref: ${{ github.event.pull_request.head.sha }} submodules: recursive - - name: Log in to GitHub Container Registry + + # Step 2: Log in to GitHub Container Registry + - name: Login to GHCR uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - # Step 3: Pull the Cached Image - - name: Pull Cached Image & Set environment - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - docker pull ghcr.io/psal-postech/torchsim_base:latest || echo "No cache available" - echo "IMAGE_TAG=torchsim-ci:$GITHUB_SHA" >> $GITHUB_ENV - echo "GITHUB_SHA=$GITHUB_SHA" >> $GITHUB_ENV - - gem5_response_file=/tmp/releases-gem5-latest.json - response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} ) - GEM5_ASSET_ID=$(cat ${gem5_response_file} | jq ".assets[0]."id"") - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV - - llvm_response_file=/tmp/releases-gem5-latest.json - 
response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} ) - LLVM_ASSET_ID=$(cat ${llvm_response_file} | jq ".assets[0]."id"") - echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" - echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV - - mkdir -p /tmp/torchsim-ci/${GITHUB_SHA} - echo "DUMP_PATH=/tmp/torchsim-ci/${GITHUB_SHA}" + password: ${{ secrets.GITHUB_TOKEN }} - # Step 4: Build and Push Docker Image + # Step 3: Build and Push Docker Image - name: Build and Push Docker Image uses: docker/build-push-action@v6 with: context: . file: ./Dockerfile push: true - build-args: | - GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} - LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} - TORCHSIM_SHA=${{ env.GITHUB_SHA }} - secrets: | - GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }} - tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG }} - - test_add: - name: Run test_add.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_add.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_add.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_add.py - - test_activation: - name: Run test_activation.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_activation.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_activation.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - 
ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_activation.py - - test_batchnorm: - name: Run test_batchnorm.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_batchnorm.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_batchnorm.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_batchnorm.py - - test_bmm: - name: Run test_bmm.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_bmm.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_bmm.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_bmm.py - - test_cnn: - name: Run test_cnn.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_cnn.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_cnn.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_cnn.py - - test_conv2d: - name: Run test_conv2d.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: 
docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_conv2d.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_conv2d.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_conv2d.py - - test_matmul: - name: Run test_matmul.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_matmul.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_matmul.py - - test_reduce: - name: Run test_reduce.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_reduce.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_reduce.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_reduce.py - - test_softmax: - name: Run test_softmax.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_softmax.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo 
"Running test_softmax.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_softmax.py - - test_transpose2D: - name: Run test_transpose2D.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_transpose2D.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_transpose2D.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transpose2D.py - - test_view3D_2D: - name: Run test_view3D_2D.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_view3D_2D.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_view3D_2D.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_view3D_2D.py - - test_layernorm: - name: Run test_layernorm.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_layernorm.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_layernorm.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 
PyTorchSim/tests/test_layernorm.py - - test_mlp: - name: Run test_mlp.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_mlp.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_mlp.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_mlp.py - - test_resnet: - name: Run test_resnet.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_resnet.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_resnet.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_resnet.py - - test_transformer: - name: Run test_transformer.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_transformer.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_transformer.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transformer.py - - test_transpose3D: - name: Run test_transpose3D.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - 
registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_transpose3D.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_transpose3D.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transpose3D.py - - test_sparsity: - name: Run test_sparsity.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_sparsity.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_sparsity.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_sparsity.py - - test_pool: - name: Run test_pool.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_pool.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_pool.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_pool.py - - test_perceptron: - name: Run test_perceptron.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_single_perceptron.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running 
test_single_perceptron.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_single_perceptron.py - - test_fusion: - name: Run test_fusion - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_addmm_residual.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_addmm_residual.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_matmul_activation.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul_activation.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_matmul_scalar.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul_scalar.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py - - test_moe: - name: Run test_moe - runs-on: self-hosted - needs: build - steps: - - name: Log in to 
GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_moe.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_moe.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/MoE/test_moe.py - - test_mistral: - name: Run test_mistral - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_mistral.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_mistral.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py - - test_indirect: - name: Run test_indirect - runs-on: self-hosted - needs: build - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - steps: - - name: Run test_indirect.py - run: | - echo "Running test_indirect.py" - echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_indirect_access.py - - test_scheduler: - name: Run test_scheduler - runs-on: self-hosted - needs: build - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - steps: - - name: Run test_scheduler.py - run: | - echo "Running test_scheduler.py" - echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - 
ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_scheduler.py - - test_cleanup: - name: Clean test cases - runs-on: self-hosted - needs: [test_add, test_batchnorm, test_bmm, test_cnn, test_conv2d, - test_matmul, test_reduce, test_softmax, - test_transpose2D, test_view3D_2D, test_layernorm, - test_mlp, test_resnet, test_transformer, test_transpose3D, - test_sparsity, test_activation, test_pool, test_perceptron, - test_fusion, test_mistral, test_moe, test_indirect, test_scheduler] - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Clean test case - run: | - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump \ No newline at end of file + tags: ghcr.io/psal-postech/torchsim-test:${{ github.sha }} + + # Step 4: Wait for GHCR propagation + - name: Wait for GHCR propagation + run: | + for i in {1..30}; do + echo "Checking if image exists in GHCR (attempt $i)..." + if docker manifest inspect ghcr.io/psal-postech/torchsim-test:${GITHUB_SHA} > /dev/null 2>&1; then + echo "Image is now available in GHCR." + exit 0 + fi + echo "Image not yet available, retrying in 20 seconds..." + sleep 20 + done + echo "Image did not become available in GHCR within expected time." 
+ exit 1 + + test-pytorchsim-wrapper: + needs: build-and-test + uses: ./.github/workflows/pytorchsim_test.yml + with: + image_name: ghcr.io/psal-postech/torchsim-test:${{ github.sha }} + vector_lane: 128 + spad_size: 128 + +# call-test2: +# needs: build-and-test +# uses: ./.github/workflows/pytorchsim_test.yml +# with: +# image_name: ghcr.io/psal-postech/${GITHUB_SHA} +# vector_lane: 8 +# spad_size: 32 \ No newline at end of file diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml deleted file mode 100644 index ecdbf861..00000000 --- a/.github/workflows/pull-request.yml +++ /dev/null @@ -1,671 +0,0 @@ -name: PR test CI - -on: - pull_request: - branches: [ "master", "develop" ] - -jobs: - build: - runs-on: [self-hosted, Linux] - - permissions: - contents: read - packages: write - attestations: write - id-token: write - - steps: - # Step 1: Checkout the repository - - name: Checkout Code - uses: actions/checkout@v4 - with: - repository: PSAL-POSTECH/PyTorchSim - ref: ${{ github.event.pull_request.head.sha }} - submodules: recursive - # Step 2: Log in to GitHub Container Registry (optional) - # If you need to push the built image, authenticate here. 
- - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - # Step 3: Pull the Cached Image - - name: Pull Cached Image & Set environment - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - docker pull ghcr.io/psal-postech/torchsim_base:latest || echo "No cache available" - echo "IMAGE_TAG=torchsim-ci:${GITHUB_SHA}" >> $GITHUB_ENV - echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" >> $GITHUB_ENV - echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" - gem5_response_file=/tmp/releases-gem5-latest.json - response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} ) - GEM5_ASSET_ID=$(cat ${gem5_response_file} | jq ".assets[0]."id"") - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV - - llvm_response_file=/tmp/releases-gem5-latest.json - response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} ) - LLVM_ASSET_ID=$(cat ${llvm_response_file} | jq ".assets[0]."id"") - echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" - echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV - - mkdir -p /tmp/torchsim-ci/${GITHUB_SHA} - echo "DUMP_PATH=/tmp/torchsim-ci/${GITHUB_SHA}" - - # Step 4: Build and Push Docker Image - - name: Build and Push Docker Image - uses: docker/build-push-action@v6 - with: - context: . 
- file: ./Dockerfile - push: true - build-args: | - GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} - LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} - TORCHSIM_SHA=${{ env.GITHUB_SHA }} - secrets: | - GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }} - tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}} - - test_add: - name: Run test_add.py - runs-on: self-hosted - - permissions: - contents: read - packages: write - attestations: write - id-token: write - needs: build - - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_add.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_add.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_add.py - - test_activation: - name: Run test_activation.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_activation.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_activation.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_activation.py - - test_batchnorm: - name: Run test_batchnorm.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_batchnorm.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_batchnorm.py" - docker run --rm \ - -v 
/tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_batchnorm.py - - test_bmm: - name: Run test_bmm.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_bmm.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_bmm.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_bmm.py - - test_cnn: - name: Run test_cnn.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_cnn.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_cnn.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_cnn.py - - test_conv2d: - name: Run test_conv2d.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_conv2d.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_conv2d.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_conv2d.py - - test_matmul: - name: Run test_matmul.py - runs-on: self-hosted - needs: build - steps: - - name: Log in 
to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_matmul.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_matmul.py - - test_reduce: - name: Run test_reduce.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_reduce.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_reduce.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_reduce.py - - test_softmax: - name: Run test_softmax.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_softmax.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_softmax.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_softmax.py - - test_transpose2D: - name: Run test_transpose2D.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_transpose2D.py - env: - GIT_ACCESS_TOKEN: 
${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_transpose2D.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transpose2D.py - - test_view3D_2D: - name: Run test_view3D_2D.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_view3D_2D.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_view3D_2D.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_view3D_2D.py - - test_layernorm: - name: Run test_layernorm.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_layernorm.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_layernorm.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_layernorm.py - - test_mlp: - name: Run test_mlp.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_mlp.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_mlp.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - 
ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_mlp.py - - test_resnet: - name: Run test_resnet.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - - name: Run test_resnet18.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_resnet.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_resnet.py - - - name: Run test_resnet50.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_resnet.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_resnet.py --model_type resnet50 - - test_transformer: - name: Run test_transformer.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_transformer.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_transformer.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transformer.py - - test_transpose3D: - name: Run test_transpose3D.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_transpose3D.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} 
- run: | - echo "Running test_transpose3D.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transpose3D.py - - test_sparsity: - name: Run test_sparsity.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_sparsity.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_sparsity.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_sparsity.py - - test_pool: - name: Run test_pool.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_pool.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_pool.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_pool.py - - test_perceptron: - name: Run test_perceptron.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_single_perceptron.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_single_perceptron.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 
PyTorchSim/tests/test_single_perceptron.py - - test_fusion: - name: Run test_fusion - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_addmm_residual.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_addmm_residual.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py - - - name: Run test_matmul_activation.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul_activation.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py - - - name: Run test_matmul_scalar.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul_scalar.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py - - - name: Run test_matmul_reduction.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul_reduction.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py - - - name: Run test_bmm_reduction.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_bmm_reduction.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} 
python3 PyTorchSim/tests/Fusion/test_bmm_reduction.py - - - name: Run test_prologue_fusion.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_prologue_fusion.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_prologue_fusion.py - - - name: Run test_transformer_fusion.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_transformer_fusion.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_transformer_fusion.py - - - name: Run test_conv_fusion.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_conv_fusion.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_conv_fusion.py - - test_moe: - name: Run test_moe - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_moe.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_moe.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/MoE/test_moe.py - - test_mistral: - name: Run test_mistral - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_mistral.py - env: - 
GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_mistral.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py - - test_indirect: - name: Run test_indirect - runs-on: self-hosted - needs: build - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - steps: - - name: Run test_indirect.py - run: | - echo "Running test_indirect.py" - echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_indirect_access.py - - test_scheduler: - name: Run test_scheduler - runs-on: self-hosted - needs: build - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - steps: - - name: Run test_scheduler.py - run: | - echo "Running test_scheduler.py" - echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_scheduler.py - - test_cleanup: - name: Clean test cases - runs-on: self-hosted - needs: [test_add, test_batchnorm, test_bmm, test_cnn, test_conv2d, - test_matmul, test_reduce, test_softmax, - test_transpose2D, test_view3D_2D, test_layernorm, - test_mlp, test_resnet, test_transformer, test_transpose3D, - test_sparsity, test_activation, test_pool, test_perceptron, - test_fusion, test_mistral, test_moe, test_indirect, test_scheduler] - steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Clean test case - run: | - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump \ No newline at end of file diff --git 
a/.github/workflows/pull-request2.yml b/.github/workflows/pull-request2.yml deleted file mode 100644 index 30330b4b..00000000 --- a/.github/workflows/pull-request2.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: PR test CI - -on: - pull_request: - branches: [ "master", "develop" ] - -jobs: - build-and-test: - runs-on: ubuntu-latest - - steps: - # Step 1: Checkout the repository - - name: Checkout Code - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.head.sha }} - submodules: recursive - - # Step 2: Set up Docker - - name: Set up Docker - uses: docker/setup-buildx-action@v3 - - # Step 3: Build Docker Image (no push) - - name: Build Docker Image - run: | - docker build \ - --build-arg TORCHSIM_SHA=${{ github.event.pull_request.head.sha }} \ - -t torchsim-ci:${{ github.sha }} . - - # Step 4: Run test_add.py - - name: Run test_add.py - run: | - echo "Running test_add.py" - docker run --rm \ - torchsim-ci:${{ github.sha }} \ - python3 PyTorchSim/tests/test_add.py \ No newline at end of file diff --git a/.github/workflows/pull-request_mobile.yml b/.github/workflows/pull-request_mobile.yml deleted file mode 100644 index 053e3eac..00000000 --- a/.github/workflows/pull-request_mobile.yml +++ /dev/null @@ -1,658 +0,0 @@ -name: PR test CI for mobile - -on: - pull_request: - branches: [ "master", "develop" ] - -jobs: - build: - runs-on: [self-hosted, Linux] - - permissions: - contents: read - packages: write - attestations: write - id-token: write - - steps: - # Step 1: Checkout the repository - - name: Checkout Code - uses: actions/checkout@v4 - with: - repository: PSAL-POSTECH/PyTorchSim - ref: ${{ env.github.event.pull_request.head.sha }} - submodules: recursive - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - # Step 3: Pull the Cached Image - - name: Pull Cached Image & Set environment - env: - GIT_ACCESS_TOKEN: ${{ 
secrets.GIT_ACCESS_TOKEN }} - run: | - docker pull ghcr.io/psal-postech/torchsim_base:latest || echo "No cache available" - echo "IMAGE_TAG=torchsim-ci:${GITHUB_SHA}" >> $GITHUB_ENV - echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" >> $GITHUB_ENV - echo "GITHUB_SHA=${{github.event.pull_request.head.sha}}" - gem5_response_file=/tmp/releases-gem5-latest.json - response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} ) - GEM5_ASSET_ID=$(cat ${gem5_response_file} | jq ".assets[0]."id"") - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV - - llvm_response_file=/tmp/releases-gem5-latest.json - response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} ) - LLVM_ASSET_ID=$(cat ${llvm_response_file} | jq ".assets[0]."id"") - echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" - echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV - - mkdir -p /tmp/torchsim-ci/${GITHUB_SHA} - echo "DUMP_PATH=/tmp/torchsim-ci/${GITHUB_SHA}" - - # Step 4: Build and Push Docker Image - - name: Build and Push Docker Image - uses: docker/build-push-action@v6 - with: - context: . 
- file: ./Dockerfile - push: true - build-args: | - GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} - LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} - TORCHSIM_SHA=${{ env.GITHUB_SHA }} - secrets: | - GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }} - tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}} - - test_add: - name: Run test_add.py - runs-on: self-hosted - - permissions: - contents: read - packages: write - attestations: write - id-token: write - needs: build - - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_add.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_add.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_add.py - - test_activation: - name: Run test_activation.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_activation.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_activation.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_activation.py - - test_batchnorm: - name: Run test_batchnorm.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_batchnorm.py - env: - GIT_ACCESS_TOKEN: ${{ 
secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_batchnorm.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_batchnorm.py - - test_bmm: - name: Run test_bmm.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_bmm.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_bmm.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_bmm.py - - test_cnn: - name: Run test_cnn.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_cnn.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_cnn.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_cnn.py - - test_conv2d: - name: Run test_conv2d.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_conv2d.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_conv2d.py" - docker run --rm \ - -v 
/tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_conv2d.py - - test_matmul: - name: Run test_matmul.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_matmul.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_matmul.py - - test_reduce: - name: Run test_reduce.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_reduce.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_reduce.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_reduce.py - - test_softmax: - name: Run test_softmax.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_softmax.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_softmax.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e 
TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_softmax.py - - test_transpose2D: - name: Run test_transpose2D.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_transpose2D.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_transpose2D.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transpose2D.py - - test_view3D_2D: - name: Run test_view3D_2D.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_view3D_2D.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_view3D_2D.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_view3D_2D.py - - test_layernorm: - name: Run test_layernorm.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_layernorm.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_layernorm.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e 
TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_layernorm.py - - test_mlp: - name: Run test_mlp.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_mlp.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_mlp.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_mlp.py - - test_resnet: - name: Run test_resnet.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_resnet.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_resnet.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_resnet.py - - test_transformer: - name: Run test_transformer.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_transformer.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_transformer.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 
PyTorchSim/tests/test_transformer.py - - test_transpose3D: - name: Run test_transpose3D.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_transpose3D.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_transpose3D.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_transpose3D.py - - test_sparsity: - name: Run test_sparsity.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_sparsity.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_sparsity.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_sparsity.py - - test_pool: - name: Run test_pool.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_pool.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_pool.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_pool.py - - test_perceptron: - 
name: Run test_perceptron.py - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_single_perceptron.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_single_perceptron.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_single_perceptron.py - - test_fusion: - name: Run test_fusion - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_addmm_residual.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_addmm_residual.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py - - - name: Run test_matmul_activation.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul_activation.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py - - - name: Run test_matmul_scalar.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul_scalar.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e 
TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py - - - name: Run test_conv_fusion.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_conv_fusion.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_conv_fusion.py - - - name: Run test_matmul_reduction.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_matmul_reduction.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py - - - name: Run test_bmm_reduction.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_bmm_reduction.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_bmm_reduction.py - - - name: Run test_prologue_fusion.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_prologue_fusion.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_prologue_fusion.py - - - name: Run test_transformer_fusion.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_transformer_fusion.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Fusion/test_transformer_fusion.py - - test_moe: - name: Run test_moe - runs-on: self-hosted - needs: build - 
steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_moe.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_moe.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/MoE/test_moe.py - - test_mistral: - name: Run test_mistral - runs-on: self-hosted - needs: build - steps: - - name: Log in to GitHub Container Registry - uses: docker/login-action@v3 - with: - registry: ghcr.io - username: ${{ github.actor }} - password: ${{ secrets.GIT_ACCESS_TOKEN }} - - name: Run test_mistral.py - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - echo "Running test_mistral.py" - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py - - test_indirect: - name: Run test_indirect - runs-on: self-hosted - needs: build - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - steps: - - name: Run test_indirect.py - run: | - echo "Running test_indirect.py" - echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_indirect_access.py - - test_scheduler: - name: Run test_scheduler - runs-on: self-hosted - needs: build - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - steps: - - name: Run test_scheduler.py - run: | - echo "Running test_scheduler.py" - echo 
$GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - -e TORCHSIM_DUMP_PATH=/dump -e TORCHSIM_VECTOR_LANE=8 -e TORCHSIM_SPAD_SIZE=32 \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} python3 PyTorchSim/tests/test_scheduler.py - - test_cleanup: - name: Clean test cases - runs-on: self-hosted - needs: [test_add, test_batchnorm, test_bmm, test_cnn, test_conv2d, - test_matmul, test_reduce, test_softmax, - test_transpose2D, test_view3D_2D, test_layernorm, - test_mlp, test_resnet, test_transformer, test_transpose3D, - test_sparsity, test_activation, test_pool, test_perceptron, - test_fusion, test_mistral, test_moe, test_indirect, test_scheduler] - steps: - - name: Checkout code - uses: actions/checkout@v3 - - name: Clean test case - run: | - docker run --rm \ - -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ - ghcr.io/psal-postech/torchsim-ci:${GITHUB_SHA} chown -R $(id -u):$(id -g) /dump diff --git a/.github/workflows/pytorchsim_test.yml b/.github/workflows/pytorchsim_test.yml new file mode 100644 index 00000000..25ce0a16 --- /dev/null +++ b/.github/workflows/pytorchsim_test.yml @@ -0,0 +1,446 @@ +name: PyTorchSim Tests + +on: + workflow_call: + inputs: + image_name: + required: true + type: string + vector_lane: + description: "Vector lane size (use empty string for server TPU)" + required: true + type: number + spad_size: + description: "SPAD size (use empty string for server TPU)" + required: true + type: number + +jobs: + test_add: + name: Run test_add.py + runs-on: self-hosted + steps: + - name: Run test_add.py + run: | + echo "Running test_add.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_add.py + + test_activation: + name: Run test_activation.py + runs-on: self-hosted + 
steps: + - name: Run test_activation.py + run: | + echo "Running test_activation.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_activation.py + + test_batchnorm: + name: Run test_batchnorm.py + runs-on: self-hosted + steps: + - name: Run test_batchnorm.py + run: | + echo "Running test_batchnorm.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_batchnorm.py + + test_bmm: + name: Run test_bmm.py + runs-on: self-hosted + steps: + - name: Run test_bmm.py + run: | + echo "Running test_bmm.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_bmm.py + + test_cnn: + name: Run test_cnn.py + runs-on: self-hosted + steps: + - name: Run test_cnn.py + run: | + echo "Running test_cnn.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_cnn.py + + test_conv2d: + name: Run test_conv2d.py + runs-on: self-hosted + steps: + - name: Run test_conv2d.py + run: | + echo "Running test_conv2d.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 
PyTorchSim/tests/test_conv2d.py + + test_matmul: + name: Run test_matmul.py + runs-on: self-hosted + steps: + - name: Run test_matmul.py + run: | + echo "Running test_matmul.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_matmul.py + + test_reduce: + name: Run test_reduce.py + runs-on: self-hosted + steps: + - name: Run test_reduce.py + run: | + echo "Running test_reduce.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_reduce.py + + test_softmax: + name: Run test_softmax.py + runs-on: self-hosted + steps: + - name: Run test_softmax.py + run: | + echo "Running test_softmax.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_softmax.py + + test_transpose2D: + name: Run test_transpose2D.py + runs-on: self-hosted + steps: + - name: Run test_transpose2D.py + run: | + echo "Running test_transpose2D.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transpose2D.py + + test_view3D_2D: + name: Run test_view3D_2D.py + runs-on: self-hosted + steps: + - name: Run test_view3D_2D.py + run: | + echo "Running test_view3D_2D.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e 
TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_view3D_2D.py + + test_layernorm: + name: Run test_layernorm.py + runs-on: self-hosted + steps: + - name: Run test_layernorm.py + run: | + echo "Running test_layernorm.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_layernorm.py + + test_mlp: + name: Run test_mlp.py + runs-on: self-hosted + steps: + - name: Run test_mlp.py + run: | + echo "Running test_mlp.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_mlp.py + + test_resnet: + name: Run test_resnet.py + runs-on: self-hosted + steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Run test_resnet18.py + run: | + echo "Running test_resnet.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py + + - name: Run test_resnet50.py + run: | + echo "Running test_resnet.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_resnet.py --model_type resnet50 + + test_transformer: + 
name: Run test_transformer.py + runs-on: self-hosted + steps: + - name: Run test_transformer.py + run: | + echo "Running test_transformer.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transformer.py + + test_transpose3D: + name: Run test_transpose3D.py + runs-on: self-hosted + steps: + - name: Run test_transpose3D.py + run: | + echo "Running test_transpose3D.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_transpose3D.py + + test_sparsity: + name: Run test_sparsity.py + runs-on: self-hosted + steps: + - name: Run test_sparsity.py + run: | + echo "Running test_sparsity.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_sparsity.py + + test_pool: + name: Run test_pool.py + runs-on: self-hosted + steps: + - name: Run test_pool.py + run: | + echo "Running test_pool.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_pool.py + + test_perceptron: + name: Run test_perceptron.py + runs-on: self-hosted + steps: + - name: Run test_single_perceptron.py + run: | + echo "Running test_single_perceptron.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ 
inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_single_perceptron.py + + test_fusion: + name: Run test_fusion + runs-on: self-hosted + steps: + - name: Run test_addmm_residual.py + run: | + echo "Running test_addmm_residual.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_addmm_residual.py + + - name: Run test_matmul_activation.py + run: | + echo "Running test_matmul_activation.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_activation.py + + - name: Run test_matmul_scalar.py + run: | + echo "Running test_matmul_scalar.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_scalar.py + + - name: Run test_matmul_reduction.py + run: | + echo "Running test_matmul_reduction.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_matmul_reduction.py + + - name: Run test_bmm_reduction.py + run: | + echo "Running test_bmm_reduction.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e 
TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_bmm_reduction.py + + - name: Run test_prologue_fusion.py + run: | + echo "Running test_prologue_fusion.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_prologue_fusion.py + + - name: Run test_transformer_fusion.py + run: | + echo "Running test_transformer_fusion.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_transformer_fusion.py + + - name: Run test_conv_fusion.py + run: | + echo "Running test_conv_fusion.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Fusion/test_conv_fusion.py + + test_moe: + name: Run test_moe + runs-on: self-hosted + steps: + - name: Run test_moe.py + run: | + echo "Running test_moe.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/MoE/test_moe.py + + test_mistral: + name: Run test_mistral + runs-on: self-hosted + steps: + - name: Run test_mistral.py + run: | + echo "Running test_mistral.py" + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ 
inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/Mixtral_8x7B/test_attention.py + + test_indirect: + name: Run test_indirect + runs-on: self-hosted + env: + GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Run test_indirect.py + run: | + echo "Running test_indirect.py" + echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_indirect_access.py + + test_scheduler: + name: Run test_scheduler + runs-on: self-hosted + env: + GIT_ACCESS_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Run test_scheduler.py + run: | + echo "Running test_scheduler.py" + echo $GIT_ACCESS_TOKEN | docker login ghcr.io -u USERNAME --password-stdin + docker run --rm \ + -v /tmp/torchsim-ci/${GITHUB_SHA}:/dump \ + -e TORCHSIM_DUMP_PATH=/dump \ + -e TORCHSIM_VECTOR_LANE="${{ inputs.vector_lane }}" \ + -e TORCHSIM_SPAD_SIZE="${{ inputs.spad_size }}" \ + ${{ inputs.image_name }} python3 PyTorchSim/tests/test_scheduler.py diff --git a/.github/workflows/tag_release.yml b/.github/workflows/tag_release.yml index 258c0e40..fe7d0d02 100644 --- a/.github/workflows/tag_release.yml +++ b/.github/workflows/tag_release.yml @@ -7,12 +7,11 @@ on: jobs: build: - runs-on: self-hosted + runs-on: ubuntu-latest permissions: contents: read packages: write - id-token: write steps: - name: Checkout code @@ -29,42 +28,9 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - - name: Set Tag Environment - run: | - echo "IMAGE_TAG=torchsim-ci:${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV - echo "GITHUB_SHA=$GITHUB_SHA" >> $GITHUB_ENV - echo "GITHUB_SHA=$GITHUB_SHA" - - - name: Pull Cached Image & Set environment - env: - GIT_ACCESS_TOKEN: ${{ secrets.GIT_ACCESS_TOKEN }} - run: | - 
docker pull ghcr.io/psal-postech/torchsim_base:latest || echo "No cache available" - gem5_response_file=/tmp/releases-gem5-latest.json - response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/GEM5/releases/latest > ${gem5_response_file} ) - GEM5_ASSET_ID=$(cat ${gem5_response_file} | jq ".assets[0]."id"") - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" - echo "GEM5_ASSET_ID=$GEM5_ASSET_ID" >> $GITHUB_ENV - - llvm_response_file=/tmp/releases-gem5-latest.json - response=$(curl -sH "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/latest > ${llvm_response_file} ) - LLVM_ASSET_ID=$(cat ${llvm_response_file} | jq ".assets[0]."id"") - echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" - echo "LLVM_ASSET_ID=$LLVM_ASSET_ID" >> $GITHUB_ENV - - mkdir -p /tmp/torchsim-ci/${GITHUB_SHA} - echo "DUMP_PATH=/tmp/torchsim-ci/${GITHUB_SHA}" - - name: Build and Push Docker Image uses: docker/build-push-action@v6 with: context: . file: ./Dockerfile - push: true - build-args: | - GEM5_ASSET_ID=${{ env.GEM5_ASSET_ID }} - LLVM_ASSET_ID=${{ env.LLVM_ASSET_ID }} - TORCHSIM_SHA=${{ env.GITHUB_SHA }} - secrets: | - GIT_ACCESS_TOKEN=${{ secrets.GIT_ACCESS_TOKEN }} - tags: ghcr.io/psal-postech/${{ env.IMAGE_TAG}} \ No newline at end of file + tags: ghcr.io/psal-postech/torchsim-release:${{ github.ref_name }} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 44f6fd5e..293dcb60 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,71 +1,6 @@ # syntax=docker/dockerfile:1.4 -# Copyright (c) 2020 The Regents of the University of California -# All Rights Reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
FROM ghcr.io/psal-postech/torchsim_base:latest -# Pass Access Token securely -ARG GEM5_ASSET_ID -ARG LLVM_ASSET_ID -ARG TORCHSIM_SHA -ENV PATH $PATH:/root/.local/bin -ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH - -# Download GEM5 for torchsim -RUN --mount=type=secret,id=GIT_ACCESS_TOKEN \ - GIT_ACCESS_TOKEN=$(cat /run/secrets/GIT_ACCESS_TOKEN) && \ - curl -L -H "Accept: application/octet-stream" -H "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/gem5/releases/assets/${GEM5_ASSET_ID} -o /tmp/gem5-release.tar.gz && \ - mkdir -p /gem5 && \ - tar -xzf /tmp/gem5-release.tar.gz -C /gem5 && \ - rm /tmp/gem5-release.tar.gz -ENV GEM5_PATH /gem5/release/gem5.opt - -# Download LLVM RISC-V for torchsim -RUN --mount=type=secret,id=GIT_ACCESS_TOKEN \ - GIT_ACCESS_TOKEN=$(cat /run/secrets/GIT_ACCESS_TOKEN) && \ - curl -L -H "Accept: application/octet-stream" -H "Authorization: Bearer ${GIT_ACCESS_TOKEN}" https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/assets/${LLVM_ASSET_ID} -o /tmp/riscv-llvm-release.tar.gz && \ - tar -xzf /tmp/riscv-llvm-release.tar.gz -C / && \ - rm /tmp/riscv-llvm-release.tar.gz - -# Store RISC-V LLVM for TorchSim -ENV TORCHSIM_LLVM_PATH /riscv-llvm/bin -ENV TORCHSIM_LLVM_INCLUDE_PATH /riscv-llvm/include -ENV TORCHSIM_DIR /workspace/PyTorchSim -ENV LLVM_DIR /riscv-llvm - -# Install Spike simulator -RUN --mount=type=secret,id=GIT_ACCESS_TOKEN \ - GIT_ACCESS_TOKEN=$(cat /run/secrets/GIT_ACCESS_TOKEN) && \ - git clone https://$GIT_ACCESS_TOKEN@github.com/PSAL-POSTECH/riscv-isa-sim.git --branch TorchSim && cd riscv-isa-sim && mkdir build && cd build && \ - ../configure --prefix=$RISCV && make -j && make install && cd ../../ && rm -rf riscv-isa-sim - -# Install Proxy kernel -RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \ - cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 && mkdir build && 
cd build && \ - ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install - # Prepare PyTorchSim project COPY . /workspace/PyTorchSim diff --git a/Dockerfile.base b/Dockerfile.base index 1f760785..2b3d58d6 100644 --- a/Dockerfile.base +++ b/Dockerfile.base @@ -44,5 +44,43 @@ RUN wget https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2 # Install torchsim dependency RUN apt install ninja-build && pip install onnx matplotlib && pip install --user conan==1.56.0 -ENV RISCV /workspace/riscv -ENV PATH $RISCV/bin:$PATH \ No newline at end of file +ENV RISCV=/workspace/riscv +ENV PATH=$RISCV/bin:$PATH + +ARG GEM5_ASSET_ID +ARG LLVM_ASSET_ID +ARG SPIKE_ASSET_ID +ENV PATH=$PATH:/root/.local/bin +ENV LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/opt/conda/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH + +# Download GEM5 for torchsim +RUN curl -L -H "Accept: application/octet-stream" https://api.github.com/repos/PSAL-POSTECH/gem5/releases/assets/${GEM5_ASSET_ID} -o /tmp/gem5-release.tar.gz && \ + mkdir -p /gem5 && \ + tar -xzf /tmp/gem5-release.tar.gz -C /gem5 && \ + rm /tmp/gem5-release.tar.gz +ENV GEM5_PATH=/gem5/release/gem5.opt + +# Download LLVM RISC-V for torchsim +RUN curl -L -H "Accept: application/octet-stream" https://api.github.com/repos/PSAL-POSTECH/llvm-project/releases/assets/${LLVM_ASSET_ID} -o /tmp/riscv-llvm-release.tar.gz && \ + tar -xzf /tmp/riscv-llvm-release.tar.gz -C / && \ + rm /tmp/riscv-llvm-release.tar.gz + +# Store RISC-V LLVM for TorchSim +ENV TORCHSIM_LLVM_PATH=/riscv-llvm/bin +ENV TORCHSIM_LLVM_INCLUDE_PATH=/riscv-llvm/include +ENV TORCHSIM_DIR=/workspace/PyTorchSim +ENV LLVM_DIR=/riscv-llvm + +# Download Spike simulator +RUN curl -L -H "Accept: application/octet-stream" https://api.github.com/repos/PSAL-POSTECH/riscv-isa-sim/releases/assets/${SPIKE_ASSET_ID} -o /tmp/spike-release.tar.gz && \ + tar -xzf /tmp/spike-release.tar.gz -C / && rm /tmp/spike-release.tar.gz + +# Store SPIKE for 
TorchSim +ENV PATH="/release/bin:${PATH}" +ENV LD_LIBRARY_PATH="/release/lib:${LD_LIBRARY_PATH}" +ENV C_INCLUDE_PATH="/release/include:${C_INCLUDE_PATH}" + +# Install Proxy kernel +RUN git clone https://github.com/riscv-software-src/riscv-pk.git && \ + cd riscv-pk && git checkout 4f3debe4d04f56d31089c1c716a27e2d5245e9a1 && mkdir build && cd build && \ + ../configure --prefix=$RISCV --host=riscv64-unknown-elf && make -j && make install