From 42b1b97a4eebd23aa61e192de0b0ebaacdbb94a8 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Tue, 7 Jan 2025 12:06:28 +0000 Subject: [PATCH 1/4] [Fix] #90 Scalar-Vector operations --- .../mlir/mlir_codegen_backend.py | 4 ++-- tests/MoE/test_moe.py | 18 +++++++++--------- tests/test_single_perceptron.py | 5 +++-- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 99c39322..1c28a2a8 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -777,10 +777,10 @@ def reduction(self, dtype, src_dtype, reduction_type, value): shape = f"vector<{self.tile_desc.get_tile_size()}x{type_name}>" reduced_shape = type_name init = self.cse.generate(self.reduction_prefix, f"arith.constant {reduction_init(reduction_type, dtype)} : {type_name}") - if len(self.ranges) == 1: + if len(self.ranges) == 1: # 1-D vector to scalar axis = "0" acc_var = init - shape = f"vector<{self.tile_desc.get_tile_size_per_lane()}x{type_name}>" + shape = f"vector<{self.tile_desc.get_tile_size()}x{type_name}>" # use single vector lane elif len(self.ranges) == 2: vec_len = self.tile_desc.get_rows_per_lane() flattened_size = f"vector<{self.tile_desc.get_tile_size_per_lane()}x{type_name}>" diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py index ff6dd00b..9a247669 100644 --- a/tests/MoE/test_moe.py +++ b/tests/MoE/test_moe.py @@ -453,15 +453,15 @@ def test_moe(device): total_cpu_loss.backward() print("MoE Backward Done!") - print("MoE Weight Bias print") - for i in range(num_experts): - print(f"\nExpert {i}") - print(f"FC1 Weight: {model.experts[i].fc1.weight.cpu()}") - print(f"FC1 Bias: {model.experts[i].fc1.bias.cpu()}") - print("\n") - print(f"FC2 Weight: {model.experts[i].fc2.weight.cpu()}") - print(f"FC2 Bias: {model.experts[i].fc2.bias.cpu()}") - print("\n") + # print("MoE Weight Bias print") + # for i in 
range(num_experts): + # print(f"\nExpert {i}") + # print(f"FC1 Weight: {model.experts[i].fc1.weight.cpu()}") + # print(f"FC1 Bias: {model.experts[i].fc1.bias.cpu()}") + # print("\n") + # print(f"FC2 Weight: {model.experts[i].fc2.weight.cpu()}") + # print(f"FC2 Bias: {model.experts[i].fc2.bias.cpu()}") + # print("\n") print("MoE Weight Bias Grad") for i in range(num_experts): diff --git a/tests/test_single_perceptron.py b/tests/test_single_perceptron.py index 7ab02656..78a6b117 100644 --- a/tests/test_single_perceptron.py +++ b/tests/test_single_perceptron.py @@ -41,13 +41,14 @@ def weight_update(a, b, lr): b2.requires_grad = True opt_mlp = torch.compile(dynamic=False)(perceptron) opt_w = torch.compile(dynamic=False)(weight_update) - opt_loss = torch.compile(dynamic=False)(torch.nn.MSELoss()) + loss_fn = torch.nn.MSELoss() + opt_loss = torch.compile(dynamic=False)(loss_fn) lr = torch.tensor(5e-2).to(device=device) # learning rate y = opt_mlp(w1, x1, b1) loss = opt_loss(y, y1) loss.backward() cpu_y = perceptron(x2, w2, b2) - cpu_loss = torch.nn.MSELoss()(cpu_y, y2) + cpu_loss = loss_fn(cpu_y, y2) cpu_loss.backward() test_result("Perceptron", y, cpu_y) test_result("Loss", loss, cpu_loss) From e1859c2fc04786f4a800c23e0ec32c719bdfe720 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Wed, 8 Jan 2025 10:23:29 +0000 Subject: [PATCH 2/4] [Frontend] #89 Efficient vector tile size --- .../mlir/mlir_codegen_backend.py | 37 ++++++++++++++++++- PyTorchSimFrontend/mlir/mlir_common.py | 3 ++ tests/MoE/test_moe.py | 8 ++-- tests/test_softmax.py | 24 ++++++++++++ 4 files changed, 67 insertions(+), 5 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 1c28a2a8..b9a5bb6f 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -638,6 +638,35 @@ def set_ranges(self, lengths, reduction_lengths, read_writes): # Adjust time size when it is vector 
self.adjust_tile_size() return ret + def get_constant_vector2(self, expr): + # Case 0. symbol ex) index 0 + # Case 1. inner product form ex) 16 * index0 + 1 * index1 + # Case 2. Complicated form ex) 16 * index0 + 8 * (index//4) + (index % 4) + constant_vector = [] + if expr.is_symbol: + constant_vector.append(tuple([1, expr])) + return constant_vector + + for arg in expr.args: + if arg.is_symbol: + constant_vector.append(tuple([1,arg])) + continue + if len(arg.args) == 0: #TODO: check this + continue + if arg.args[0].is_number: + constant_vector.append(arg.args) + else: + constant_vector.append([1, arg]) + + return constant_vector + + def find_node_by_name(self, name): + if name in V.graph.graph_inputs: + return V.graph.graph_inputs[name] + else: + for output_node in V.graph.graph_outputs: + if output_node.data.name == name: + return output_node def parse_indices(self, expr): if len(expr.args) == 0: @@ -999,6 +1028,9 @@ def get_dma_info(self, name, index, dtype): current_tile.tile_per_lane_layout = mlir_common.MLIRTile.TILE_PER_LANE_COL_WISE # Actually it is not needed in vector case chunk_size = current_tile.get_chunk_size() mm_stride = current_tile.n_col + if self.is_scalar(name): # scalar to vector broadcasting + mm_stride = 0 + current_tile.n_row, current_tile.n_col = current_tile.n_col, current_tile.n_row # Case 2. Tile is 1-D vector type with reduction elif len(cv) == 1 and len(cv) == self.reduction_depth + 1: # Use only one vectorlane to reduce a vector @@ -1009,6 +1041,9 @@ def get_dma_info(self, name, index, dtype): current_tile.used_vector_lane = 1 chunk_size = current_tile.get_chunk_size() mm_stride = 0 # don't care + tile_size_per_lane = current_tile.get_tile_size_per_lane() + if self.is_scalar(name): # scalar to vector broadcasting + current_tile.n_row, current_tile.n_col = current_tile.n_col, current_tile.n_row # Case 3. 
Tile is 2-D tile elif len(cv) == 2: is_reduction = self.reduction_depth == 1 @@ -1094,7 +1129,7 @@ def adjust_tile_size(self): # Case 1. vector kernel if len(self.itervars) == 1: - self.tile_desc.n_col = self.tile_desc.get_tile_size() + self.tile_desc.n_col = self.tile_desc.get_tile_size() if self.tile_desc.get_tile_size() < self.ranges[0] else self.ranges[0] # effective tile size self.tile_desc.n_row = 1 elif len(self.itervars) == 0: self.tile_desc.n_col = 1 diff --git a/PyTorchSimFrontend/mlir/mlir_common.py b/PyTorchSimFrontend/mlir/mlir_common.py index a949cb5d..21612a4c 100644 --- a/PyTorchSimFrontend/mlir/mlir_common.py +++ b/PyTorchSimFrontend/mlir/mlir_common.py @@ -369,6 +369,9 @@ def find_node_by_name(self, name): if output_node.data.name == name: return output_node + def is_scalar(self, name): + return self.buffer_types[name][1] == 1 + def roundup_vectorlane(self, size, amp=1): return ((size + self.vector_lane - 1) // self.vector_lane) * self.vector_lane * amp diff --git a/tests/MoE/test_moe.py b/tests/MoE/test_moe.py index 9a247669..d14bf5c6 100644 --- a/tests/MoE/test_moe.py +++ b/tests/MoE/test_moe.py @@ -420,15 +420,15 @@ def test_moe(device): x1 = copy.deepcopy(X).to(device=device) x2 = copy.deepcopy(X).to("cpu") - # model.train() - model.eval() + model.train() + # model.eval() model_device = model.to(device=device) opt_model = torch.compile(model_device, dynamic=False) y_hat, aux_loss = opt_model(x1) print("MoE Custom Device Done!") - # model_cpu.train() - model_cpu.eval() + model_cpu.train() + # model_cpu.eval() cpu_hat, cpu_aux_loss = model_cpu(x2) test_result("MoE Forward", y_hat, cpu_hat) test_result("MoE Aux Loss", aux_loss, cpu_aux_loss) diff --git a/tests/test_softmax.py b/tests/test_softmax.py index ca49953c..d68638f8 100644 --- a/tests/test_softmax.py +++ b/tests/test_softmax.py @@ -18,6 +18,29 @@ def test_softmax(device, size=(128, 128), dim=1): input = torch.randn(size) x1 = input.to(device=device) x2 = input.to("cpu") + + # split 
softmax into 3 steps + # def softmax1(x): # find max + # return x.max(dim=dim, keepdim=True).values + # def softmax2(x, max): + # return (x - max).exp().sum(dim=dim, keepdim=True) + # def softmax3(x, max, sum): + # return (x - max).exp().div(sum) + + # opt_fn1 = torch.compile(dynamic=False)(softmax1) + # opt_fn2 = torch.compile(dynamic=False)(softmax2) + # opt_fn3 = torch.compile(dynamic=False)(softmax3) + + # max = opt_fn1(x1) + # cpu_max = softmax1(x2) + # test_result("Softmax Max", max, cpu_max) + # sum = opt_fn2(x1, max) + # cpu_sum = softmax2(x2, cpu_max) + # test_result("Softmax Sum", sum, cpu_sum) + # y = opt_fn3(x1, max, sum) + # cpu_y = softmax3(x2, cpu_max, cpu_sum) + # test_result("Softmax", y, cpu_y) + opt_fn = torch.compile(dynamic=False)(torch.nn.functional.softmax) y = opt_fn(x1, dim=dim) cpu_y = torch.nn.functional.softmax(x2, dim=dim) @@ -33,3 +56,4 @@ def test_softmax(device, size=(128, 128), dim=1): device = module.custom_device() test_softmax(device, size=(64, 128)) test_softmax(device, size=(256, 128)) + test_softmax(device, size=(1, 16)) From 62460d8ed0f21090d6125c721c5dd13eb7f02335 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Thu, 9 Jan 2025 10:37:51 +0000 Subject: [PATCH 3/4] [Frontend] minimum 1D tile size --- PyTorchSimFrontend/mlir/mlir_codegen_backend.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index b9a5bb6f..12ede0d3 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -1129,7 +1129,11 @@ def adjust_tile_size(self): # Case 1. 
vector kernel if len(self.itervars) == 1: - self.tile_desc.n_col = self.tile_desc.get_tile_size() if self.tile_desc.get_tile_size() < self.ranges[0] else self.ranges[0] # effective tile size + tile_size = self.tile_desc.get_tile_size() + if tile_size < self.ranges[0]: + tile_size = self.ranges[0] + min_tile_size_unit = self.vector_lane * self.vlen # VCIX widening is not implemented + self.tile_desc.n_col = (tile_size + min_tile_size_unit - 1) // min_tile_size_unit self.tile_desc.n_row = 1 elif len(self.itervars) == 0: self.tile_desc.n_col = 1 From 7ae776e32f7d9e099a9ecad5da25aaf6d33e3050 Mon Sep 17 00:00:00 2001 From: Yunseon Shin Date: Wed, 15 Jan 2025 03:43:45 +0000 Subject: [PATCH 4/4] [Frontend] various padding type --- .../mlir/mlir_codegen_backend.py | 47 +++++-------------- 1 file changed, 13 insertions(+), 34 deletions(-) diff --git a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py index 12ede0d3..c63f53bb 100644 --- a/PyTorchSimFrontend/mlir/mlir_codegen_backend.py +++ b/PyTorchSimFrontend/mlir/mlir_codegen_backend.py @@ -638,35 +638,15 @@ def set_ranges(self, lengths, reduction_lengths, read_writes): # Adjust time size when it is vector self.adjust_tile_size() return ret - def get_constant_vector2(self, expr): - # Case 0. symbol ex) index 0 - # Case 1. inner product form ex) 16 * index0 + 1 * index1 - # Case 2. 
Complicated form ex) 16 * index0 + 8 * (index//4) + (index % 4) - constant_vector = [] - if expr.is_symbol: - constant_vector.append(tuple([1, expr])) - return constant_vector - - for arg in expr.args: - if arg.is_symbol: - constant_vector.append(tuple([1,arg])) - continue - if len(arg.args) == 0: #TODO: check this - continue - if arg.args[0].is_number: - constant_vector.append(arg.args) - else: - constant_vector.append([1, arg]) - - return constant_vector - def find_node_by_name(self, name): - if name in V.graph.graph_inputs: - return V.graph.graph_inputs[name] - else: - for output_node in V.graph.graph_outputs: - if output_node.data.name == name: - return output_node + # padding type 0: zero-padding 1: negative-padding(-inf) ... + def get_padding_type(self): + ops = self.current_node.node.origins + if self.current_node.is_reduction(): + for op in ops: + if "exp" in op.name: # exponential reduction case + return 1 + return 0 def parse_indices(self, expr): if len(expr.args) == 0: @@ -699,6 +679,7 @@ def parse_indices(self, expr): def load(self, name: str, index: sympy.Expr): index = self.rename_indexing(index) indices = self.parse_indices(index) + padding = self.get_padding_type() prefix = self.newvar_prefix if index.is_number: prefix = prefix + "c" @@ -725,7 +706,7 @@ def load(self, name: str, index: sympy.Expr): self.dma_cache[dma_key] = dmaType, stride, chunk self.tags.add(f"{name}_tag") self.consts.add(0) - code = f"affine.dma_start %{var}[{prefix}{indices}], %{buffer}[%c0, %c0], %{name}_tag[0], %c{dmaType}, %c{stride}, %c{chunk} : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32>" + code = f"affine.dma_start %{var}[{prefix}{indices}], %{buffer}[%c0, %c0], %{name}_tag[0], %c{dmaType}, %c{stride}, %c{chunk} : memref<{self.buffer_types[name][1]}x{type_name}>, memref<{dram_tile_shape}x{type_name}, 1>, memref<1xi32> {{padding = {padding}}}" self.cse.generate(self.loads, code, assignment = False) # FIXME:
assignment = False does not support caching operation = "affine.vector_load" if tile_size_per_lane > 1 else "affine.load" @@ -1129,11 +1110,9 @@ def adjust_tile_size(self): # Case 1. vector kernel if len(self.itervars) == 1: - tile_size = self.tile_desc.get_tile_size() - if tile_size < self.ranges[0]: - tile_size = self.ranges[0] - min_tile_size_unit = self.vector_lane * self.vlen # VCIX widening is not implemented - self.tile_desc.n_col = (tile_size + min_tile_size_unit - 1) // min_tile_size_unit + tile_size = self.tile_desc.get_tile_size() if self.tile_desc.get_tile_size() < self.ranges[0] else self.ranges[0] + min_tile_size_unit = self.vector_lane * self.vlen # TODO: VCIX widening is not implemented + self.tile_desc.n_col = math.ceil(tile_size / min_tile_size_unit) * min_tile_size_unit # padding self.tile_desc.n_row = 1 elif len(self.itervars) == 0: self.tile_desc.n_col = 1