From 1c3b67c2d7c0e6f7fd128cd788b8a36137b3556e Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Sun, 28 Sep 2025 21:15:16 +0800 Subject: [PATCH 1/5] [RFC] Clean up debug code and improve output messages --- test_op.py | 2 +- test_op_mp.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test_op.py b/test_op.py index 963981f..dba6863 100644 --- a/test_op.py +++ b/test_op.py @@ -488,7 +488,7 @@ def get_tvm_source( bar_id = 0 dtype = "float16" if args.use_tc else "float32" for rprog in rprogs: - print(f"rProg[{idx}]: {rprog.Dump()}") + print("rProg: ", rprog.Dump()) block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1) grid_size = rprog.GetParallelism(0) blocks = (block_size, 1, 1) diff --git a/test_op_mp.py b/test_op_mp.py index 07cceb1..f584fe1 100644 --- a/test_op_mp.py +++ b/test_op_mp.py @@ -438,7 +438,7 @@ def compile_and_run_kernel( device_id, idx, ): - print(f"rProg[{idx}]: {rprog.Dump()}") + print("rProg: ", rprog.Dump()) block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1) grid_size = rprog.GetParallelism(0) blocks = (block_size, 1, 1) From 40aa0668482f889e8bfb95c3accede5aa1ede583 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Sun, 28 Sep 2025 21:22:29 +0800 Subject: [PATCH 2/5] Update test_op_mp.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- test_op_mp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_op_mp.py b/test_op_mp.py index f584fe1..07cceb1 100644 --- a/test_op_mp.py +++ b/test_op_mp.py @@ -438,7 +438,7 @@ def compile_and_run_kernel( device_id, idx, ): - print("rProg: ", rprog.Dump()) + print(f"rProg[{idx}]: {rprog.Dump()}") block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1) grid_size = rprog.GetParallelism(0) blocks = (block_size, 1, 1) From abdc4485a9067cdbaa1d21e1153a7e09799c124f Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Sun, 28 Sep 2025 21:23:32 +0800 Subject: [PATCH 3/5] Update test_op.py 
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- test_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_op.py b/test_op.py index dba6863..963981f 100644 --- a/test_op.py +++ b/test_op.py @@ -488,7 +488,7 @@ def get_tvm_source( bar_id = 0 dtype = "float16" if args.use_tc else "float32" for rprog in rprogs: - print("rProg: ", rprog.Dump()) + print(f"rProg[{idx}]: {rprog.Dump()}") block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1) grid_size = rprog.GetParallelism(0) blocks = (block_size, 1, 1) From cbfb403fe58ea4aa5ff06ae4ec787443057ec3d5 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Mon, 29 Sep 2025 17:26:14 +0800 Subject: [PATCH 4/5] [Perf] Enhance profiling output and add performance metrics - Add conditional compilation for verification code via `--gen-check-code` - Improve nvprof output parsing for different compute capabilities - Add TFLOPS calculation in performance reports - Enhance console output with formatted banners and progress indicators - Maintain multi-process evaluation support in test_op_mp.py --- test_op.py | 89 +++++++++++++++++++++++++---------- test_op_mp.py | 86 +++++++++++++++++++++++---------- tests/short-time-LatestTVM.sh | 4 +- tests/short-time-OldTVM.sh | 4 +- utils/commons.py | 20 +++++++- 5 files changed, 148 insertions(+), 55 deletions(-) diff --git a/test_op.py b/test_op.py index 963981f..415cb5c 100644 --- a/test_op.py +++ b/test_op.py @@ -171,7 +171,8 @@ def main_template( for d in dim: size *= d byte = size * type_size - s_size += " int output_size" + str(i) + " = " + str(size) + ";\n" + if args.gen_check_code: + s_size += " int output_size" + str(i) + " = " + str(size) + ";\n" s_hmalloc += " " + name + "h = (float*)malloc(" + str(byte) + ");\n" s_hfree += " free(" + name + "h);\n" s_dmalloc += " cudaMalloc((void **)&" + name + "d, " + str(byte) + ");\n" @@ -185,25 +186,26 @@ def main_template( + str(byte) + ", 
cudaMemcpyDeviceToHost);\n" ) - s_simple_check += ( - " float same_res = " - + name - + "h[0];\n" - + " for (int i = 1; i < output_size" - + str(i) - + "; ++i)\n" - + " {\n" - " if (" - + name - + "h[i] != same_res)\n" - + " {\n" - + ' printf("output[%d] = %f\\n", i, ' - + name - + "h[i]);\n" - + " exit(1);\n" - + " }\n" - + " }\n" - ) + if args.gen_check_code: + s_simple_check += ( + " float same_res = " + + name + + "h[0];\n" + + " for (int i = 1; i < output_size" + + str(i) + + "; ++i)\n" + + " {\n" + " if (" + + name + + "h[i] != same_res)\n" + + " {\n" + + ' printf("output[%d] = %f\\n", i, ' + + name + + "h[i]);\n" + + " exit(1);\n" + + " }\n" + + " }\n" + ) if backend == "antares": kernel_name = "template_op_kernel0" @@ -426,6 +428,7 @@ def get_tvm_source( if __name__ == "__main__": + printBanner(row_symbol="=", col_symbol="||", length=100, context="Namespace") print(args) expr = globals()[args.op] if args.fuse: @@ -474,11 +477,21 @@ def get_tvm_source( rprog.AddTile(0, rTile0) rprogs = [rprog] - print("-------------------use artificial rtile---------------------------") + printBanner( + row_symbol="-", col_symbol="|", length=100, context="Use artificial rtile" + ) else: + printBanner( + row_symbol="-", col_symbol="|", length=100, context="Emiting configs" + ) rprogs = policy.emit_config_without_trails(args.topk) - print("Evaluating top {} configs".format(len(rprogs))) + printBanner( + row_symbol="-", + col_symbol="|", + length=100, + context="Evaluating top {} configs".format(len(rprogs)), + ) best_idx = -1 best_time = 1e100 idx = 0 @@ -488,7 +501,7 @@ def get_tvm_source( bar_id = 0 dtype = "float16" if args.use_tc else "float32" for rprog in rprogs: - print(f"rProg[{idx}]: {rprog.Dump()}") + print(f"[{rprogs.index(rprog)}] rProg: ", rprog.Dump()) block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1) grid_size = rprog.GetParallelism(0) blocks = (block_size, 1, 1) @@ -565,8 +578,24 @@ def get_tvm_source( os.system("rm {}.cu".format(file_name)) 
with open(log_name, "r") as f: - for line in f.readlines(): - print(line, end="") + print("Profiling result:") + lines = f.readlines() + if compute_capability >= "80": + for l in range(len(lines)): + if "Time (%)" in lines[l] and "Instances" in lines[l]: + print(lines[l] + lines[l + 2]) + break + else: + for l in range(len(lines)): + if "Type" in lines[l] and "Time(%)" in lines[l]: + print(lines[l], end="") + if ( + "default_function_kernel0" + if not LatestTVM + else "main_kernel" in lines[l] + ): + print(lines[l]) + break exec_time = get_time_from_nvprof_file(log_name) os.system("rm {}".format(log_name)) @@ -591,12 +620,22 @@ def get_tvm_source( evals.append(eval_results) bar_id += 1 + printBanner(row_symbol="v", col_symbol="|", length=100, context="Perf Report") for topx, eval_results in zip(eval_bar, evals): print("Eval top {} configs".format(topx)) print("Compilation time: {}s".format(eval_results["compilation time"])) print("Best time: {}ms".format(eval_results["best time"])) + if LatestTVM: + print( + "Best perf: {} TFLOPS".format( + tvm.tir.analysis.estimate_tir_flops(rprog.sche.mod) + / eval_results["best time"] + * 1e-9 + ) + ) print("Best config: {}".format(eval_results["best config"])) print("Best idx: {}".format(eval_results["best idx"])) + print("-" * 100) cu_file_name = "roller_{}_{}.cu".format( args.op, "_".join([str(d) for d in args.shape]) diff --git a/test_op_mp.py b/test_op_mp.py index 07cceb1..8860a43 100644 --- a/test_op_mp.py +++ b/test_op_mp.py @@ -175,7 +175,8 @@ def main_template( for d in dim: size *= d byte = size * type_size - s_size += " int output_size" + str(i) + " = " + str(size) + ";\n" + if args.gen_check_code: + s_size += " int output_size" + str(i) + " = " + str(size) + ";\n" s_hmalloc += " " + name + "h = (float*)malloc(" + str(byte) + ");\n" s_hfree += " free(" + name + "h);\n" s_dmalloc += " cudaMalloc((void **)&" + name + "d, " + str(byte) + ");\n" @@ -189,25 +190,26 @@ def main_template( + str(byte) + ", 
cudaMemcpyDeviceToHost);\n" ) - s_simple_check += ( - " float same_res = " - + name - + "h[0];\n" - + " for (int i = 1; i < output_size" - + str(i) - + "; ++i)\n" - + " {\n" - " if (" - + name - + "h[i] != same_res)\n" - + " {\n" - + ' printf("output[%d] = %f\\n", i, ' - + name - + "h[i]);\n" - + " exit(1);\n" - + " }\n" - + " }\n" - ) + if args.gen_check_code: + s_simple_check += ( + " float same_res = " + + name + + "h[0];\n" + + " for (int i = 1; i < output_size" + + str(i) + + "; ++i)\n" + + " {\n" + " if (" + + name + + "h[i] != same_res)\n" + + " {\n" + + ' printf("output[%d] = %f\\n", i, ' + + name + + "h[i]);\n" + + " exit(1);\n" + + " }\n" + + " }\n" + ) if backend == "antares": kernel_name = "template_op_kernel0" @@ -438,7 +440,7 @@ def compile_and_run_kernel( device_id, idx, ): - print(f"rProg[{idx}]: {rprog.Dump()}") + print(f"[{idx}] rProg: ", rprog.Dump()) block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1) grid_size = rprog.GetParallelism(0) blocks = (block_size, 1, 1) @@ -518,8 +520,24 @@ def compile_and_run_kernel( os.system("rm {}.cu".format(file_name)) with open(log_name, "r") as f: - for line in f.readlines(): - print(line, end="") + print("Profiling result:") + lines = f.readlines() + if compute_capability >= "80": + for l in range(len(lines)): + if "Time (%)" in lines[l] and "Instances" in lines[l]: + print(lines[l] + lines[l + 2]) + break + else: + for l in range(len(lines)): + if "Type" in lines[l] and "Time(%)" in lines[l]: + print(lines[l], end="") + if ( + "default_function_kernel0" + if not LatestTVM + else "main_kernel" in lines[l] + ): + print(lines[l]) + break exec_time = get_time_from_nvprof_file(log_name) os.system("rm {}".format(log_name)) @@ -564,6 +582,7 @@ def eval_thread( if __name__ == "__main__": + printBanner(row_symbol="=", col_symbol="||", length=100, context="Namespace") print(args) expr = globals()[args.op] if args.fuse: @@ -613,11 +632,21 @@ def eval_thread( rprog.AddTile(0, rTile0) rprogs = [rprog] - 
print("-------------------use artificial rtile---------------------------") + printBanner( + row_symbol="-", col_symbol="|", length=100, context="Use artificial rtile" + ) else: + printBanner( + row_symbol="-", col_symbol="|", length=100, context="Emiting configs" + ) rprogs = policy.emit_config_without_trails(args.topk) - print("Evaluating top {} configs".format(len(rprogs))) + printBanner( + row_symbol="-", + col_symbol="|", + length=100, + context="Evaluating top {} configs".format(len(rprogs)), + ) rprog_idx = alloc_configs_for_subprocess(args.num_threads, len(rprogs)) threads = [] @@ -650,8 +679,15 @@ def eval_thread( eval_time = time.time() - start_time + printBanner(row_symbol="v", col_symbol="|", length=100, context="Perf Report") print("Top1 time: {} ms".format(top1_time)) print("Top10 time: {} ms".format(best_time)) + if LatestTVM: + print( + "Best perf: {} TFLOPS".format( + tvm.tir.analysis.estimate_tir_flops(rprogs[0].sche.mod) / best_time * 1e-9 + ) + ) print("Best idx: {}".format(best_idx)) print("Best config: {}".format(rprogs[best_idx].Dump())) print("Top1 compile time: {} s".format(emit_time)) diff --git a/tests/short-time-LatestTVM.sh b/tests/short-time-LatestTVM.sh index a1742ee..493dc81 100755 --- a/tests/short-time-LatestTVM.sh +++ b/tests/short-time-LatestTVM.sh @@ -9,9 +9,9 @@ run_benchmarks() { local gpu_id=$1 local device_name=$2 - echo "========================================" + echo "====================================================================" echo "Running benchmarks on $device_name (CUDA_VISIBLE_DEVICES=$gpu_id)" - echo "========================================" + echo "====================================================================" export CUDA_VISIBLE_DEVICES=$gpu_id diff --git a/tests/short-time-OldTVM.sh b/tests/short-time-OldTVM.sh index 9b04a14..04fdaa6 100755 --- a/tests/short-time-OldTVM.sh +++ b/tests/short-time-OldTVM.sh @@ -9,9 +9,9 @@ run_benchmarks() { local gpu_id=$1 local device_name=$2 - echo 
"========================================" + echo "====================================================================" echo "Running benchmarks on $device_name (CUDA_VISIBLE_DEVICES=$gpu_id)" - echo "========================================" + echo "====================================================================" export CUDA_VISIBLE_DEVICES=$gpu_id diff --git a/utils/commons.py b/utils/commons.py index 79f837d..95f9e53 100644 --- a/utils/commons.py +++ b/utils/commons.py @@ -33,6 +33,8 @@ compute_capability = tvm.runtime.cuda(0).compute_version.replace(".", "") +Backend = Literal["tvm", "antares"] + def deprecated(exit_immediately=True): """Deprecated decorator""" @@ -282,4 +284,20 @@ def get_result(self): return None -Backend = Literal["tvm", "antares"] +def printBanner( + row_symbol: str = "=", col_symbol: str = "||", length: int = 100, context: str = "" +): + banner = row_symbol * length + start_end_border = col_symbol + all_white_space_len = len(banner) - 2 * len(start_end_border) - len(context) + white_space_prefix = " " * (all_white_space_len // 2) + white_space_suffix = " " * (all_white_space_len - len(white_space_prefix)) + print(banner) + print( + start_end_border + + white_space_prefix + + context + + white_space_suffix + + start_end_border + ) + print(banner) From c3b3ecf8cb6a6ab8da6d92ddd36de83674a17444 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Mon, 29 Sep 2025 18:18:41 +0800 Subject: [PATCH 5/5] fix bug There's a critical bug in this conditional statement due to incorrect parenthesization. The expression "main_kernel" in lines[l] is evaluated first, and its boolean result is used in the ternary expression. When not LatestTVM is true, the condition becomes if "default_function_kernel0":, which is always true because a non-empty string is truthy. This will cause the loop to break prematurely on the first line. 
The correct logic should be if ("default_function_kernel0" if not LatestTVM else "main_kernel") in lines[l] --- test_op.py | 6 ++---- test_op_mp.py | 10 +++++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/test_op.py b/test_op.py index 415cb5c..a49f37c 100644 --- a/test_op.py +++ b/test_op.py @@ -590,10 +590,8 @@ def get_tvm_source( if "Type" in lines[l] and "Time(%)" in lines[l]: print(lines[l], end="") if ( - "default_function_kernel0" - if not LatestTVM - else "main_kernel" in lines[l] - ): + "default_function_kernel0" if not LatestTVM else "main_kernel" + ) in lines[l]: print(lines[l]) break diff --git a/test_op_mp.py b/test_op_mp.py index 8860a43..156c717 100644 --- a/test_op_mp.py +++ b/test_op_mp.py @@ -532,10 +532,8 @@ def compile_and_run_kernel( if "Type" in lines[l] and "Time(%)" in lines[l]: print(lines[l], end="") if ( - "default_function_kernel0" - if not LatestTVM - else "main_kernel" in lines[l] - ): + "default_function_kernel0" if not LatestTVM else "main_kernel" + ) in lines[l]: print(lines[l]) break @@ -685,7 +683,9 @@ def eval_thread( if LatestTVM: print( "Best perf: {} TFLOPS".format( - tvm.tir.analysis.estimate_tir_flops(rprogs[0].sche.mod) / best_time * 1e-9 + tvm.tir.analysis.estimate_tir_flops(rprogs[0].sche.mod) + / best_time + * 1e-9 ) ) print("Best idx: {}".format(best_idx))