From 1c3b67c2d7c0e6f7fd128cd788b8a36137b3556e Mon Sep 17 00:00:00 2001
From: ConvolutedDog <yangjianchao16@nudt.edu.cn>
Date: Sun, 28 Sep 2025 21:15:16 +0800
Subject: [PATCH 1/5] [RFC] Clean up debug code and improve output messages

---
 test_op.py    | 2 +-
 test_op_mp.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_op.py b/test_op.py
index 963981f..dba6863 100644
--- a/test_op.py
+++ b/test_op.py
@@ -488,7 +488,7 @@ def get_tvm_source(
     bar_id = 0
     dtype = "float16" if args.use_tc else "float32"
     for rprog in rprogs:
-        print(f"rProg[{idx}]: {rprog.Dump()}")
+        print("rProg: ", rprog.Dump())
         block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1)
         grid_size = rprog.GetParallelism(0)
         blocks = (block_size, 1, 1)
diff --git a/test_op_mp.py b/test_op_mp.py
index 07cceb1..f584fe1 100644
--- a/test_op_mp.py
+++ b/test_op_mp.py
@@ -438,7 +438,7 @@ def compile_and_run_kernel(
     device_id,
     idx,
 ):
-    print(f"rProg[{idx}]: {rprog.Dump()}")
+    print("rProg: ", rprog.Dump())
     block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1)
     grid_size = rprog.GetParallelism(0)
     blocks = (block_size, 1, 1)

From 40aa0668482f889e8bfb95c3accede5aa1ede583 Mon Sep 17 00:00:00 2001
From: ConvolutedDog <yangjianchao16@nudt.edu.cn>
Date: Sun, 28 Sep 2025 21:22:29 +0800
Subject: [PATCH 2/5] Update test_op_mp.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 test_op_mp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_op_mp.py b/test_op_mp.py
index f584fe1..07cceb1 100644
--- a/test_op_mp.py
+++ b/test_op_mp.py
@@ -438,7 +438,7 @@ def compile_and_run_kernel(
     device_id,
     idx,
 ):
-    print("rProg: ", rprog.Dump())
+    print(f"rProg[{idx}]: {rprog.Dump()}")
     block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1)
     grid_size = rprog.GetParallelism(0)
     blocks = (block_size, 1, 1)

From abdc4485a9067cdbaa1d21e1153a7e09799c124f Mon Sep 17 00:00:00 2001
From: ConvolutedDog <yangjianchao16@nudt.edu.cn>
Date: Sun, 28 Sep 2025 21:23:32 +0800
Subject: [PATCH 3/5] Update test_op.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 test_op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_op.py b/test_op.py
index dba6863..963981f 100644
--- a/test_op.py
+++ b/test_op.py
@@ -488,7 +488,7 @@ def get_tvm_source(
     bar_id = 0
     dtype = "float16" if args.use_tc else "float32"
     for rprog in rprogs:
-        print("rProg: ", rprog.Dump())
+        print(f"rProg[{idx}]: {rprog.Dump()}")
         block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1)
         grid_size = rprog.GetParallelism(0)
         blocks = (block_size, 1, 1)

From cbfb403fe58ea4aa5ff06ae4ec787443057ec3d5 Mon Sep 17 00:00:00 2001
From: ConvolutedDog <yangjianchao16@nudt.edu.cn>
Date: Mon, 29 Sep 2025 17:26:14 +0800
Subject: [PATCH 4/5] [Perf] Enhance profiling output and add performance
 metrics

- Generate the output verification (check) code only when `--gen-check-code` is set
- Improve nvprof output parsing for different compute capabilities
- Add TFLOPS calculation in performance reports
- Enhance console output with formatted banners and progress indicators
- Maintain multi-process evaluation support in test_op_mp.py
---
 test_op.py                    | 89 +++++++++++++++++++++++++----------
 test_op_mp.py                 | 86 +++++++++++++++++++++++----------
 tests/short-time-LatestTVM.sh |  4 +-
 tests/short-time-OldTVM.sh    |  4 +-
 utils/commons.py              | 20 +++++++-
 5 files changed, 148 insertions(+), 55 deletions(-)

diff --git a/test_op.py b/test_op.py
index 963981f..415cb5c 100644
--- a/test_op.py
+++ b/test_op.py
@@ -171,7 +171,8 @@ def main_template(
         for d in dim:
             size *= d
         byte = size * type_size
-        s_size += "    int output_size" + str(i) + " = " + str(size) + ";\n"
+        if args.gen_check_code:
+            s_size += "    int output_size" + str(i) + " = " + str(size) + ";\n"
         s_hmalloc += "    " + name + "h = (float*)malloc(" + str(byte) + ");\n"
         s_hfree += "    free(" + name + "h);\n"
         s_dmalloc += "    cudaMalloc((void **)&" + name + "d, " + str(byte) + ");\n"
@@ -185,25 +186,26 @@ def main_template(
             + str(byte)
             + ", cudaMemcpyDeviceToHost);\n"
         )
-        s_simple_check += (
-            "    float same_res = "
-            + name
-            + "h[0];\n"
-            + "    for (int i = 1; i < output_size"
-            + str(i)
-            + "; ++i)\n"
-            + "    {\n"
-            "        if ("
-            + name
-            + "h[i] != same_res)\n"
-            + "        {\n"
-            + '            printf("output[%d] = %f\\n", i, '
-            + name
-            + "h[i]);\n"
-            + "            exit(1);\n"
-            + "        }\n"
-            + "    }\n"
-        )
+        if args.gen_check_code:
+            s_simple_check += (
+                "    float same_res = "
+                + name
+                + "h[0];\n"
+                + "    for (int i = 1; i < output_size"
+                + str(i)
+                + "; ++i)\n"
+                + "    {\n"
+                "        if ("
+                + name
+                + "h[i] != same_res)\n"
+                + "        {\n"
+                + '            printf("output[%d] = %f\\n", i, '
+                + name
+                + "h[i]);\n"
+                + "            exit(1);\n"
+                + "        }\n"
+                + "    }\n"
+            )
 
     if backend == "antares":
         kernel_name = "template_op_kernel0"
@@ -426,6 +428,7 @@ def get_tvm_source(
 
 
 if __name__ == "__main__":
+    printBanner(row_symbol="=", col_symbol="||", length=100, context="Namespace")
     print(args)
     expr = globals()[args.op]
     if args.fuse:
@@ -474,11 +477,21 @@ def get_tvm_source(
         rprog.AddTile(0, rTile0)
 
         rprogs = [rprog]
-        print("-------------------use artificial rtile---------------------------")
+        printBanner(
+            row_symbol="-", col_symbol="|", length=100, context="Use artificial rtile"
+        )
     else:
+        printBanner(
+            row_symbol="-", col_symbol="|", length=100, context="Emiting configs"
+        )
         rprogs = policy.emit_config_without_trails(args.topk)
 
-    print("Evaluating top {} configs".format(len(rprogs)))
+    printBanner(
+        row_symbol="-",
+        col_symbol="|",
+        length=100,
+        context="Evaluating top {} configs".format(len(rprogs)),
+    )
     best_idx = -1
     best_time = 1e100
     idx = 0
@@ -488,7 +501,7 @@ def get_tvm_source(
     bar_id = 0
     dtype = "float16" if args.use_tc else "float32"
     for rprog in rprogs:
-        print(f"rProg[{idx}]: {rprog.Dump()}")
+        print(f"[{rprogs.index(rprog)}] rProg: ", rprog.Dump())
         block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1)
         grid_size = rprog.GetParallelism(0)
         blocks = (block_size, 1, 1)
@@ -565,8 +578,24 @@ def get_tvm_source(
         os.system("rm {}.cu".format(file_name))
 
         with open(log_name, "r") as f:
-            for line in f.readlines():
-                print(line, end="")
+            print("Profiling result:")
+            lines = f.readlines()
+            if compute_capability >= "80":
+                for l in range(len(lines)):
+                    if "Time (%)" in lines[l] and "Instances" in lines[l]:
+                        print(lines[l] + lines[l + 2])
+                        break
+            else:
+                for l in range(len(lines)):
+                    if "Type" in lines[l] and "Time(%)" in lines[l]:
+                        print(lines[l], end="")
+                    if (
+                        "default_function_kernel0"
+                        if not LatestTVM
+                        else "main_kernel" in lines[l]
+                    ):
+                        print(lines[l])
+                        break
 
         exec_time = get_time_from_nvprof_file(log_name)
         os.system("rm {}".format(log_name))
@@ -591,12 +620,22 @@ def get_tvm_source(
             evals.append(eval_results)
             bar_id += 1
 
+    printBanner(row_symbol="v", col_symbol="|", length=100, context="Perf Report")
     for topx, eval_results in zip(eval_bar, evals):
         print("Eval top {} configs".format(topx))
         print("Compilation time: {}s".format(eval_results["compilation time"]))
         print("Best time: {}ms".format(eval_results["best time"]))
+        if LatestTVM:
+            print(
+                "Best perf: {} TFLOPS".format(
+                    tvm.tir.analysis.estimate_tir_flops(rprog.sche.mod)
+                    / eval_results["best time"]
+                    * 1e-9
+                )
+            )
         print("Best config: {}".format(eval_results["best config"]))
         print("Best idx: {}".format(eval_results["best idx"]))
+        print("-" * 100)
 
     cu_file_name = "roller_{}_{}.cu".format(
         args.op, "_".join([str(d) for d in args.shape])
diff --git a/test_op_mp.py b/test_op_mp.py
index 07cceb1..8860a43 100644
--- a/test_op_mp.py
+++ b/test_op_mp.py
@@ -175,7 +175,8 @@ def main_template(
         for d in dim:
             size *= d
         byte = size * type_size
-        s_size += "    int output_size" + str(i) + " = " + str(size) + ";\n"
+        if args.gen_check_code:
+            s_size += "    int output_size" + str(i) + " = " + str(size) + ";\n"
         s_hmalloc += "    " + name + "h = (float*)malloc(" + str(byte) + ");\n"
         s_hfree += "    free(" + name + "h);\n"
         s_dmalloc += "    cudaMalloc((void **)&" + name + "d, " + str(byte) + ");\n"
@@ -189,25 +190,26 @@ def main_template(
             + str(byte)
             + ", cudaMemcpyDeviceToHost);\n"
         )
-        s_simple_check += (
-            "    float same_res = "
-            + name
-            + "h[0];\n"
-            + "    for (int i = 1; i < output_size"
-            + str(i)
-            + "; ++i)\n"
-            + "    {\n"
-            "        if ("
-            + name
-            + "h[i] != same_res)\n"
-            + "        {\n"
-            + '            printf("output[%d] = %f\\n", i, '
-            + name
-            + "h[i]);\n"
-            + "            exit(1);\n"
-            + "        }\n"
-            + "    }\n"
-        )
+        if args.gen_check_code:
+            s_simple_check += (
+                "    float same_res = "
+                + name
+                + "h[0];\n"
+                + "    for (int i = 1; i < output_size"
+                + str(i)
+                + "; ++i)\n"
+                + "    {\n"
+                "        if ("
+                + name
+                + "h[i] != same_res)\n"
+                + "        {\n"
+                + '            printf("output[%d] = %f\\n", i, '
+                + name
+                + "h[i]);\n"
+                + "            exit(1);\n"
+                + "        }\n"
+                + "    }\n"
+            )
 
     if backend == "antares":
         kernel_name = "template_op_kernel0"
@@ -438,7 +440,7 @@ def compile_and_run_kernel(
     device_id,
     idx,
 ):
-    print(f"rProg[{idx}]: {rprog.Dump()}")
+    print(f"[{idx}] rProg: ", rprog.Dump())
     block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1)
     grid_size = rprog.GetParallelism(0)
     blocks = (block_size, 1, 1)
@@ -518,8 +520,24 @@ def compile_and_run_kernel(
     os.system("rm {}.cu".format(file_name))
 
     with open(log_name, "r") as f:
-        for line in f.readlines():
-            print(line, end="")
+        print("Profiling result:")
+        lines = f.readlines()
+        if compute_capability >= "80":
+            for l in range(len(lines)):
+                if "Time (%)" in lines[l] and "Instances" in lines[l]:
+                    print(lines[l] + lines[l + 2])
+                    break
+        else:
+            for l in range(len(lines)):
+                if "Type" in lines[l] and "Time(%)" in lines[l]:
+                    print(lines[l], end="")
+                if (
+                    "default_function_kernel0"
+                    if not LatestTVM
+                    else "main_kernel" in lines[l]
+                ):
+                    print(lines[l])
+                    break
 
     exec_time = get_time_from_nvprof_file(log_name)
     os.system("rm {}".format(log_name))
@@ -564,6 +582,7 @@ def eval_thread(
 
 
 if __name__ == "__main__":
+    printBanner(row_symbol="=", col_symbol="||", length=100, context="Namespace")
     print(args)
     expr = globals()[args.op]
     if args.fuse:
@@ -613,11 +632,21 @@ def eval_thread(
         rprog.AddTile(0, rTile0)
 
         rprogs = [rprog]
-        print("-------------------use artificial rtile---------------------------")
+        printBanner(
+            row_symbol="-", col_symbol="|", length=100, context="Use artificial rtile"
+        )
     else:
+        printBanner(
+            row_symbol="-", col_symbol="|", length=100, context="Emiting configs"
+        )
         rprogs = policy.emit_config_without_trails(args.topk)
 
-    print("Evaluating top {} configs".format(len(rprogs)))
+    printBanner(
+        row_symbol="-",
+        col_symbol="|",
+        length=100,
+        context="Evaluating top {} configs".format(len(rprogs)),
+    )
 
     rprog_idx = alloc_configs_for_subprocess(args.num_threads, len(rprogs))
     threads = []
@@ -650,8 +679,15 @@ def eval_thread(
 
     eval_time = time.time() - start_time
 
+    printBanner(row_symbol="v", col_symbol="|", length=100, context="Perf Report")
     print("Top1 time: {} ms".format(top1_time))
     print("Top10 time: {} ms".format(best_time))
+    if LatestTVM:
+        print(
+            "Best perf: {} TFLOPS".format(
+                tvm.tir.analysis.estimate_tir_flops(rprogs[0].sche.mod) / best_time * 1e-9
+            )
+        )
     print("Best idx: {}".format(best_idx))
     print("Best config: {}".format(rprogs[best_idx].Dump()))
     print("Top1 compile time: {} s".format(emit_time))
diff --git a/tests/short-time-LatestTVM.sh b/tests/short-time-LatestTVM.sh
index a1742ee..493dc81 100755
--- a/tests/short-time-LatestTVM.sh
+++ b/tests/short-time-LatestTVM.sh
@@ -9,9 +9,9 @@ run_benchmarks() {
     local gpu_id=$1
     local device_name=$2
     
-    echo "========================================"
+    echo "===================================================================="
     echo "Running benchmarks on $device_name (CUDA_VISIBLE_DEVICES=$gpu_id)"
-    echo "========================================"
+    echo "===================================================================="
     
     export CUDA_VISIBLE_DEVICES=$gpu_id
     
diff --git a/tests/short-time-OldTVM.sh b/tests/short-time-OldTVM.sh
index 9b04a14..04fdaa6 100755
--- a/tests/short-time-OldTVM.sh
+++ b/tests/short-time-OldTVM.sh
@@ -9,9 +9,9 @@ run_benchmarks() {
     local gpu_id=$1
     local device_name=$2
     
-    echo "========================================"
+    echo "===================================================================="
     echo "Running benchmarks on $device_name (CUDA_VISIBLE_DEVICES=$gpu_id)"
-    echo "========================================"
+    echo "===================================================================="
     
     export CUDA_VISIBLE_DEVICES=$gpu_id
     
diff --git a/utils/commons.py b/utils/commons.py
index 79f837d..95f9e53 100644
--- a/utils/commons.py
+++ b/utils/commons.py
@@ -33,6 +33,8 @@
 
 compute_capability = tvm.runtime.cuda(0).compute_version.replace(".", "")
 
+Backend = Literal["tvm", "antares"]
+
 
 def deprecated(exit_immediately=True):
     """Deprecated decorator"""
@@ -282,4 +284,20 @@ def get_result(self):
             return None
 
 
-Backend = Literal["tvm", "antares"]
+def printBanner(
+    row_symbol: str = "=", col_symbol: str = "||", length: int = 100, context: str = ""
+):
+    banner = row_symbol * length
+    start_end_border = col_symbol
+    all_white_space_len = len(banner) - 2 * len(start_end_border) - len(context)
+    white_space_prefix = " " * (all_white_space_len // 2)
+    white_space_suffix = " " * (all_white_space_len - len(white_space_prefix))
+    print(banner)
+    print(
+        start_end_border
+        + white_space_prefix
+        + context
+        + white_space_suffix
+        + start_end_border
+    )
+    print(banner)

From c3b3ecf8cb6a6ab8da6d92ddd36de83674a17444 Mon Sep 17 00:00:00 2001
From: ConvolutedDog <yangjianchao16@nudt.edu.cn>
Date: Mon, 29 Sep 2025 18:18:41 +0800
Subject: [PATCH 5/5] Fix always-true kernel-name check in nvprof output parsing

There's a critical bug in this conditional statement caused by operator precedence: `in` binds more tightly than the conditional expression, so `"main_kernel" in lines[l]` is only the else branch of the ternary. When `not LatestTVM` is true, the condition reduces to `if "default_function_kernel0":`, which is always true because a non-empty string is truthy. This causes the loop to print and break prematurely on the first line of the log.

The correct logic is `if ("default_function_kernel0" if not LatestTVM else "main_kernel") in lines[l]:`.
---
 test_op.py    |  6 ++----
 test_op_mp.py | 10 +++++-----
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/test_op.py b/test_op.py
index 415cb5c..a49f37c 100644
--- a/test_op.py
+++ b/test_op.py
@@ -590,10 +590,8 @@ def get_tvm_source(
                     if "Type" in lines[l] and "Time(%)" in lines[l]:
                         print(lines[l], end="")
                     if (
-                        "default_function_kernel0"
-                        if not LatestTVM
-                        else "main_kernel" in lines[l]
-                    ):
+                        "default_function_kernel0" if not LatestTVM else "main_kernel"
+                    ) in lines[l]:
                         print(lines[l])
                         break
 
diff --git a/test_op_mp.py b/test_op_mp.py
index 8860a43..156c717 100644
--- a/test_op_mp.py
+++ b/test_op_mp.py
@@ -532,10 +532,8 @@ def compile_and_run_kernel(
                 if "Type" in lines[l] and "Time(%)" in lines[l]:
                     print(lines[l], end="")
                 if (
-                    "default_function_kernel0"
-                    if not LatestTVM
-                    else "main_kernel" in lines[l]
-                ):
+                    "default_function_kernel0" if not LatestTVM else "main_kernel"
+                ) in lines[l]:
                     print(lines[l])
                     break
 
@@ -685,7 +683,9 @@ def eval_thread(
     if LatestTVM:
         print(
             "Best perf: {} TFLOPS".format(
-                tvm.tir.analysis.estimate_tir_flops(rprogs[0].sche.mod) / best_time * 1e-9
+                tvm.tir.analysis.estimate_tir_flops(rprogs[0].sche.mod)
+                / best_time
+                * 1e-9
             )
         )
     print("Best idx: {}".format(best_idx))