Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 62 additions & 25 deletions test_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ def main_template(
for d in dim:
size *= d
byte = size * type_size
s_size += " int output_size" + str(i) + " = " + str(size) + ";\n"
if args.gen_check_code:
s_size += " int output_size" + str(i) + " = " + str(size) + ";\n"
s_hmalloc += " " + name + "h = (float*)malloc(" + str(byte) + ");\n"
s_hfree += " free(" + name + "h);\n"
s_dmalloc += " cudaMalloc((void **)&" + name + "d, " + str(byte) + ");\n"
Expand All @@ -185,25 +186,26 @@ def main_template(
+ str(byte)
+ ", cudaMemcpyDeviceToHost);\n"
)
s_simple_check += (
" float same_res = "
+ name
+ "h[0];\n"
+ " for (int i = 1; i < output_size"
+ str(i)
+ "; ++i)\n"
+ " {\n"
" if ("
+ name
+ "h[i] != same_res)\n"
+ " {\n"
+ ' printf("output[%d] = %f\\n", i, '
+ name
+ "h[i]);\n"
+ " exit(1);\n"
+ " }\n"
+ " }\n"
)
if args.gen_check_code:
s_simple_check += (
" float same_res = "
+ name
+ "h[0];\n"
+ " for (int i = 1; i < output_size"
+ str(i)
+ "; ++i)\n"
+ " {\n"
" if ("
+ name
+ "h[i] != same_res)\n"
+ " {\n"
+ ' printf("output[%d] = %f\\n", i, '
+ name
+ "h[i]);\n"
+ " exit(1);\n"
+ " }\n"
+ " }\n"
)

if backend == "antares":
kernel_name = "template_op_kernel0"
Expand Down Expand Up @@ -426,6 +428,7 @@ def get_tvm_source(


if __name__ == "__main__":
printBanner(row_symbol="=", col_symbol="||", length=100, context="Namespace")
print(args)
expr = globals()[args.op]
if args.fuse:
Expand Down Expand Up @@ -474,11 +477,21 @@ def get_tvm_source(
rprog.AddTile(0, rTile0)

rprogs = [rprog]
print("-------------------use artificial rtile---------------------------")
printBanner(
row_symbol="-", col_symbol="|", length=100, context="Use artificial rtile"
)
else:
printBanner(
            row_symbol="-", col_symbol="|", length=100, context="Emitting configs"
)
rprogs = policy.emit_config_without_trails(args.topk)

print("Evaluating top {} configs".format(len(rprogs)))
printBanner(
row_symbol="-",
col_symbol="|",
length=100,
context="Evaluating top {} configs".format(len(rprogs)),
)
best_idx = -1
best_time = 1e100
idx = 0
Expand All @@ -488,7 +501,7 @@ def get_tvm_source(
bar_id = 0
dtype = "float16" if args.use_tc else "float32"
for rprog in rprogs:
print(f"rProg[{idx}]: {rprog.Dump()}")
print(f"[{rprogs.index(rprog)}] rProg: ", rprog.Dump())
Comment thread
ConvolutedDog marked this conversation as resolved.
block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1)
grid_size = rprog.GetParallelism(0)
blocks = (block_size, 1, 1)
Expand Down Expand Up @@ -565,8 +578,22 @@ def get_tvm_source(
os.system("rm {}.cu".format(file_name))

with open(log_name, "r") as f:
for line in f.readlines():
print(line, end="")
print("Profiling result:")
lines = f.readlines()
if compute_capability >= "80":
for l in range(len(lines)):
if "Time (%)" in lines[l] and "Instances" in lines[l]:
print(lines[l] + lines[l + 2])
break
else:
for l in range(len(lines)):
if "Type" in lines[l] and "Time(%)" in lines[l]:
print(lines[l], end="")
if (
"default_function_kernel0" if not LatestTVM else "main_kernel"
) in lines[l]:
print(lines[l])
break

exec_time = get_time_from_nvprof_file(log_name)
os.system("rm {}".format(log_name))
Expand All @@ -591,12 +618,22 @@ def get_tvm_source(
evals.append(eval_results)
bar_id += 1

printBanner(row_symbol="v", col_symbol="|", length=100, context="Perf Report")
for topx, eval_results in zip(eval_bar, evals):
print("Eval top {} configs".format(topx))
print("Compilation time: {}s".format(eval_results["compilation time"]))
print("Best time: {}ms".format(eval_results["best time"]))
if LatestTVM:
print(
"Best perf: {} TFLOPS".format(
tvm.tir.analysis.estimate_tir_flops(rprog.sche.mod)
Comment thread
ConvolutedDog marked this conversation as resolved.
/ eval_results["best time"]
* 1e-9
)
)
print("Best config: {}".format(eval_results["best config"]))
print("Best idx: {}".format(eval_results["best idx"]))
print("-" * 100)

cu_file_name = "roller_{}_{}.cu".format(
args.op, "_".join([str(d) for d in args.shape])
Expand Down
86 changes: 61 additions & 25 deletions test_op_mp.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ def main_template(
for d in dim:
size *= d
byte = size * type_size
s_size += " int output_size" + str(i) + " = " + str(size) + ";\n"
if args.gen_check_code:
s_size += " int output_size" + str(i) + " = " + str(size) + ";\n"
s_hmalloc += " " + name + "h = (float*)malloc(" + str(byte) + ");\n"
s_hfree += " free(" + name + "h);\n"
s_dmalloc += " cudaMalloc((void **)&" + name + "d, " + str(byte) + ");\n"
Expand All @@ -189,25 +190,26 @@ def main_template(
+ str(byte)
+ ", cudaMemcpyDeviceToHost);\n"
)
s_simple_check += (
" float same_res = "
+ name
+ "h[0];\n"
+ " for (int i = 1; i < output_size"
+ str(i)
+ "; ++i)\n"
+ " {\n"
" if ("
+ name
+ "h[i] != same_res)\n"
+ " {\n"
+ ' printf("output[%d] = %f\\n", i, '
+ name
+ "h[i]);\n"
+ " exit(1);\n"
+ " }\n"
+ " }\n"
)
if args.gen_check_code:
s_simple_check += (
" float same_res = "
+ name
+ "h[0];\n"
+ " for (int i = 1; i < output_size"
+ str(i)
+ "; ++i)\n"
+ " {\n"
" if ("
+ name
+ "h[i] != same_res)\n"
+ " {\n"
+ ' printf("output[%d] = %f\\n", i, '
+ name
+ "h[i]);\n"
+ " exit(1);\n"
+ " }\n"
+ " }\n"
)

if backend == "antares":
kernel_name = "template_op_kernel0"
Expand Down Expand Up @@ -438,7 +440,7 @@ def compile_and_run_kernel(
device_id,
idx,
):
print(f"rProg[{idx}]: {rprog.Dump()}")
print(f"[{idx}] rProg: ", rprog.Dump())
Comment thread
ConvolutedDog marked this conversation as resolved.
block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1)
grid_size = rprog.GetParallelism(0)
blocks = (block_size, 1, 1)
Expand Down Expand Up @@ -518,8 +520,22 @@ def compile_and_run_kernel(
os.system("rm {}.cu".format(file_name))

with open(log_name, "r") as f:
for line in f.readlines():
print(line, end="")
print("Profiling result:")
lines = f.readlines()
if compute_capability >= "80":
for l in range(len(lines)):
if "Time (%)" in lines[l] and "Instances" in lines[l]:
print(lines[l] + lines[l + 2])
break
else:
for l in range(len(lines)):
if "Type" in lines[l] and "Time(%)" in lines[l]:
print(lines[l], end="")
if (
"default_function_kernel0" if not LatestTVM else "main_kernel"
) in lines[l]:
print(lines[l])
break

exec_time = get_time_from_nvprof_file(log_name)
os.system("rm {}".format(log_name))
Expand Down Expand Up @@ -564,6 +580,7 @@ def eval_thread(


if __name__ == "__main__":
printBanner(row_symbol="=", col_symbol="||", length=100, context="Namespace")
print(args)
expr = globals()[args.op]
if args.fuse:
Expand Down Expand Up @@ -613,11 +630,21 @@ def eval_thread(
rprog.AddTile(0, rTile0)

rprogs = [rprog]
print("-------------------use artificial rtile---------------------------")
printBanner(
row_symbol="-", col_symbol="|", length=100, context="Use artificial rtile"
)
else:
printBanner(
            row_symbol="-", col_symbol="|", length=100, context="Emitting configs"
)
rprogs = policy.emit_config_without_trails(args.topk)

print("Evaluating top {} configs".format(len(rprogs)))
printBanner(
row_symbol="-",
col_symbol="|",
length=100,
context="Evaluating top {} configs".format(len(rprogs)),
)

rprog_idx = alloc_configs_for_subprocess(args.num_threads, len(rprogs))
threads = []
Expand Down Expand Up @@ -650,8 +677,17 @@ def eval_thread(

eval_time = time.time() - start_time

printBanner(row_symbol="v", col_symbol="|", length=100, context="Perf Report")
print("Top1 time: {} ms".format(top1_time))
print("Top10 time: {} ms".format(best_time))
if LatestTVM:
print(
"Best perf: {} TFLOPS".format(
tvm.tir.analysis.estimate_tir_flops(rprogs[0].sche.mod)
/ best_time
* 1e-9
)
)
print("Best idx: {}".format(best_idx))
print("Best config: {}".format(rprogs[best_idx].Dump()))
print("Top1 compile time: {} s".format(emit_time))
Expand Down
4 changes: 2 additions & 2 deletions tests/short-time-LatestTVM.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ run_benchmarks() {
local gpu_id=$1
local device_name=$2

echo "========================================"
echo "===================================================================="
echo "Running benchmarks on $device_name (CUDA_VISIBLE_DEVICES=$gpu_id)"
echo "========================================"
echo "===================================================================="

export CUDA_VISIBLE_DEVICES=$gpu_id

Expand Down
4 changes: 2 additions & 2 deletions tests/short-time-OldTVM.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ run_benchmarks() {
local gpu_id=$1
local device_name=$2

echo "========================================"
echo "===================================================================="
echo "Running benchmarks on $device_name (CUDA_VISIBLE_DEVICES=$gpu_id)"
echo "========================================"
echo "===================================================================="

export CUDA_VISIBLE_DEVICES=$gpu_id

Expand Down
20 changes: 19 additions & 1 deletion utils/commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@

compute_capability = tvm.runtime.cuda(0).compute_version.replace(".", "")

Backend = Literal["tvm", "antares"]


def deprecated(exit_immediately=True):
"""Deprecated decorator"""
Expand Down Expand Up @@ -282,4 +284,20 @@ def get_result(self):
return None


Backend = Literal["tvm", "antares"]
def printBanner(
    row_symbol: str = "=", col_symbol: str = "||", length: int = 100, context: str = ""
):
    """Print *context* centered between column borders, framed by two full rows.

    The top and bottom frame lines are ``length`` repetitions of ``row_symbol``;
    the middle line places ``col_symbol`` at each end with ``context`` centered
    in the remaining space (any odd leftover space falls on the right side).
    """
    horizontal_rule = row_symbol * length
    border = col_symbol
    # Space left for padding once both borders and the context text are placed.
    padding_total = len(horizontal_rule) - 2 * len(border) - len(context)
    left_pad = padding_total // 2
    right_pad = padding_total - left_pad
    middle_line = "".join([border, " " * left_pad, context, " " * right_pad, border])
    print(horizontal_rule)
    print(middle_line)
    print(horizontal_rule)
Loading