diff --git a/test_op.py b/test_op.py index 963981f..a49f37c 100644 --- a/test_op.py +++ b/test_op.py @@ -171,7 +171,8 @@ def main_template( for d in dim: size *= d byte = size * type_size - s_size += " int output_size" + str(i) + " = " + str(size) + ";\n" + if args.gen_check_code: + s_size += " int output_size" + str(i) + " = " + str(size) + ";\n" s_hmalloc += " " + name + "h = (float*)malloc(" + str(byte) + ");\n" s_hfree += " free(" + name + "h);\n" s_dmalloc += " cudaMalloc((void **)&" + name + "d, " + str(byte) + ");\n" @@ -185,25 +186,26 @@ def main_template( + str(byte) + ", cudaMemcpyDeviceToHost);\n" ) - s_simple_check += ( - " float same_res = " - + name - + "h[0];\n" - + " for (int i = 1; i < output_size" - + str(i) - + "; ++i)\n" - + " {\n" - " if (" - + name - + "h[i] != same_res)\n" - + " {\n" - + ' printf("output[%d] = %f\\n", i, ' - + name - + "h[i]);\n" - + " exit(1);\n" - + " }\n" - + " }\n" - ) + if args.gen_check_code: + s_simple_check += ( + " float same_res = " + + name + + "h[0];\n" + + " for (int i = 1; i < output_size" + + str(i) + + "; ++i)\n" + + " {\n" + " if (" + + name + + "h[i] != same_res)\n" + + " {\n" + + ' printf("output[%d] = %f\\n", i, ' + + name + + "h[i]);\n" + + " exit(1);\n" + + " }\n" + + " }\n" + ) if backend == "antares": kernel_name = "template_op_kernel0" @@ -426,6 +428,7 @@ def get_tvm_source( if __name__ == "__main__": + printBanner(row_symbol="=", col_symbol="||", length=100, context="Namespace") print(args) expr = globals()[args.op] if args.fuse: @@ -474,11 +477,21 @@ def get_tvm_source( rprog.AddTile(0, rTile0) rprogs = [rprog] - print("-------------------use artificial rtile---------------------------") + printBanner( + row_symbol="-", col_symbol="|", length=100, context="Use artificial rtile" + ) else: + printBanner( + row_symbol="-", col_symbol="|", length=100, context="Emitting configs" + ) rprogs = policy.emit_config_without_trails(args.topk) - print("Evaluating top {} configs".format(len(rprogs))) + 
printBanner( + row_symbol="-", + col_symbol="|", + length=100, + context="Evaluating top {} configs".format(len(rprogs)), + ) best_idx = -1 best_time = 1e100 idx = 0 @@ -488,7 +501,7 @@ def get_tvm_source( bar_id = 0 dtype = "float16" if args.use_tc else "float32" for rprog in rprogs: - print(f"rProg[{idx}]: {rprog.Dump()}") + print(f"[{idx}] rProg: ", rprog.Dump()) block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1) grid_size = rprog.GetParallelism(0) blocks = (block_size, 1, 1) @@ -565,8 +578,22 @@ def get_tvm_source( os.system("rm {}.cu".format(file_name)) with open(log_name, "r") as f: - for line in f.readlines(): - print(line, end="") + print("Profiling result:") + lines = f.readlines() + if compute_capability >= "80": + for l in range(len(lines)): + if "Time (%)" in lines[l] and "Instances" in lines[l]: + print(lines[l] + lines[l + 2]) + break + else: + for l in range(len(lines)): + if "Type" in lines[l] and "Time(%)" in lines[l]: + print(lines[l], end="") + if ( + "default_function_kernel0" if not LatestTVM else "main_kernel" + ) in lines[l]: + print(lines[l]) + break exec_time = get_time_from_nvprof_file(log_name) os.system("rm {}".format(log_name)) @@ -591,12 +618,22 @@ def get_tvm_source( evals.append(eval_results) bar_id += 1 + printBanner(row_symbol="v", col_symbol="|", length=100, context="Perf Report") for topx, eval_results in zip(eval_bar, evals): print("Eval top {} configs".format(topx)) print("Compilation time: {}s".format(eval_results["compilation time"])) print("Best time: {}ms".format(eval_results["best time"])) + if LatestTVM: + print( + "Best perf: {} TFLOPS".format( + tvm.tir.analysis.estimate_tir_flops(rprog.sche.mod) + / eval_results["best time"] + * 1e-9 + ) + ) print("Best config: {}".format(eval_results["best config"])) print("Best idx: {}".format(eval_results["best idx"])) + print("-" * 100) cu_file_name = "roller_{}_{}.cu".format( args.op, "_".join([str(d) for d in args.shape]) diff --git a/test_op_mp.py 
b/test_op_mp.py index 07cceb1..156c717 100644 --- a/test_op_mp.py +++ b/test_op_mp.py @@ -175,7 +175,8 @@ def main_template( for d in dim: size *= d byte = size * type_size - s_size += " int output_size" + str(i) + " = " + str(size) + ";\n" + if args.gen_check_code: + s_size += " int output_size" + str(i) + " = " + str(size) + ";\n" s_hmalloc += " " + name + "h = (float*)malloc(" + str(byte) + ");\n" s_hfree += " free(" + name + "h);\n" s_dmalloc += " cudaMalloc((void **)&" + name + "d, " + str(byte) + ");\n" @@ -189,25 +190,26 @@ def main_template( + str(byte) + ", cudaMemcpyDeviceToHost);\n" ) - s_simple_check += ( - " float same_res = " - + name - + "h[0];\n" - + " for (int i = 1; i < output_size" - + str(i) - + "; ++i)\n" - + " {\n" - " if (" - + name - + "h[i] != same_res)\n" - + " {\n" - + ' printf("output[%d] = %f\\n", i, ' - + name - + "h[i]);\n" - + " exit(1);\n" - + " }\n" - + " }\n" - ) + if args.gen_check_code: + s_simple_check += ( + " float same_res = " + + name + + "h[0];\n" + + " for (int i = 1; i < output_size" + + str(i) + + "; ++i)\n" + + " {\n" + " if (" + + name + + "h[i] != same_res)\n" + + " {\n" + + ' printf("output[%d] = %f\\n", i, ' + + name + + "h[i]);\n" + + " exit(1);\n" + + " }\n" + + " }\n" + ) if backend == "antares": kernel_name = "template_op_kernel0" @@ -438,7 +440,7 @@ def compile_and_run_kernel( device_id, idx, ): - print(f"rProg[{idx}]: {rprog.Dump()}") + print(f"[{idx}] rProg: ", rprog.Dump()) block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1) grid_size = rprog.GetParallelism(0) blocks = (block_size, 1, 1) @@ -518,8 +520,22 @@ def compile_and_run_kernel( os.system("rm {}.cu".format(file_name)) with open(log_name, "r") as f: - for line in f.readlines(): - print(line, end="") + print("Profiling result:") + lines = f.readlines() + if compute_capability >= "80": + for l in range(len(lines)): + if "Time (%)" in lines[l] and "Instances" in lines[l]: + print(lines[l] + lines[l + 2]) + break + else: + for l in 
range(len(lines)): + if "Type" in lines[l] and "Time(%)" in lines[l]: + print(lines[l], end="") + if ( + "default_function_kernel0" if not LatestTVM else "main_kernel" + ) in lines[l]: + print(lines[l]) + break exec_time = get_time_from_nvprof_file(log_name) os.system("rm {}".format(log_name)) @@ -564,6 +580,7 @@ def eval_thread( if __name__ == "__main__": + printBanner(row_symbol="=", col_symbol="||", length=100, context="Namespace") print(args) expr = globals()[args.op] if args.fuse: @@ -613,11 +630,21 @@ def eval_thread( rprog.AddTile(0, rTile0) rprogs = [rprog] - print("-------------------use artificial rtile---------------------------") + printBanner( + row_symbol="-", col_symbol="|", length=100, context="Use artificial rtile" + ) else: + printBanner( + row_symbol="-", col_symbol="|", length=100, context="Emitting configs" + ) rprogs = policy.emit_config_without_trails(args.topk) - print("Evaluating top {} configs".format(len(rprogs))) + printBanner( + row_symbol="-", + col_symbol="|", + length=100, + context="Evaluating top {} configs".format(len(rprogs)), + ) rprog_idx = alloc_configs_for_subprocess(args.num_threads, len(rprogs)) threads = [] @@ -650,8 +677,17 @@ def eval_thread( eval_time = time.time() - start_time + printBanner(row_symbol="v", col_symbol="|", length=100, context="Perf Report") print("Top1 time: {} ms".format(top1_time)) print("Top10 time: {} ms".format(best_time)) + if LatestTVM: + print( + "Best perf: {} TFLOPS".format( + tvm.tir.analysis.estimate_tir_flops(rprogs[0].sche.mod) + / best_time + * 1e-9 + ) + ) print("Best idx: {}".format(best_idx)) print("Best config: {}".format(rprogs[best_idx].Dump())) print("Top1 compile time: {} s".format(emit_time)) diff --git a/tests/short-time-LatestTVM.sh b/tests/short-time-LatestTVM.sh index a1742ee..493dc81 100755 --- a/tests/short-time-LatestTVM.sh +++ b/tests/short-time-LatestTVM.sh @@ -9,9 +9,9 @@ run_benchmarks() { local gpu_id=$1 local device_name=$2 - echo 
"========================================" + echo "====================================================================" echo "Running benchmarks on $device_name (CUDA_VISIBLE_DEVICES=$gpu_id)" - echo "========================================" + echo "====================================================================" export CUDA_VISIBLE_DEVICES=$gpu_id diff --git a/tests/short-time-OldTVM.sh b/tests/short-time-OldTVM.sh index 9b04a14..04fdaa6 100755 --- a/tests/short-time-OldTVM.sh +++ b/tests/short-time-OldTVM.sh @@ -9,9 +9,9 @@ run_benchmarks() { local gpu_id=$1 local device_name=$2 - echo "========================================" + echo "====================================================================" echo "Running benchmarks on $device_name (CUDA_VISIBLE_DEVICES=$gpu_id)" - echo "========================================" + echo "====================================================================" export CUDA_VISIBLE_DEVICES=$gpu_id diff --git a/utils/commons.py b/utils/commons.py index 79f837d..95f9e53 100644 --- a/utils/commons.py +++ b/utils/commons.py @@ -33,6 +33,8 @@ compute_capability = tvm.runtime.cuda(0).compute_version.replace(".", "") +Backend = Literal["tvm", "antares"] + def deprecated(exit_immediately=True): """Deprecated decorator""" @@ -282,4 +284,20 @@ def get_result(self): return None -Backend = Literal["tvm", "antares"] +def printBanner( + row_symbol: str = "=", col_symbol: str = "||", length: int = 100, context: str = "" +): + banner = row_symbol * length + start_end_border = col_symbol + all_white_space_len = len(banner) - 2 * len(start_end_border) - len(context) + white_space_prefix = " " * (all_white_space_len // 2) + white_space_suffix = " " * (all_white_space_len - len(white_space_prefix)) + print(banner) + print( + start_end_border + + white_space_prefix + + context + + white_space_suffix + + start_end_border + ) + print(banner)