Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 62 additions & 25 deletions test_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ def main_template(
for d in dim:
size *= d
byte = size * type_size
s_size += " int output_size" + str(i) + " = " + str(size) + ";\n"
if args.gen_check_code:
s_size += " int output_size" + str(i) + " = " + str(size) + ";\n"
s_hmalloc += " " + name + "h = (float*)malloc(" + str(byte) + ");\n"
s_hfree += " free(" + name + "h);\n"
s_dmalloc += " cudaMalloc((void **)&" + name + "d, " + str(byte) + ");\n"
Expand All @@ -185,25 +186,26 @@ def main_template(
+ str(byte)
+ ", cudaMemcpyDeviceToHost);\n"
)
s_simple_check += (
" float same_res = "
+ name
+ "h[0];\n"
+ " for (int i = 1; i < output_size"
+ str(i)
+ "; ++i)\n"
+ " {\n"
" if ("
+ name
+ "h[i] != same_res)\n"
+ " {\n"
+ ' printf("output[%d] = %f\\n", i, '
+ name
+ "h[i]);\n"
+ " exit(1);\n"
+ " }\n"
+ " }\n"
)
if args.gen_check_code:
s_simple_check += (
" float same_res = "
+ name
+ "h[0];\n"
+ " for (int i = 1; i < output_size"
+ str(i)
+ "; ++i)\n"
+ " {\n"
" if ("
+ name
+ "h[i] != same_res)\n"
+ " {\n"
+ ' printf("output[%d] = %f\\n", i, '
+ name
+ "h[i]);\n"
+ " exit(1);\n"
+ " }\n"
+ " }\n"
)

if backend == "antares":
kernel_name = "template_op_kernel0"
Expand Down Expand Up @@ -426,6 +428,7 @@ def get_tvm_source(


if __name__ == "__main__":
printBanner(row_symbol="=", col_symbol="||", length=100, context="Namespace")
print(args)
expr = globals()[args.op]
if args.fuse:
Expand Down Expand Up @@ -474,11 +477,21 @@ def get_tvm_source(
rprog.AddTile(0, rTile0)

rprogs = [rprog]
print("-------------------use artificial rtile---------------------------")
printBanner(
row_symbol="-", col_symbol="|", length=100, context="Use artificial rtile"
)
else:
printBanner(
            row_symbol="-", col_symbol="|", length=100, context="Emitting configs"
)
rprogs = policy.emit_config_without_trails(args.topk)

print("Evaluating top {} configs".format(len(rprogs)))
printBanner(
row_symbol="-",
col_symbol="|",
length=100,
context="Evaluating top {} configs".format(len(rprogs)),
)
best_idx = -1
best_time = 1e100
idx = 0
Expand All @@ -488,7 +501,7 @@ def get_tvm_source(
bar_id = 0
dtype = "float16" if args.use_tc else "float32"
for rprog in rprogs:
print(f"rProg[{idx}]: {rprog.Dump()}")
print(f"[{rprogs.index(rprog)}] rProg: ", rprog.Dump())
Comment thread
ConvolutedDog marked this conversation as resolved.
block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1)
grid_size = rprog.GetParallelism(0)
blocks = (block_size, 1, 1)
Expand Down Expand Up @@ -565,8 +578,22 @@ def get_tvm_source(
os.system("rm {}.cu".format(file_name))

with open(log_name, "r") as f:
for line in f.readlines():
print(line, end="")
print("Profiling result:")
lines = f.readlines()
if compute_capability >= "80":
for l in range(len(lines)):
if "Time (%)" in lines[l] and "Instances" in lines[l]:
print(lines[l] + lines[l + 2])
break
else:
for l in range(len(lines)):
if "Type" in lines[l] and "Time(%)" in lines[l]:
print(lines[l], end="")
if (
"default_function_kernel0" if not LatestTVM else "main_kernel"
) in lines[l]:
print(lines[l])
break

exec_time = get_time_from_nvprof_file(log_name)
os.system("rm {}".format(log_name))
Expand All @@ -591,12 +618,22 @@ def get_tvm_source(
evals.append(eval_results)
bar_id += 1

printBanner(row_symbol="v", col_symbol="|", length=100, context="Perf Report")
for topx, eval_results in zip(eval_bar, evals):
print("Eval top {} configs".format(topx))
print("Compilation time: {}s".format(eval_results["compilation time"]))
print("Best time: {}ms".format(eval_results["best time"]))
if LatestTVM:
print(
"Best perf: {} TFLOPS".format(
tvm.tir.analysis.estimate_tir_flops(rprog.sche.mod)
Comment thread
ConvolutedDog marked this conversation as resolved.
/ eval_results["best time"]
* 1e-9
)
)
print("Best config: {}".format(eval_results["best config"]))
print("Best idx: {}".format(eval_results["best idx"]))
print("-" * 100)

cu_file_name = "roller_{}_{}.cu".format(
args.op, "_".join([str(d) for d in args.shape])
Expand Down
86 changes: 61 additions & 25 deletions test_op_mp.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ def main_template(
for d in dim:
size *= d
byte = size * type_size
s_size += " int output_size" + str(i) + " = " + str(size) + ";\n"
if args.gen_check_code:
s_size += " int output_size" + str(i) + " = " + str(size) + ";\n"
s_hmalloc += " " + name + "h = (float*)malloc(" + str(byte) + ");\n"
s_hfree += " free(" + name + "h);\n"
s_dmalloc += " cudaMalloc((void **)&" + name + "d, " + str(byte) + ");\n"
Expand All @@ -189,25 +190,26 @@ def main_template(
+ str(byte)
+ ", cudaMemcpyDeviceToHost);\n"
)
s_simple_check += (
" float same_res = "
+ name
+ "h[0];\n"
+ " for (int i = 1; i < output_size"
+ str(i)
+ "; ++i)\n"
+ " {\n"
" if ("
+ name
+ "h[i] != same_res)\n"
+ " {\n"
+ ' printf("output[%d] = %f\\n", i, '
+ name
+ "h[i]);\n"
+ " exit(1);\n"
+ " }\n"
+ " }\n"
)
if args.gen_check_code:
s_simple_check += (
" float same_res = "
+ name
+ "h[0];\n"
+ " for (int i = 1; i < output_size"
+ str(i)
+ "; ++i)\n"
+ " {\n"
" if ("
+ name
+ "h[i] != same_res)\n"
+ " {\n"
+ ' printf("output[%d] = %f\\n", i, '
+ name
+ "h[i]);\n"
+ " exit(1);\n"
+ " }\n"
+ " }\n"
)

if backend == "antares":
kernel_name = "template_op_kernel0"
Expand Down Expand Up @@ -438,7 +440,7 @@ def compile_and_run_kernel(
device_id,
idx,
):
print(f"rProg[{idx}]: {rprog.Dump()}")
print(f"[{idx}] rProg: ", rprog.Dump())
Comment thread
ConvolutedDog marked this conversation as resolved.
block_size = rprog.GetParallelism(1) * (32 if args.use_tc else 1)
grid_size = rprog.GetParallelism(0)
blocks = (block_size, 1, 1)
Expand Down Expand Up @@ -518,8 +520,22 @@ def compile_and_run_kernel(
os.system("rm {}.cu".format(file_name))

with open(log_name, "r") as f:
for line in f.readlines():
print(line, end="")
print("Profiling result:")
lines = f.readlines()
if compute_capability >= "80":
for l in range(len(lines)):
if "Time (%)" in lines[l] and "Instances" in lines[l]:
print(lines[l] + lines[l + 2])
break
else:
for l in range(len(lines)):
if "Type" in lines[l] and "Time(%)" in lines[l]:
print(lines[l], end="")
if (
"default_function_kernel0" if not LatestTVM else "main_kernel"
) in lines[l]:
print(lines[l])
break

exec_time = get_time_from_nvprof_file(log_name)
os.system("rm {}".format(log_name))
Expand Down Expand Up @@ -564,6 +580,7 @@ def eval_thread(


if __name__ == "__main__":
printBanner(row_symbol="=", col_symbol="||", length=100, context="Namespace")
print(args)
expr = globals()[args.op]
if args.fuse:
Expand Down Expand Up @@ -613,11 +630,21 @@ def eval_thread(
rprog.AddTile(0, rTile0)

rprogs = [rprog]
print("-------------------use artificial rtile---------------------------")
printBanner(
row_symbol="-", col_symbol="|", length=100, context="Use artificial rtile"
)
else:
printBanner(
            row_symbol="-", col_symbol="|", length=100, context="Emitting configs"
)
rprogs = policy.emit_config_without_trails(args.topk)

print("Evaluating top {} configs".format(len(rprogs)))
printBanner(
row_symbol="-",
col_symbol="|",
length=100,
context="Evaluating top {} configs".format(len(rprogs)),
)

rprog_idx = alloc_configs_for_subprocess(args.num_threads, len(rprogs))
threads = []
Expand Down Expand Up @@ -650,8 +677,17 @@ def eval_thread(

eval_time = time.time() - start_time

printBanner(row_symbol="v", col_symbol="|", length=100, context="Perf Report")
print("Top1 time: {} ms".format(top1_time))
print("Top10 time: {} ms".format(best_time))
if LatestTVM:
print(
"Best perf: {} TFLOPS".format(
tvm.tir.analysis.estimate_tir_flops(rprogs[0].sche.mod)
/ best_time
* 1e-9
)
)
print("Best idx: {}".format(best_idx))
print("Best config: {}".format(rprogs[best_idx].Dump()))
print("Top1 compile time: {} s".format(emit_time))
Expand Down
4 changes: 2 additions & 2 deletions tests/short-time-LatestTVM.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ run_benchmarks() {
local gpu_id=$1
local device_name=$2

echo "========================================"
echo "===================================================================="
echo "Running benchmarks on $device_name (CUDA_VISIBLE_DEVICES=$gpu_id)"
echo "========================================"
echo "===================================================================="

export CUDA_VISIBLE_DEVICES=$gpu_id

Expand Down
4 changes: 2 additions & 2 deletions tests/short-time-OldTVM.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ run_benchmarks() {
local gpu_id=$1
local device_name=$2

echo "========================================"
echo "===================================================================="
echo "Running benchmarks on $device_name (CUDA_VISIBLE_DEVICES=$gpu_id)"
echo "========================================"
echo "===================================================================="

export CUDA_VISIBLE_DEVICES=$gpu_id

Expand Down
20 changes: 19 additions & 1 deletion utils/commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@

compute_capability = tvm.runtime.cuda(0).compute_version.replace(".", "")

Backend = Literal["tvm", "antares"]


def deprecated(exit_immediately=True):
"""Deprecated decorator"""
Expand Down Expand Up @@ -282,4 +284,20 @@ def get_result(self):
return None


Backend = Literal["tvm", "antares"]
def printBanner(
    row_symbol: str = "=", col_symbol: str = "||", length: int = 100, context: str = ""
):
    """Print *context* centered between column borders, framed by two full rows.

    The top and bottom frame lines are ``length`` repetitions of ``row_symbol``;
    the middle line places ``col_symbol`` at each end with ``context`` centered
    in the remaining space (any odd leftover space falls on the right side).
    """
    horizontal_rule = row_symbol * length
    border = col_symbol
    # Space left for padding once both borders and the context text are placed.
    padding_total = len(horizontal_rule) - 2 * len(border) - len(context)
    left_pad = padding_total // 2
    right_pad = padding_total - left_pad
    middle_line = "".join([border, " " * left_pad, context, " " * right_pad, border])
    print(horizontal_rule)
    print(middle_line)
    print(horizontal_rule)
Loading