diff --git a/tests/short-time-LatestTVM.sh b/tests/short-time-LatestTVM.sh
index 493dc81..db18673 100755
--- a/tests/short-time-LatestTVM.sh
+++ b/tests/short-time-LatestTVM.sh
@@ -15,22 +15,27 @@ run_benchmarks() {
 
     export CUDA_VISIBLE_DEVICES=$gpu_id
 
-    # Run conv benchmark
-    echo "Running fused_conv_expr_S1D1P1 (float16)..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
+    # Only test the compiling functionality on GV100, because its profiling implementation is faster than that on RTX 4090.
+    if [[ "$device_name" == "NVIDIA GV100" ]]; then
+        # Run conv benchmark
+        echo "Running fused_conv_expr_S1D1P1 (float16)..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
+
+        # Run matmul benchmarks
+        echo "Running matmul_expr (float32)..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
+    fi
 
-    # Run matmul benchmarks
-    echo "Running matmul_expr (float16) with tensor cores..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 4096 4096 4096
-
-    echo "Running matmul_expr (float32)..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 4096 4096 4096
+    # Only test the profiling functionality on RTX 4090, the rightness of the matmul_expr has been tested on GV100.
+    if [[ "$device_name" == "NVIDIA RTX 4090" ]]; then
+        echo "Running matmul_expr (float16) with tensor cores..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
+    fi
 
     echo ""
 }
 
 # Run benchmarks on all GPUs
-run_benchmarks 0 "NVIDIA H100"
 run_benchmarks 5 "NVIDIA RTX 4090"
 run_benchmarks 4 "NVIDIA GV100"
 
diff --git a/tests/short-time-OldTVM.sh b/tests/short-time-OldTVM.sh
index 04fdaa6..23974b7 100755
--- a/tests/short-time-OldTVM.sh
+++ b/tests/short-time-OldTVM.sh
@@ -15,22 +15,27 @@ run_benchmarks() {
 
     export CUDA_VISIBLE_DEVICES=$gpu_id
 
-    # Run conv benchmark
-    echo "Running fused_conv_expr_S1D1P1 (float16)..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
+    # Only test the compiling functionality on GV100, because its profiling implementation is faster than that on RTX 4090.
+    if [[ "$device_name" == "NVIDIA GV100" ]]; then
+        # Run conv benchmark
+        echo "Running fused_conv_expr_S1D1P1 (float16)..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
+
+        # Run matmul benchmarks
+        echo "Running matmul_expr (float32)..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
+    fi
 
-    # Run matmul benchmarks
-    echo "Running matmul_expr (float16) with tensor cores..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 4096 4096 4096
-
-    echo "Running matmul_expr (float32)..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 4096 4096 4096
+    # Only test the profiling functionality on RTX 4090, the rightness of the matmul_expr has been tested on GV100.
+    if [[ "$device_name" == "NVIDIA RTX 4090" ]]; then
+        echo "Running matmul_expr (float16) with tensor cores..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
+    fi
 
     echo ""
 }
 
 # Run benchmarks on all GPUs
-run_benchmarks 0 "NVIDIA H100"
 run_benchmarks 5 "NVIDIA RTX 4090"
 run_benchmarks 4 "NVIDIA GV100"
 