From dbd2bda584ac044c8452f30120d8bc12550b1d40 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Mon, 29 Sep 2025 20:30:19 +0800 Subject: [PATCH 1/2] [Test] Optimize the speed of tests - Restrict compilation correctness tests to GV100 only for faster validation - Use RTX 4090 exclusively for profiling functionality test - Reduce matmul problem size from 4096 to 1024 for quicker iterations - Remove H100 from the test matrix to speed up the tests --- tests/short-time-LatestTVM.sh | 21 ++++++++++++--------- tests/short-time-OldTVM.sh | 21 ++++++++++++--------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/tests/short-time-LatestTVM.sh b/tests/short-time-LatestTVM.sh index 493dc81..358170e 100755 --- a/tests/short-time-LatestTVM.sh +++ b/tests/short-time-LatestTVM.sh @@ -15,22 +15,25 @@ run_benchmarks() { export CUDA_VISIBLE_DEVICES=$gpu_id - # Run conv benchmark - echo "Running fused_conv_expr_S1D1P1 (float16)..." - python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3 - - # Run matmul benchmarks - echo "Running matmul_expr (float16) with tensor cores..." - python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 4096 4096 4096 + # Only test the compiling functionality on GV100, because its profiling implementation is faster than that on RTX 4090. + if [[ "$device_name" == "NVIDIA GV100" ]]; then + # Run conv benchmark + echo "Running fused_conv_expr_S1D1P1 (float16)..." 
+ python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3 + + # Run matmul benchmarks + echo "Running matmul_expr (float16) with tensor cores..." + python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024 + fi + # Only test the profiling functionality on RTX 4090, the rightness of the matmul_expr has been tested on GV100. echo "Running matmul_expr (float32)..." - python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 4096 4096 4096 + python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024 echo "" } # Run benchmarks on all GPUs -run_benchmarks 0 "NVIDIA H100" run_benchmarks 5 "NVIDIA RTX 4090" run_benchmarks 4 "NVIDIA GV100" diff --git a/tests/short-time-OldTVM.sh b/tests/short-time-OldTVM.sh index 04fdaa6..f0754d6 100755 --- a/tests/short-time-OldTVM.sh +++ b/tests/short-time-OldTVM.sh @@ -15,22 +15,25 @@ run_benchmarks() { export CUDA_VISIBLE_DEVICES=$gpu_id - # Run conv benchmark - echo "Running fused_conv_expr_S1D1P1 (float16)..." 
- python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3 - - # Run matmul benchmarks - echo "Running matmul_expr (float16) with tensor cores..." - python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 4096 4096 4096 + # Only test the compiling functionality on GV100, because its profiling implementation is faster than that on RTX 4090. + if [[ "$device_name" == "NVIDIA GV100" ]]; then + # Run conv benchmark + echo "Running fused_conv_expr_S1D1P1 (float16)..." + python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3 + + # Run matmul benchmarks + echo "Running matmul_expr (float16) with tensor cores..." + python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024 + fi + # Only test the profiling functionality on RTX 4090, the rightness of the matmul_expr has been tested on GV100. echo "Running matmul_expr (float32)..." 
- python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 4096 4096 4096 + python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024 echo "" } # Run benchmarks on all GPUs -run_benchmarks 0 "NVIDIA H100" run_benchmarks 5 "NVIDIA RTX 4090" run_benchmarks 4 "NVIDIA GV100" From c5114af96e12b5d0a037e4a3e729429fca8aa234 Mon Sep 17 00:00:00 2001 From: ConvolutedDog Date: Mon, 29 Sep 2025 20:36:35 +0800 Subject: [PATCH 2/2] Speedup --- tests/short-time-LatestTVM.sh | 10 ++++++---- tests/short-time-OldTVM.sh | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/short-time-LatestTVM.sh b/tests/short-time-LatestTVM.sh index 358170e..db18673 100755 --- a/tests/short-time-LatestTVM.sh +++ b/tests/short-time-LatestTVM.sh @@ -22,13 +22,15 @@ run_benchmarks() { python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3 # Run matmul benchmarks - echo "Running matmul_expr (float16) with tensor cores..." - python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024 + echo "Running matmul_expr (float32)..." 
+ python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024 fi # Only test the profiling functionality on RTX 4090, the rightness of the matmul_expr has been tested on GV100. - echo "Running matmul_expr (float32)..." - python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024 + if [[ "$device_name" == "NVIDIA RTX 4090" ]]; then + echo "Running matmul_expr (float16) with tensor cores..." + python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024 + fi echo "" } diff --git a/tests/short-time-OldTVM.sh b/tests/short-time-OldTVM.sh index f0754d6..23974b7 100755 --- a/tests/short-time-OldTVM.sh +++ b/tests/short-time-OldTVM.sh @@ -22,13 +22,15 @@ run_benchmarks() { python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3 # Run matmul benchmarks - echo "Running matmul_expr (float16) with tensor cores..." - python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024 + echo "Running matmul_expr (float32)..." 
+ python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024 fi # Only test the profiling functionality on RTX 4090, the rightness of the matmul_expr has been tested on GV100. - echo "Running matmul_expr (float32)..." - python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024 + if [[ "$device_name" == "NVIDIA RTX 4090" ]]; then + echo "Running matmul_expr (float16) with tensor cores..." + python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024 + fi echo "" }