From dbd2bda584ac044c8452f30120d8bc12550b1d40 Mon Sep 17 00:00:00 2001
From: ConvolutedDog <yangjianchao16@nudt.edu.cn>
Date: Mon, 29 Sep 2025 20:30:19 +0800
Subject: [PATCH 1/2] [Test] Optimize the speed of tests

- Restrict compilation correctness tests to GV100 only for faster validation
- Use RTX 4090 exclusively for the profiling functionality test
- Reduce matmul problem size from 4096 to 1024 for quicker iterations
- Remove H100 from the test matrix to speed up the tests
---
 tests/short-time-LatestTVM.sh | 21 ++++++++++++---------
 tests/short-time-OldTVM.sh    | 21 ++++++++++++---------
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/tests/short-time-LatestTVM.sh b/tests/short-time-LatestTVM.sh
index 493dc81..358170e 100755
--- a/tests/short-time-LatestTVM.sh
+++ b/tests/short-time-LatestTVM.sh
@@ -15,22 +15,25 @@ run_benchmarks() {
     
     export CUDA_VISIBLE_DEVICES=$gpu_id
     
-    # Run conv benchmark
-    echo "Running fused_conv_expr_S1D1P1 (float16)..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
-    
-    # Run matmul benchmarks
-    echo "Running matmul_expr (float16) with tensor cores..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 4096 4096 4096
+    # Only test the compiling functionality on GV100, because its profiling implementation is faster than that on RTX 4090.
+    if [[ "$device_name" == "NVIDIA GV100" ]]; then
+        # Run conv benchmark
+        echo "Running fused_conv_expr_S1D1P1 (float16)..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
+        
+        # Run matmul benchmarks
+        echo "Running matmul_expr (float16) with tensor cores..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
+    fi
     
+    # Only test the profiling functionality on RTX 4090, the rightness of the matmul_expr has been tested on GV100.
     echo "Running matmul_expr (float32)..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 4096 4096 4096
+    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
     
     echo ""
 }
 
 # Run benchmarks on all GPUs
-run_benchmarks 0 "NVIDIA H100"
 run_benchmarks 5 "NVIDIA RTX 4090"
 run_benchmarks 4 "NVIDIA GV100"
 
diff --git a/tests/short-time-OldTVM.sh b/tests/short-time-OldTVM.sh
index 04fdaa6..f0754d6 100755
--- a/tests/short-time-OldTVM.sh
+++ b/tests/short-time-OldTVM.sh
@@ -15,22 +15,25 @@ run_benchmarks() {
     
     export CUDA_VISIBLE_DEVICES=$gpu_id
     
-    # Run conv benchmark
-    echo "Running fused_conv_expr_S1D1P1 (float16)..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
-    
-    # Run matmul benchmarks
-    echo "Running matmul_expr (float16) with tensor cores..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 4096 4096 4096
+    # Only test the compiling functionality on GV100, because its profiling implementation is faster than that on RTX 4090.
+    if [[ "$device_name" == "NVIDIA GV100" ]]; then
+        # Run conv benchmark
+        echo "Running fused_conv_expr_S1D1P1 (float16)..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
+        
+        # Run matmul benchmarks
+        echo "Running matmul_expr (float16) with tensor cores..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
+    fi
     
+    # Only test the profiling functionality on RTX 4090, the rightness of the matmul_expr has been tested on GV100.
     echo "Running matmul_expr (float32)..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 4096 4096 4096
+    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
     
     echo ""
 }
 
 # Run benchmarks on all GPUs
-run_benchmarks 0 "NVIDIA H100"
 run_benchmarks 5 "NVIDIA RTX 4090"
 run_benchmarks 4 "NVIDIA GV100"
 

From c5114af96e12b5d0a037e4a3e729429fca8aa234 Mon Sep 17 00:00:00 2001
From: ConvolutedDog <yangjianchao16@nudt.edu.cn>
Date: Mon, 29 Sep 2025 20:36:35 +0800
Subject: [PATCH 2/2] [Test] Speed up tests by running each matmul variant on one GPU

---
 tests/short-time-LatestTVM.sh | 10 ++++++----
 tests/short-time-OldTVM.sh    | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/tests/short-time-LatestTVM.sh b/tests/short-time-LatestTVM.sh
index 358170e..db18673 100755
--- a/tests/short-time-LatestTVM.sh
+++ b/tests/short-time-LatestTVM.sh
@@ -22,13 +22,15 @@ run_benchmarks() {
         python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
         
         # Run matmul benchmarks
-        echo "Running matmul_expr (float16) with tensor cores..."
-        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
+        echo "Running matmul_expr (float32)..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
     fi
     
     # Only test the profiling functionality on RTX 4090, the rightness of the matmul_expr has been tested on GV100.
-    echo "Running matmul_expr (float32)..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
+    if [[ "$device_name" == "NVIDIA RTX 4090" ]]; then
+        echo "Running matmul_expr (float16) with tensor cores..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
+    fi
     
     echo ""
 }
diff --git a/tests/short-time-OldTVM.sh b/tests/short-time-OldTVM.sh
index f0754d6..23974b7 100755
--- a/tests/short-time-OldTVM.sh
+++ b/tests/short-time-OldTVM.sh
@@ -22,13 +22,15 @@ run_benchmarks() {
         python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
         
         # Run matmul benchmarks
-        echo "Running matmul_expr (float16) with tensor cores..."
-        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
+        echo "Running matmul_expr (float32)..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
     fi
     
     # Only test the profiling functionality on RTX 4090, the rightness of the matmul_expr has been tested on GV100.
-    echo "Running matmul_expr (float32)..."
-    python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
+    if [[ "$device_name" == "NVIDIA RTX 4090" ]]; then
+        echo "Running matmul_expr (float16) with tensor cores..."
+        python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
+    fi
     
     echo ""
 }