Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions tests/short-time-LatestTVM.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,27 @@ run_benchmarks() {

export CUDA_VISIBLE_DEVICES=$gpu_id

# Run conv benchmark
echo "Running fused_conv_expr_S1D1P1 (float16)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
# Only test the compilation functionality on the GV100, because its profiling implementation is faster than that of the RTX 4090.
if [[ "$device_name" == "NVIDIA GV100" ]]; then
# Run conv benchmark
echo "Running fused_conv_expr_S1D1P1 (float16)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3

# Run matmul benchmarks
echo "Running matmul_expr (float32)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
fi

# Run matmul benchmarks
echo "Running matmul_expr (float16) with tensor cores..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 4096 4096 4096

echo "Running matmul_expr (float32)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 4096 4096 4096
# Only test the profiling functionality on the RTX 4090; the correctness of matmul_expr has already been verified on the GV100.
if [[ "$device_name" == "NVIDIA RTX 4090" ]]; then
echo "Running matmul_expr (float16) with tensor cores..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
fi

echo ""
}

# Run benchmarks on all GPUs
run_benchmarks 0 "NVIDIA H100"
run_benchmarks 5 "NVIDIA RTX 4090"
run_benchmarks 4 "NVIDIA GV100"

Expand Down
25 changes: 15 additions & 10 deletions tests/short-time-OldTVM.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,27 @@ run_benchmarks() {

export CUDA_VISIBLE_DEVICES=$gpu_id

# Run conv benchmark
echo "Running fused_conv_expr_S1D1P1 (float16)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
# Only test the compilation functionality on the GV100, because its profiling implementation is faster than that of the RTX 4090.
if [[ "$device_name" == "NVIDIA GV100" ]]; then
# Run conv benchmark
echo "Running fused_conv_expr_S1D1P1 (float16)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3

# Run matmul benchmarks
echo "Running matmul_expr (float32)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
fi

# Run matmul benchmarks
echo "Running matmul_expr (float16) with tensor cores..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 4096 4096 4096

echo "Running matmul_expr (float32)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 4096 4096 4096
# Only test the profiling functionality on the RTX 4090; the correctness of matmul_expr has already been verified on the GV100.
if [[ "$device_name" == "NVIDIA RTX 4090" ]]; then
echo "Running matmul_expr (float16) with tensor cores..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
fi

echo ""
}

# Run benchmarks on all GPUs
run_benchmarks 0 "NVIDIA H100"
run_benchmarks 5 "NVIDIA RTX 4090"
run_benchmarks 4 "NVIDIA GV100"

Expand Down