Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions tests/short-time-LatestTVM.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,27 @@ run_benchmarks() {

export CUDA_VISIBLE_DEVICES=$gpu_id

# Run conv benchmark
echo "Running fused_conv_expr_S1D1P1 (float16)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
# Only test the compilation functionality on the GV100, because its profiling implementation is faster than that of the RTX 4090.
if [[ "$device_name" == "NVIDIA GV100" ]]; then
# Run conv benchmark
echo "Running fused_conv_expr_S1D1P1 (float16)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3

# Run matmul benchmarks
echo "Running matmul_expr (float32)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
fi

# Run matmul benchmarks
echo "Running matmul_expr (float16) with tensor cores..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 4096 4096 4096

echo "Running matmul_expr (float32)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 4096 4096 4096
# Only test the profiling functionality on the RTX 4090; the correctness of matmul_expr has already been verified on the GV100.
if [[ "$device_name" == "NVIDIA RTX 4090" ]]; then
echo "Running matmul_expr (float16) with tensor cores..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
fi

echo ""
}

# Run benchmarks on all GPUs
run_benchmarks 0 "NVIDIA H100"
run_benchmarks 5 "NVIDIA RTX 4090"
run_benchmarks 4 "NVIDIA GV100"

Expand Down
25 changes: 15 additions & 10 deletions tests/short-time-OldTVM.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,22 +15,27 @@ run_benchmarks() {

export CUDA_VISIBLE_DEVICES=$gpu_id

# Run conv benchmark
echo "Running fused_conv_expr_S1D1P1 (float16)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3
# Only test the compilation functionality on the GV100, because its profiling implementation is faster than that of the RTX 4090.
if [[ "$device_name" == "NVIDIA GV100" ]]; then
# Run conv benchmark
echo "Running fused_conv_expr_S1D1P1 (float16)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/conv --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float16" --op fused_conv_expr_S1D1P1 --shape 128 128 28 28 128 3 3

# Run matmul benchmarks
echo "Running matmul_expr (float32)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 1024 1024 1024
fi

# Run matmul benchmarks
echo "Running matmul_expr (float16) with tensor cores..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 4096 4096 4096

echo "Running matmul_expr (float32)..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --data_type "float32" --op matmul_expr --shape 4096 4096 4096
# Only test the profiling functionality on the RTX 4090; the correctness of matmul_expr has already been verified on the GV100.
if [[ "$device_name" == "NVIDIA RTX 4090" ]]; then
echo "Running matmul_expr (float16) with tensor cores..."
python -u test_op.py --gen_check_code --backend tvm --topk 1 --code_dir generated_source/matmul --smem_tiling --reg_tiling --codegen_input_reg_tiling --shared_fetch_vectorize --use_tc --data_type "float16" --op matmul_expr --shape 1024 1024 1024
fi

echo ""
}

# Run benchmarks on all GPUs
run_benchmarks 0 "NVIDIA H100"
run_benchmarks 5 "NVIDIA RTX 4090"
run_benchmarks 4 "NVIDIA GV100"

Expand Down