-
Notifications
You must be signed in to change notification settings - Fork 23
Description
Summary
Record a practical use case where ptoas --enable-insert-sync still has ~10% perf gap compared to a known manual-sync plan. Similar purpose as #226, but this case is a pure vector kernel, so perhaps easier to analyze.
Background
The chosen kernel is the NPU version of fast-hadamard-transform, used widely in 4-bit quantization, in FlashAttention-3, and also in DeepSeek v3.2 lightning indexer
See full PTO python code in fast_hadamard, and performance measurement in huawei-csl/pto-dsl#62. Auto-sync version generates 10% slower kernel than my manual plan.
Command line
ptoas --enable-insert-sync ./hadamard_auto_sync.pto -o ./hadamard_auto_sync.cpp
ptoas ./hadamard_manual_sync.pto -o ./hadamard_manual_sync.cpp # compare to manual sync
Reproduction input
The hadamard_auto_sync.pto file:
Details
module {
// Fast Hadamard transform, auto-sync variant: contains NO explicit sync ops;
// all pipe synchronization is left for `ptoas --enable-insert-sync` to insert.
// %arg0: base pointer to the f16 data in global memory.
// %arg1: number of rows (%0); %arg2: row length in elements (%1, used as the
//        tile valid_col); %arg3: number of butterfly passes per row (%2).
// NOTE(review): argument meanings inferred from their uses below — confirm
// against the PTO python source referenced in the issue.
func.func @fast_hadamard_autosync(%arg0: !pto.ptr<f16>, %arg1: i32, %arg2: i32, %arg3: i32) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%0 = arith.index_cast %arg1 : i32 to index
%1 = arith.index_cast %arg2 : i32 to index
%2 = arith.index_cast %arg3 : i32 to index
// Flatten (block, subblock) into one linear core id %9; %11 is the total
// number of cores = block_num * subblock_num.
%3 = pto.get_block_idx
%4 = pto.get_subblock_idx
%5 = pto.get_subblock_num
%6 = pto.get_block_num
%7 = arith.muli %3, %5 : i64
%8 = arith.addi %7, %4 : i64
%9 = arith.index_cast %8 : i64 to index
%10 = arith.muli %6, %5 : i64
%11 = arith.index_cast %10 : i64 to index
pto.section.vector {
// Static row partitioning: each core takes %12 = ceildiv(rows, cores)
// consecutive rows starting at %13; %18 clamps the tail core's share.
%12 = arith.ceildivsi %0, %11 : index
%13 = arith.muli %9, %12 : index
%14 = arith.cmpi slt, %13, %0 : index
scf.if %14 {
%15 = arith.addi %13, %12 : index
%16 = arith.cmpi sgt, %15, %0 : index
%17 = arith.subi %0, %13 : index
%18 = arith.select %16, %17, %12 : index
%19 = arith.cmpi sgt, %18, %c0 : index
scf.if %19 {
// 1-D view over the whole rows*cols buffer.
%20 = arith.muli %0, %1 : index
%21 = pto.make_tensor_view %arg0, shape = [%20], strides = [%c1] : !pto.tensor_view<?xf16>
// Double-buffered working set: %22 (full row) + scratch halves %24/%26
// for even iterations, and %27 + %29/%31 for odd iterations.
%22 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%23 = arith.divsi %1, %c2 : index
%24 = pto.alloc_tile valid_col = %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%25 = arith.divsi %1, %c2 : index
%26 = pto.alloc_tile valid_col = %25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%27 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%28 = arith.divsi %1, %c2 : index
%29 = pto.alloc_tile valid_col = %28 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%30 = arith.divsi %1, %c2 : index
%31 = pto.alloc_tile valid_col = %30 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
// %32 = half row length; used as the column offset splitting the row
// tile into low/high halves below.
%32 = arith.divsi %1, %c2 : index
// Outer loop: one row per iteration (step size 1, so %37 is always 1
// except on an empty tail).
%33 = arith.ceildivsi %18, %c1 : index
scf.for %arg4 = %c0 to %33 step %c1 {
%34 = arith.muli %arg4, %c1 : index
%35 = arith.subi %18, %34 : index
%36 = arith.cmpi slt, %35, %c1 : index
%37 = arith.select %36, %35, %c1 : index
%38 = arith.cmpi sgt, %37, %c0 : index
scf.if %38 {
%39 = arith.addi %13, %34 : index
%40 = arith.muli %39, %1 : index
// Ping-pong selection: even outer iterations use tiles %22/%24/%26,
// odd ones use %27/%29/%31, so the auto-sync pass can overlap the
// two buffers' load/compute/store pipelines.
%41 = arith.remsi %arg4, %c2 : index
%42 = arith.cmpi eq, %41, %c0 : index
scf.if %42 {
scf.for %arg5 = %c0 to %37 step %c1 {
%43 = arith.muli %arg5, %1 : index
%44 = arith.addi %40, %43 : index
// View of this row in global memory; %46/%47 are the low/high
// halves of the row tile (destination of the butterfly outputs).
%45 = pto.partition_view %21, offsets = [%44], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
%46 = pto.subset %22[%c0, %c0] sizes [1, 8192] : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%47 = pto.subset %22[%c0, %32] sizes [1, 8192] : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tload ins(%45 : !pto.partition_tensor_view<1x16384xf16>) outs(%22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// Butterfly pass repeated %2 times: gather the two interleaved
// element streams (mask P0101 / P1010), then sum -> low half and
// difference -> high half, in place in the row tile.
scf.for %arg6 = %c0 to %2 step %c1 {
pto.tgather ins(%22, {maskPattern = #pto.mask_pattern<P0101>} : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tgather ins(%22, {maskPattern = #pto.mask_pattern<P1010>} : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tadd ins(%24, %26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%46 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tsub ins(%24, %26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%47 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
}
// Write the transformed row back to the same global location.
pto.tstore ins(%22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%45 : !pto.partition_tensor_view<1x16384xf16>)
}
} else {
// Odd-iteration copy of the body above, on the second buffer set.
scf.for %arg5 = %c0 to %37 step %c1 {
%43 = arith.muli %arg5, %1 : index
%44 = arith.addi %40, %43 : index
%45 = pto.partition_view %21, offsets = [%44], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
%46 = pto.subset %27[%c0, %c0] sizes [1, 8192] : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%47 = pto.subset %27[%c0, %32] sizes [1, 8192] : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.tload ins(%45 : !pto.partition_tensor_view<1x16384xf16>) outs(%27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
scf.for %arg6 = %c0 to %2 step %c1 {
pto.tgather ins(%27, {maskPattern = #pto.mask_pattern<P0101>} : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%29 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tgather ins(%27, {maskPattern = #pto.mask_pattern<P1010>} : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%31 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tadd ins(%29, %31 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%46 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tsub ins(%29, %31 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%47 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
}
pto.tstore ins(%27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%45 : !pto.partition_tensor_view<1x16384xf16>)
}
}
}
}
}
}
}
return
}
}
My manual hadamard_manual_sync.pto
Details
module {
// Fast Hadamard transform, manual-sync variant: identical compute structure to
// the auto-sync version, but with hand-placed record_event / wait_event /
// barrier_sync ops. EVENT_ID0 guards the first (even-iteration) buffer set,
// EVENT_ID1 the second (odd-iteration) set, giving cross-iteration
// double-buffered pipelining.
// %arg0: base pointer to the f16 data; %arg1: number of rows; %arg2: row
// length in elements; %arg3: number of butterfly passes per row.
// NOTE(review): argument meanings inferred from use — confirm with the issue's
// PTO python source.
func.func @fast_hadamard_manualsync(%arg0: !pto.ptr<f16>, %arg1: i32, %arg2: i32, %arg3: i32) {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c2 = arith.constant 2 : index
%0 = arith.index_cast %arg1 : i32 to index
%1 = arith.index_cast %arg2 : i32 to index
%2 = arith.index_cast %arg3 : i32 to index
// Linear core id %9 out of %11 = block_num * subblock_num cores.
%3 = pto.get_block_idx
%4 = pto.get_subblock_idx
%5 = pto.get_subblock_num
%6 = pto.get_block_num
%7 = arith.muli %3, %5 : i64
%8 = arith.addi %7, %4 : i64
%9 = arith.index_cast %8 : i64 to index
%10 = arith.muli %6, %5 : i64
%11 = arith.index_cast %10 : i64 to index
pto.section.vector {
// Per-core contiguous row range [%13, %13+%18), tail-clamped.
%12 = arith.ceildivsi %0, %11 : index
%13 = arith.muli %9, %12 : index
%14 = arith.cmpi slt, %13, %0 : index
scf.if %14 {
%15 = arith.addi %13, %12 : index
%16 = arith.cmpi sgt, %15, %0 : index
%17 = arith.subi %0, %13 : index
%18 = arith.select %16, %17, %12 : index
%19 = arith.cmpi sgt, %18, %c0 : index
scf.if %19 {
%20 = arith.muli %0, %1 : index
%21 = pto.make_tensor_view %arg0, shape = [%20], strides = [%c1] : !pto.tensor_view<?xf16>
// Buffer set A (even iterations): row tile %22, gather scratch %24/%26.
%22 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%23 = arith.divsi %1, %c2 : index
%24 = pto.alloc_tile valid_col = %23 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%25 = arith.divsi %1, %c2 : index
%26 = pto.alloc_tile valid_col = %25 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
// Buffer set B (odd iterations): row tile %27, gather scratch %29/%31.
%27 = pto.alloc_tile valid_col = %1 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%28 = arith.divsi %1, %c2 : index
%29 = pto.alloc_tile valid_col = %28 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%30 = arith.divsi %1, %c2 : index
%31 = pto.alloc_tile valid_col = %30 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%32 = arith.divsi %1, %c2 : index
%33 = arith.ceildivsi %18, %c1 : index
// Prime both buffers' events before the loop so the first wait on each
// event id does not block (the loop body waits before every tload).
pto.record_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TLOAD>, <EVENT_ID0>]
pto.record_event[#pto.sync_op_type<TSTORE_VEC>, #pto.sync_op_type<TVEC>, <EVENT_ID0>]
pto.record_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TLOAD>, <EVENT_ID1>]
pto.record_event[#pto.sync_op_type<TSTORE_VEC>, #pto.sync_op_type<TVEC>, <EVENT_ID1>]
scf.for %arg4 = %c0 to %33 step %c1 {
%34 = arith.muli %arg4, %c1 : index
%35 = arith.subi %18, %34 : index
%36 = arith.cmpi slt, %35, %c1 : index
%37 = arith.select %36, %35, %c1 : index
%38 = arith.cmpi sgt, %37, %c0 : index
scf.if %38 {
%39 = arith.addi %13, %34 : index
%40 = arith.muli %39, %1 : index
// Even/odd dispatch between buffer sets A and B.
%41 = arith.remsi %arg4, %c2 : index
%42 = arith.cmpi eq, %41, %c0 : index
scf.if %42 {
scf.for %arg5 = %c0 to %37 step %c1 {
%43 = arith.muli %arg5, %1 : index
%44 = arith.addi %40, %43 : index
%45 = pto.partition_view %21, offsets = [%44], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
%46 = pto.subset %22[%c0, %c0] sizes [1, 8192] : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%47 = pto.subset %22[%c0, %32] sizes [1, 8192] : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
// Wait until buffer A's previous compute and store have consumed
// it before overwriting it with the next row.
pto.wait_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TLOAD>, <EVENT_ID0>]
pto.wait_event[#pto.sync_op_type<TSTORE_VEC>, #pto.sync_op_type<TVEC>, <EVENT_ID0>]
pto.tload ins(%45 : !pto.partition_tensor_view<1x16384xf16>) outs(%22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// Load-to-vector handoff for buffer A.
pto.record_event[#pto.sync_op_type<TLOAD>, #pto.sync_op_type<TVEC>, <EVENT_ID0>]
pto.wait_event[#pto.sync_op_type<TLOAD>, #pto.sync_op_type<TVEC>, <EVENT_ID0>]
scf.for %arg6 = %c0 to %2 step %c1 {
pto.tgather ins(%22, {maskPattern = #pto.mask_pattern<P0101>} : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%24 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tgather ins(%22, {maskPattern = #pto.mask_pattern<P1010>} : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
// Vector-pipe barrier: gathers must finish before add/sub read
// the scratch tiles; second barrier orders add/sub against the
// gathers of the next pass (in-place update of %22).
pto.barrier_sync[<TVEC>]
pto.tadd ins(%24, %26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%46 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tsub ins(%24, %26 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%47 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.barrier_sync[<TVEC>]
}
// Vector-to-store handoff, then mark buffer A free for the next
// even iteration's load and compute.
pto.record_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TSTORE_VEC>, <EVENT_ID0>]
pto.wait_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TSTORE_VEC>, <EVENT_ID0>]
pto.tstore ins(%22 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%45 : !pto.partition_tensor_view<1x16384xf16>)
pto.record_event[#pto.sync_op_type<TSTORE_VEC>, #pto.sync_op_type<TVEC>, <EVENT_ID0>]
pto.record_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TLOAD>, <EVENT_ID0>]
}
} else {
// Mirror of the even branch on buffer set B, using EVENT_ID1.
scf.for %arg5 = %c0 to %37 step %c1 {
%43 = arith.muli %arg5, %1 : index
%44 = arith.addi %40, %43 : index
%45 = pto.partition_view %21, offsets = [%44], sizes = [%1] : !pto.tensor_view<?xf16> -> !pto.partition_tensor_view<1x16384xf16>
%46 = pto.subset %27[%c0, %c0] sizes [1, 8192] : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
%47 = pto.subset %27[%c0, %32] sizes [1, 8192] : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>
pto.wait_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TLOAD>, <EVENT_ID1>]
pto.wait_event[#pto.sync_op_type<TSTORE_VEC>, #pto.sync_op_type<TVEC>, <EVENT_ID1>]
pto.tload ins(%45 : !pto.partition_tensor_view<1x16384xf16>) outs(%27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.record_event[#pto.sync_op_type<TLOAD>, #pto.sync_op_type<TVEC>, <EVENT_ID1>]
pto.wait_event[#pto.sync_op_type<TLOAD>, #pto.sync_op_type<TVEC>, <EVENT_ID1>]
scf.for %arg6 = %c0 to %2 step %c1 {
pto.tgather ins(%27, {maskPattern = #pto.mask_pattern<P0101>} : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%29 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tgather ins(%27, {maskPattern = #pto.mask_pattern<P1010>} : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%31 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.barrier_sync[<TVEC>]
pto.tadd ins(%29, %31 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%46 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.tsub ins(%29, %31 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>, !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%47 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=8192, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>)
pto.barrier_sync[<TVEC>]
}
pto.record_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TSTORE_VEC>, <EVENT_ID1>]
pto.wait_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TSTORE_VEC>, <EVENT_ID1>]
pto.tstore ins(%27 : !pto.tile_buf<loc=vec, dtype=f16, rows=1, cols=16384, v_row=1, v_col=?, blayout=row_major, slayout=none_box, fractal=512, pad=0>) outs(%45 : !pto.partition_tensor_view<1x16384xf16>)
pto.record_event[#pto.sync_op_type<TSTORE_VEC>, #pto.sync_op_type<TVEC>, <EVENT_ID1>]
pto.record_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TLOAD>, <EVENT_ID1>]
}
}
}
}
// Drain: consume the final record_events of the last even/odd iterations
// (or the priming records if a branch never ran) so no event is left set.
pto.wait_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TLOAD>, <EVENT_ID0>]
pto.wait_event[#pto.sync_op_type<TSTORE_VEC>, #pto.sync_op_type<TVEC>, <EVENT_ID0>]
pto.wait_event[#pto.sync_op_type<TVEC>, #pto.sync_op_type<TLOAD>, <EVENT_ID1>]
pto.wait_event[#pto.sync_op_type<TSTORE_VEC>, #pto.sync_op_type<TVEC>, <EVENT_ID1>]
}
}
}
return
}
}
The generated hadamard_auto_sync.cpp:
Details
#include "pto/pto-inst.hpp"
using namespace pto;
// Generated kernel (ptoas --enable-insert-sync output) for the auto-sync
// Hadamard transform. v1 = f16 base pointer, v2 = rows, v3 = row length,
// v4 = butterfly pass count (meanings follow the .pto arguments).
// NOTE(review): this is compiler output quoted for the issue; the comments
// below annotate the inserted synchronization relevant to the ~10% perf gap.
__global__ AICORE void fast_hadamard_autosync(__gm__ half* v1, int32_t v2, int32_t v3, int32_t v4) {
// Tile geometry / loop constants, and UB byte offsets (v12..v17) for the
// six statically placed tiles.
unsigned v5 = 16384;
unsigned v6 = 1;
unsigned v7 = 0;
int32_t v8 = 2;
int32_t v9 = 1;
int32_t v10 = 0;
int32_t v11 = 8192;
int64_t v12 = 0;
int64_t v13 = 32768;
int64_t v14 = 49152;
int64_t v15 = 65536;
int64_t v16 = 98304;
int64_t v17 = 114688;
using T = float;
size_t v18 = (size_t) v10;
size_t v19 = (size_t) v9;
size_t v20 = (size_t) v4;
// Linear core id components; v26 = total core count.
int64_t v21 = get_block_idx();
int64_t v22 = get_subblockid();
int64_t v23 = get_subblockdim();
int64_t v24 = (int64_t) v23;
int64_t v25 = get_block_num();
int32_t v26 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v25) * (uint64_t) v24);
#if defined(__DAV_VEC__)
set_mask_norm();
set_vector_mask(-1, -1);
// v28 = ceildiv(rows, cores); v29 = this core's starting row.
int32_t v27 = v2 / v26;
int32_t v28 = v2 % v26 != v10 && v2 < v10 == v26 < v10 ? v27 + v9 : v27;
int32_t v29 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v21) * (uint64_t) v24) + (uint64_t) ((int64_t) v22))) * (uint32_t) v28);
// Priming set_flags so the first wait_flag on each id does not block.
// NOTE(review): five MTE3->MTE2 event ids are primed but only ID0/ID1/ID3
// are ever waited inside the loop; ID2 and ID4 are only consumed by the
// trailing wait_flags at the end — looks like avoidable sync overhead.
set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2);
set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3);
set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4);
if (v29 < v2) {
// v30 = number of rows this core actually owns (tail-clamped).
int32_t v30 = (int32_t) ((uint32_t) v29 + (uint32_t) v28) > v2 ? (int32_t) ((uint32_t) v2 - (uint32_t) v29) : v28;
if (v30 > v10) {
int32_t v31 = (int32_t) ((uint32_t) v2 * (uint32_t) v3);
// Buffer set A: full-row tile v32 at UB offset 0, scratch halves v34/v35.
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v32 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v3);
TASSIGN(v32, v12);
int32_t v33 = v3 / v8;
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v34 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v33);
TASSIGN(v34, v13);
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v35 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v33);
TASSIGN(v35, v14);
// Buffer set B: full-row tile v36, scratch halves v37/v38 (odd iterations).
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v36 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v3);
TASSIGN(v36, v15);
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v37 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v33);
TASSIGN(v37, v16);
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v38 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v33);
TASSIGN(v38, v17);
// Outer loop: one row per iteration, ping-ponging between buffer sets.
for (size_t v39 = v18; v39 < ((size_t) v30); v39 += v19) {
int32_t v40 = (int32_t) v39;
int32_t v41 = (int32_t) ((uint32_t) v30 - (uint32_t) v40);
int32_t v42 = v41 < v9 ? v41 : v9;
size_t v43 = (size_t) v42;
if (v42 > v10) {
int32_t v44 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v29 + (uint32_t) v40) * (uint32_t) v3);
// NOTE(review): this EVENT_ID0 wait sits on BOTH branches (it is
// re-set unconditionally at the bottom of the iteration), serializing
// every iteration against the previous one — a candidate source of
// the gap vs. the manual plan, which keeps the two buffers' event
// chains fully independent.
wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
if (v40 % v8 == v10) {
// Even iteration: buffer set A (v32/v34/v35), events ID0/ID1.
for (size_t v45 = v18; v45 < v43; v45 += v19) {
unsigned v46 = (unsigned) v3 * v6;
pto::Shape<1, 1, 1, 1, -1> v47 = pto::Shape<1, 1, 1, 1, -1>(v3);
pto::Stride<-1, -1, -1, -1, 1> v48 = pto::Stride<-1, -1, -1, -1, 1>(v46, v46, v46, v46);
// Global-memory view of this row.
GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v49 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v7 + (unsigned) ((int32_t) (uint32_t) v44 + (uint32_t) ((int32_t) (uint32_t) ((int32_t) v45) * (uint32_t) v3)) * (unsigned) v9), v47, v48);
// v53/v57 alias the low/high halves of row tile v32.
__ubuf__ half* v50 = v32.data();
int64_t v51 = (int64_t) v3;
int32_t v52 = (int32_t) ((int64_t) (uint64_t) v51 - (uint64_t) ((int64_t) (uint64_t) v12 % (uint64_t) v51));
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v53 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v52 < v11 ? v52 : v11);
uint64_t v54 = reinterpret_cast<uint64_t>((__ubuf__ half*) (v50 + (v7 + v7 * v5) + v7 * v6));
TASSIGN(v53, v54);
__ubuf__ half* v55 = v32.data();
int32_t v56 = (int32_t) ((int64_t) (uint64_t) v51 - (uint64_t) ((int64_t) (uint64_t) ((int64_t) v33) % (uint64_t) v51));
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v57 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v56 < v11 ? v56 : v11);
uint64_t v58 = reinterpret_cast<uint64_t>((__ubuf__ half*) (v55 + (v7 + v7 * v5) + (unsigned) v33 * v6));
TASSIGN(v57, v58);
// Wait for the previous store from v32, then load the new row.
wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
TLOAD(v32, v49);
// Load -> vector handoff.
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
for (size_t v59 = v18; v59 < v20; v59 += v19) {
// NOTE(review): barrier at loop top, plus two more below —
// one more PIPE_V barrier per pass than the manual plan uses.
pipe_barrier(PIPE_V);
TGATHER<Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, MaskPattern::P0101>(v34, v32);
TGATHER<Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, MaskPattern::P1010>(v35, v32);
pipe_barrier(PIPE_V);
TADD(v53, v34, v35);
pipe_barrier(PIPE_V);
TSUB(v57, v34, v35);
};
// Vector -> store handoff, then store the transformed row back.
set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
pipe_barrier(PIPE_MTE3);
TSTORE(v49, v32);
// Mark buffer A reusable for the next even iteration's load.
set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
};
} else {
// Odd iteration: mirror on buffer set B (v36/v37/v38), events ID1/ID3.
for (size_t v60 = v18; v60 < v43; v60 += v19) {
unsigned v61 = (unsigned) v3 * v6;
pto::Shape<1, 1, 1, 1, -1> v62 = pto::Shape<1, 1, 1, 1, -1>(v3);
pto::Stride<-1, -1, -1, -1, 1> v63 = pto::Stride<-1, -1, -1, -1, 1>(v61, v61, v61, v61);
GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v64 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v7 + (unsigned) ((int32_t) (uint32_t) v44 + (uint32_t) ((int32_t) (uint32_t) ((int32_t) v60) * (uint32_t) v3)) * (unsigned) v9), v62, v63);
__ubuf__ half* v65 = v36.data();
int64_t v66 = (int64_t) v3;
int32_t v67 = (int32_t) ((int64_t) (uint64_t) v66 - (uint64_t) ((int64_t) (uint64_t) v12 % (uint64_t) v66));
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v68 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v67 < v11 ? v67 : v11);
uint64_t v69 = reinterpret_cast<uint64_t>((__ubuf__ half*) (v65 + (v7 + v7 * v5) + v7 * v6));
TASSIGN(v68, v69);
__ubuf__ half* v70 = v36.data();
int32_t v71 = (int32_t) ((int64_t) (uint64_t) v66 - (uint64_t) ((int64_t) (uint64_t) ((int64_t) v33) % (uint64_t) v66));
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v72 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v71 < v11 ? v71 : v11);
uint64_t v73 = reinterpret_cast<uint64_t>((__ubuf__ half*) (v70 + (v7 + v7 * v5) + (unsigned) v33 * v6));
TASSIGN(v72, v73);
wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3);
TLOAD(v36, v64);
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
for (size_t v74 = v18; v74 < v20; v74 += v19) {
pipe_barrier(PIPE_V);
TGATHER<Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, MaskPattern::P0101>(v37, v36);
TGATHER<Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, MaskPattern::P1010>(v38, v36);
pipe_barrier(PIPE_V);
TADD(v68, v37, v38);
pipe_barrier(PIPE_V);
TSUB(v72, v37, v38);
};
set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
pipe_barrier(PIPE_MTE3);
TSTORE(v64, v36);
set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3);
};
};
// Re-arm the shared per-iteration event (see NOTE above).
set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
};
};
};
}
// Final drain. NOTE(review): a full PIPE_ALL barrier followed by five
// wait_flags — the waits appear redundant after pipe_barrier(PIPE_ALL),
// though they are needed to consume the still-set flags.
pipe_barrier(PIPE_ALL);
wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID1);
wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID2);
wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID3);
wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID4);
#endif // __DAV_VEC__
return;
}
}
The generated hadamard_manual_sync.cpp:
Details
#include "pto/pto-inst.hpp"
using namespace pto;
__global__ AICORE void fast_hadamard_manualsync(__gm__ half* v1, int32_t v2, int32_t v3, int32_t v4) {
unsigned v5 = 16384;
unsigned v6 = 1;
unsigned v7 = 0;
int32_t v8 = 2;
int32_t v9 = 1;
int32_t v10 = 0;
int32_t v11 = 8192;
int64_t v12 = 0;
int64_t v13 = 32768;
int64_t v14 = 49152;
int64_t v15 = 65536;
int64_t v16 = 98304;
int64_t v17 = 114688;
using T = float;
size_t v18 = (size_t) v10;
size_t v19 = (size_t) v9;
size_t v20 = (size_t) v4;
int64_t v21 = get_block_idx();
int64_t v22 = get_subblockid();
int64_t v23 = get_subblockdim();
int64_t v24 = (int64_t) v23;
int64_t v25 = get_block_num();
int32_t v26 = (int32_t) ((int64_t) (uint64_t) ((int64_t) v25) * (uint64_t) v24);
#if defined(__DAV_VEC__)
set_mask_norm();
set_vector_mask(-1, -1);
int32_t v27 = v2 / v26;
int32_t v28 = v2 % v26 != v10 && v2 < v10 == v26 < v10 ? v27 + v9 : v27;
int32_t v29 = (int32_t) ((uint32_t) ((int32_t) (int64_t) ((uint64_t) ((int64_t) (uint64_t) ((int64_t) v21) * (uint64_t) v24) + (uint64_t) ((int64_t) v22))) * (uint32_t) v28);
if (v29 < v2) {
int32_t v30 = (int32_t) ((uint32_t) v29 + (uint32_t) v28) > v2 ? (int32_t) ((uint32_t) v2 - (uint32_t) v29) : v28;
if (v30 > v10) {
int32_t v31 = (int32_t) ((uint32_t) v2 * (uint32_t) v3);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v32 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v3);
TASSIGN(v32, v12);
int32_t v33 = v3 / v8;
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v34 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v33);
TASSIGN(v34, v13);
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v35 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v33);
TASSIGN(v35, v14);
Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v36 = Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v3);
TASSIGN(v36, v15);
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v37 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v33);
TASSIGN(v37, v16);
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v38 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v33);
TASSIGN(v38, v17);
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
for (size_t v39 = v18; v39 < ((size_t) v30); v39 += v19) {
int32_t v40 = (int32_t) v39;
int32_t v41 = (int32_t) ((uint32_t) v30 - (uint32_t) v40);
int32_t v42 = v41 < v9 ? v41 : v9;
size_t v43 = (size_t) v42;
if (v42 > v10) {
int32_t v44 = (int32_t) ((uint32_t) ((int32_t) (uint32_t) v29 + (uint32_t) v40) * (uint32_t) v3);
if (v40 % v8 == v10) {
for (size_t v45 = v18; v45 < v43; v45 += v19) {
unsigned v46 = (unsigned) v3 * v6;
pto::Shape<1, 1, 1, 1, -1> v47 = pto::Shape<1, 1, 1, 1, -1>(v3);
pto::Stride<-1, -1, -1, -1, 1> v48 = pto::Stride<-1, -1, -1, -1, 1>(v46, v46, v46, v46);
GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v49 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v7 + (unsigned) ((int32_t) (uint32_t) v44 + (uint32_t) ((int32_t) (uint32_t) ((int32_t) v45) * (uint32_t) v3)) * (unsigned) v9), v47, v48);
__ubuf__ half* v50 = v32.data();
int64_t v51 = (int64_t) v3;
int32_t v52 = (int32_t) ((int64_t) (uint64_t) v51 - (uint64_t) ((int64_t) (uint64_t) v12 % (uint64_t) v51));
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v53 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v52 < v11 ? v52 : v11);
uint64_t v54 = reinterpret_cast<uint64_t>((__ubuf__ half*) (v50 + (v7 + v7 * v5) + v7 * v6));
TASSIGN(v53, v54);
__ubuf__ half* v55 = v32.data();
int32_t v56 = (int32_t) ((int64_t) (uint64_t) v51 - (uint64_t) ((int64_t) (uint64_t) ((int64_t) v33) % (uint64_t) v51));
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v57 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v56 < v11 ? v56 : v11);
uint64_t v58 = reinterpret_cast<uint64_t>((__ubuf__ half*) (v55 + (v7 + v7 * v5) + (unsigned) v33 * v6));
TASSIGN(v57, v58);
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
TLOAD(v32, v49);
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
for (size_t v59 = v18; v59 < v20; v59 += v19) {
TGATHER<Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, MaskPattern::P0101>(v34, v32);
TGATHER<Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, MaskPattern::P1010>(v35, v32);
pipe_barrier(PIPE_V);
TADD(v53, v34, v35);
TSUB(v57, v34, v35);
pipe_barrier(PIPE_V);
};
set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
TSTORE(v49, v32);
set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
};
} else {
for (size_t v60 = v18; v60 < v43; v60 += v19) {
unsigned v61 = (unsigned) v3 * v6;
pto::Shape<1, 1, 1, 1, -1> v62 = pto::Shape<1, 1, 1, 1, -1>(v3);
pto::Stride<-1, -1, -1, -1, 1> v63 = pto::Stride<-1, -1, -1, -1, 1>(v61, v61, v61, v61);
GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND> v64 = GlobalTensor<half, pto::Shape<1, 1, 1, 1, -1>, pto::Stride<-1, -1, -1, -1, 1>, pto::Layout::ND>(v1 + (v7 + (unsigned) ((int32_t) (uint32_t) v44 + (uint32_t) ((int32_t) (uint32_t) ((int32_t) v60) * (uint32_t) v3)) * (unsigned) v9), v62, v63);
__ubuf__ half* v65 = v36.data();
int64_t v66 = (int64_t) v3;
int32_t v67 = (int32_t) ((int64_t) (uint64_t) v66 - (uint64_t) ((int64_t) (uint64_t) v12 % (uint64_t) v66));
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v68 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v67 < v11 ? v67 : v11);
uint64_t v69 = reinterpret_cast<uint64_t>((__ubuf__ half*) (v65 + (v7 + v7 * v5) + v7 * v6));
TASSIGN(v68, v69);
__ubuf__ half* v70 = v36.data();
int32_t v71 = (int32_t) ((int64_t) (uint64_t) v66 - (uint64_t) ((int64_t) (uint64_t) ((int64_t) v33) % (uint64_t) v66));
Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null> v72 = Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>(v71 < v11 ? v71 : v11);
uint64_t v73 = reinterpret_cast<uint64_t>((__ubuf__ half*) (v70 + (v7 + v7 * v5) + (unsigned) v33 * v6));
TASSIGN(v72, v73);
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
TLOAD(v36, v64);
set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
for (size_t v74 = v18; v74 < v20; v74 += v19) {
TGATHER<Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, MaskPattern::P0101>(v37, v36);
TGATHER<Tile<TileType::Vec, half, 1, 8192, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, Tile<TileType::Vec, half, 1, 16384, BLayout::RowMajor, 1, -1, SLayout::NoneBox, 512, PadValue::Null>, MaskPattern::P1010>(v38, v36);
pipe_barrier(PIPE_V);
TADD(v68, v37, v38);
TSUB(v72, v37, v38);
pipe_barrier(PIPE_V);
};
set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
TSTORE(v64, v36);
set_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
set_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
};
};
};
};
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID0);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
wait_flag(PIPE_V, PIPE_MTE2, EVENT_ID1);
wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID1);
};
}
#endif // __DAV_VEC__
return;
}
One obvious finding is that the manual plan uses only EVENT_ID0 and EVENT_ID1 (sufficient for double-buffering), while the auto-sync version also emits EVENT_ID2 and EVENT_ID3.
Expected performance
Ideally, the auto-sync version should be as fast as the manual-sync version (or perhaps even discover a faster pipelining scheme).
Actual performance
The auto-sync version is ~10% slower than a known manual-sync double-buffer plan; see the detailed PR (which contains the kernel launch code and on-device performance measurements on 910B2): huawei-csl/pto-dsl#62
Git commit
Possibly related (may help with analysis): #196