@@ -89,7 +89,7 @@ tile_load(tile_t& tile, payload_t& payload) {
 
   static constexpr uint32_t num_block_x = tile_desc::num_block_x;
   static constexpr uint32_t num_block_y = tile_desc::num_block_y;
-  static constexpr uint32_t num_block = tile_desc::num_block;
+  // static constexpr uint32_t num_block = tile_desc::num_block;
 
   static constexpr gpu_arch arch_tag = payload_t::arch_tag;
 
@@ -176,24 +176,24 @@ tile_load(tile_t& tile, payload_t& payload) {
           ((block_size_y * sizeof(dtype)) % sizeof(load_dtype) == 0),
       "check vnni limitation for DW transpose");
 
-  auto payload_2d = payload.payloads.xetla_format<uint32_t, num_block, 16>();
+  // auto payload_2d = payload.payloads.xetla_format<uint32_t, num_block, 16>();
 #pragma unroll
   for (uint32_t i = 0; i < num_block_y; ++i) {
     constexpr uint32_t load_block_elems = block_elems * arr_len;
     int offset_y = i * block_size_y;
-    auto payload_row =
-        payload_2d.xetla_select<num_block_x, 1, 16, 1>(i * num_block_x, 0);
-    detail::reset_tile_desc_core<
-        num_block_x,
-        block_size_x,
-        ld_blk_size_y,
-        scale_factor,
-        arr_len,
-        mem_transpose>(payload_row);
+    // auto payload_row =
+    //     payload_2d.xetla_select<num_block_x, 1, 16, 1>(i * num_block_x, 0);
+    // detail::reset_tile_desc_core<
+    //     num_block_x,
+    //     block_size_x,
+    //     ld_blk_size_y,
+    //     scale_factor,
+    //     arr_len,
+    //     mem_transpose>(payload_row);
 #pragma unroll
     for (uint32_t j = 0; j < num_block_x; j += arr_len) {
-      uint32_t offset_x = j * block_size_x;
-      xetla_tdescriptor tdesc = payload_row.row(j);
+      int32_t offset_x = j * block_size_x;
+      // xetla_tdescriptor tdesc = payload_row.row(j);
       auto reg_blk = tile.reg.xetla_select<load_block_elems, 1>(
           (i * num_block_x + j) * block_elems);
       constexpr uint32_t ld_blk_height = (reg_transpose && trans)
@@ -203,7 +203,6 @@ tile_load(tile_t& tile, payload_t& payload) {
       xetla_vector<dtype, tmp_size> reg_tmp;
 #pragma unroll
       for (uint32_t ii = 0; ii < block_size_y / ld_blk_size_y; ++ii) {
-        // offset_y += ld_blk_size_y;
         constexpr uint32_t load_elems = ld_blk_size_y * block_size_x * arr_len;
         reg_tmp.xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
             native_type_t<load_dtype>,
@@ -220,12 +219,14 @@ tile_load(tile_t& tile, payload_t& payload) {
             payload.surface_width,
             payload.surface_height,
             payload.surface_pitch,
-            mem_transpose
-                // ? (payload.offset_x + offset_y / scale_factor)
-                ? ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc)
-                : (payload.offset_x + offset_x / scale_factor),
-
-            payload.offset_y + (mem_transpose ? offset_x : offset_y));
+            payload.offset_x +
+                (mem_transpose ? (offset_y / (int)scale_factor +
+                                  ii * ld_blk_size_y / (int)scale_factor)
+                               : (offset_x / scale_factor)),
+
+            payload.offset_y +
+                (mem_transpose ? offset_x : (offset_y + ii * ld_blk_size_y)));
+
         if constexpr (reg_transpose && trans) {
           reg_blk.xetla_select<load_elems, 1>(ii * load_elems)
               .xetla_format<native_type_t<load_dtype>>() =
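
The core of this hunk: the block origin is no longer read back from a mutated tensor descriptor via xetla_get_tensor_offset_x/y, but recomputed from the loop indices each iteration. A minimal standalone sketch of that arithmetic (the helper name block_origin is hypothetical; the expressions mirror the added lines):

#include <cstdint>
#include <utility>

// Hedged sketch, not library code: the per-iteration block origin that the
// patched loop passes to xetla_load_global. mem_transpose swaps the roles of
// x and y; scale_factor converts element counts from dtype units into the
// wider load_dtype units used for the x coordinate.
inline std::pair<int32_t, int32_t> block_origin(
    int32_t base_x, int32_t base_y,     // payload.offset_x / payload.offset_y
    int32_t offset_x, int32_t offset_y, // j * block_size_x, i * block_size_y
    uint32_t ii, uint32_t ld_blk_size_y, uint32_t scale_factor,
    bool mem_transpose) {
  int32_t x = base_x +
      (mem_transpose
           ? offset_y / int32_t(scale_factor) +
               int32_t(ii * ld_blk_size_y) / int32_t(scale_factor)
           : offset_x / int32_t(scale_factor));
  int32_t y = base_y +
      (mem_transpose ? offset_x
                     : offset_y + int32_t(ii * ld_blk_size_y));
  return {x, y};
}
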
@@ -242,13 +243,6 @@ tile_load(tile_t& tile, payload_t& payload) {
         } else {
           reg_blk.xetla_select<tmp_size, 1>(ii * tmp_size) = reg_tmp;
         }
-        if constexpr (mem_transpose) {
-          xetla_update_tdesc_offsetx(
-              tdesc.xetla_format<uint32_t>(), ld_blk_size_y / scale_factor);
-        } else {
-          xetla_update_tdesc_offsety(
-              tdesc.xetla_format<uint32_t>(), ld_blk_size_y);
-        }
       }
       // exceed HW limitation
       if constexpr (block_size_y % ld_blk_size_y != 0) {
@@ -258,22 +252,22 @@ tile_load(tile_t& tile, payload_t& payload) {
             remained_start_y * block_size_x * arr_len;
         constexpr uint32_t remained_blk_size_y = block_size_y % ld_blk_size_y;
         constexpr uint32_t load_elems =
-            remained_blk_size_y * block_size_x * arr_len;
+            remained_blk_size_y * block_size_x * arr_len / scale_factor;
 
         constexpr uint8_t block_width =
-            mem_transpose ? (remained_blk_size_y / scale_factor) : block_size_x;
+            (mem_transpose ? remained_blk_size_y : block_size_x) / scale_factor;
         constexpr uint8_t block_height =
-            trans ? block_size_x : remained_blk_size_y;
-        constexpr uint32_t block_widthx_widthy_arrlen =
-            (block_width - 1) | ((block_height - 1) << 8);
-        gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen(
-            tdesc.xetla_format<uint32_t>(), block_widthx_widthy_arrlen);
+            mem_transpose ? block_size_x : remained_blk_size_y;
+        // constexpr uint32_t block_widthx_widthy_arrlen =
+        //     (block_width - 1) | ((block_height - 1) << 8);
+        // gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen(
+        //     tdesc.xetla_format<uint32_t>(), block_widthx_widthy_arrlen);
 
         reg_blk.xetla_select<load_elems, 1>(remained_start)
             .xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
                 native_type_t<load_dtype>,
-                block_size_x / scale_factor,
-                remained_blk_size_y,
+                block_width,
+                block_height,
                 arr_len,
                 trans,
                 mem_transform,
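
Note the semantic change in block_width for this remainder load: previously only the transposed path divided by scale_factor, now both paths do, and block_height selects on mem_transpose rather than trans; the load call then takes block_width/block_height instead of recomputing block_size_x / scale_factor inline. A hedged sketch with illustrative values showing the difference on the non-transposed path:

#include <cstdint>

// Hedged sketch of the remainder-block shape before and after this patch.
// Names mirror the diff; the values are illustrative only.
constexpr uint32_t block_size_x = 32, remained_blk_size_y = 4, scale_factor = 2;
constexpr bool mem_transpose = false;

// Before: only the transposed width was scaled by scale_factor.
constexpr uint8_t width_before =
    mem_transpose ? (remained_blk_size_y / scale_factor) : block_size_x; // 32

// After: both paths divide, matching the packed load_dtype element width.
constexpr uint8_t width_after =
    (mem_transpose ? remained_blk_size_y : block_size_x) / scale_factor; // 16

static_assert(width_before == 32 && width_after == 16, "shapes differ");
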
@@ -283,8 +277,9 @@ tile_load(tile_t& tile, payload_t& payload) {
                 payload.surface_width,
                 payload.surface_height,
                 payload.surface_pitch,
-                ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
-                ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
+                payload.offset_x + offset_x / scale_factor,
+                payload.offset_y + offset_y + remained_start_y);
+
         // xetla_tload_global<
         //     load_dtype,
         //     (load_elems / scale_factor),
@@ -305,18 +300,19 @@ tile_load(tile_t& tile, payload_t& payload) {
         (!reg_transpose && (remained_size_y > ld_blk_size_y_limit))
         ? ld_blk_size_y_limit
         : remained_size_y;
-    auto payload_row = payload_2d.xetla_select<num_block_x, 1, 16, 1>(
-        num_block_y * num_block_x, 0);
-    detail::reset_tile_desc_core<
-        num_block_x,
-        block_size_x,
-        remained_ld_blk_size_y,
-        scale_factor,
-        arr_len,
-        mem_transpose>(payload_row);
+    // auto payload_row = payload_2d.xetla_select<num_block_x, 1, 16, 1>(
+    //     num_block_y * num_block_x, 0);
+    // detail::reset_tile_desc_core<
+    //     num_block_x,
+    //     block_size_x,
+    //     remained_ld_blk_size_y,
+    //     scale_factor,
+    //     arr_len,
+    //     mem_transpose>(payload_row);
 #pragma unroll
     for (uint32_t j = 0; j < num_block_x; j += arr_len) {
-      xetla_tdescriptor tdesc = payload_row.row(j);
+      int32_t offset_x = j * block_size_x;
+      // xetla_tdescriptor tdesc = payload_row.row(j);
       auto reg_blk = tile.reg.xetla_select<remained_block_elems * arr_len, 1>(
           processed_elems + j * remained_block_elems);
       constexpr uint32_t ld_blk_height = (reg_transpose && trans)
@@ -343,8 +339,9 @@ tile_load(tile_t& tile, payload_t& payload) {
             payload.surface_width,
             payload.surface_height,
             payload.surface_pitch,
-            ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
-            ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
+            payload.offset_x + offset_x / scale_factor,
+            payload.offset_y + num_block_y * block_size_y +
+                ii * remained_ld_blk_size_y);
         // xetla_tload_global<
         //     load_dtype,
         //     (ld_blk_height * block_size_x * arr_len / scale_factor),
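
In this tail section the y origin is the first row below the num_block_y full blocks, advanced by ii remainder chunks. A tiny constexpr sketch of that index arithmetic (hypothetical helper name, illustrative values):

#include <cstdint>

// Hedged sketch: y origin of the ii-th chunk of the remainder rows, i.e.
// the rows left over below num_block_y full blocks. Names mirror the diff.
constexpr int32_t remainder_chunk_y(
    uint32_t num_block_y, uint32_t block_size_y,
    uint32_t ii, uint32_t remained_ld_blk_size_y) {
  return int32_t(num_block_y * block_size_y + ii * remained_ld_blk_size_y);
}
// 3 full blocks of 8 rows end at row 24; chunk ii == 2 of size 4 starts at 32.
static_assert(remainder_chunk_y(3, 8, 2, 4) == 32, "24 + 2 * 4");
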
@@ -370,14 +367,14 @@ tile_load(tile_t& tile, payload_t& payload) {
         } else {
           reg_blk.xetla_select<tmp_size, 1>(ii * tmp_size) = reg_tmp;
         }
-        if constexpr (mem_transpose) {
-          xetla_update_tdesc_offsetx(
-              tdesc.xetla_format<uint32_t>(),
-              remained_ld_blk_size_y / scale_factor);
-        } else {
-          xetla_update_tdesc_offsety(
-              tdesc.xetla_format<uint32_t>(), remained_ld_blk_size_y);
-        }
+        // if constexpr (mem_transpose) {
+        //   xetla_update_tdesc_offsetx(
+        //       tdesc.xetla_format<uint32_t>(),
+        //       remained_ld_blk_size_y / scale_factor);
+        // } else {
+        //   xetla_update_tdesc_offsety(
+        //       tdesc.xetla_format<uint32_t>(), remained_ld_blk_size_y);
+        // }
       }
       constexpr uint32_t final_ld_blk_size_y =
           remained_size_y % remained_ld_blk_size_y;
@@ -388,18 +385,18 @@ tile_load(tile_t& tile, payload_t& payload) {
       constexpr uint32_t final_load_elems =
           final_ld_blk_size_y * block_size_x * arr_len;
       constexpr uint8_t block_width =
-          mem_transpose ? (final_ld_blk_size_y / scale_factor) : block_size_x;
+          (mem_transpose ? final_ld_blk_size_y : block_size_x) / scale_factor;
       constexpr uint8_t block_height =
-          trans ? block_size_x : final_ld_blk_size_y;
-      constexpr uint32_t block_widthx_widthy_arrlen =
-          (block_width - 1) | ((block_height - 1) << 8);
-      gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen(
-          tdesc.xetla_format<uint32_t>(), block_widthx_widthy_arrlen);
+          mem_transpose ? block_size_x : final_ld_blk_size_y;
+      // constexpr uint32_t block_widthx_widthy_arrlen =
+      //     (block_width - 1) | ((block_height - 1) << 8);
+      // gpu::xetla::detail::xetla_set_block_widthx_widthy_arrlen(
+      //     tdesc.xetla_format<uint32_t>(), block_widthx_widthy_arrlen);
       reg_blk.xetla_select<final_load_elems, 1>(final_start)
           .xetla_format<native_type_t<load_dtype>>() = xetla_load_global<
               native_type_t<load_dtype>,
-              block_size_x / scale_factor,
-              final_ld_blk_size_y,
+              block_width,
+              block_height,
               arr_len,
               trans,
               mem_transform,
@@ -409,8 +406,10 @@ tile_load(tile_t& tile, payload_t& payload) {
               payload.surface_width,
               payload.surface_height,
               payload.surface_pitch,
-              ::gpu::xetla::detail::xetla_get_tensor_offset_x(tdesc),
-              ::gpu::xetla::detail::xetla_get_tensor_offset_y(tdesc));
+              payload.offset_x + offset_x / scale_factor,
+              payload.offset_y + num_block_y * block_size_y +
+                  remained_size_y / remained_ld_blk_size_y *
+                      remained_ld_blk_size_y);
       // xetla_tload_global<
       //     load_dtype,
       //     final_load_elems / scale_factor,
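
Taken together, the patch swaps a stateful descriptor walk (build a tdesc per block row, then nudge it with xetla_update_tdesc_offsetx/y after every load) for stateless per-iteration offset arithmetic, presumably so that each xetla_load_global call is independent and carries no loop dependency on descriptor state. A minimal standalone sketch of that pattern (illustrative sizes, not library code):

#include <cstdint>
#include <cstdio>

// Hedged sketch of the refactor's shape: every (i, j, ii) step derives its
// absolute origin from the indices instead of updating a shared descriptor.
int main() {
  constexpr uint32_t num_block_y = 2, num_block_x = 2;
  constexpr uint32_t block_size_x = 16, block_size_y = 8, ld_blk_size_y = 4;
  for (uint32_t i = 0; i < num_block_y; ++i) {
    int32_t offset_y = int32_t(i * block_size_y);
    for (uint32_t j = 0; j < num_block_x; ++j) {
      int32_t offset_x = int32_t(j * block_size_x);
      for (uint32_t ii = 0; ii < block_size_y / ld_blk_size_y; ++ii) {
        // Nothing is carried between iterations; each load is independent.
        std::printf("load block at (%d, %d)\n",
                    offset_x, offset_y + int32_t(ii * ld_blk_size_y));
      }
    }
  }
  return 0;
}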