@@ -256,7 +256,7 @@ constexpr __ESIMD_NS::atomic_op get_atomic_op(gpu::xetla::atomic_op ao) {
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     cache_hint L1H = cache_hint::cached,
     cache_hint L2H = cache_hint::cached,
@@ -293,7 +293,7 @@ __XETLA_API void xetla_prefetch_global(
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     cache_hint L1H = cache_hint::cached,
     cache_hint L2H = cache_hint::cached>
@@ -385,7 +385,7 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     cache_hint L1H = cache_hint::none,
     cache_hint L2H = cache_hint::none,
@@ -431,7 +431,7 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_global(
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     cache_hint L1H = cache_hint::none,
     cache_hint L2H = cache_hint::none,
@@ -653,7 +653,7 @@ __XETLA_API void xetla_local_init() {
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     int N>
 __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
@@ -670,35 +670,31 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
       xetla_cvt<uint64_t, uint32_t>(offsets), pred);
 }

-/// @brief SLM block load. (transposed gather with 1 channel).
-/// Collects elements located at slm and returns them as a single \ref
-/// xetla_vector object.
-///
-/// Supported platforms: DG2, PVC
-///
-/// VISA instruction: lsc_load.slm
-///
-/// @tparam Ty is element type.
-/// @tparam NElts is the number of elements to load per address (i.e.
-/// vector_size per SIMD channel).
-/// @tparam DS is the data size.
-/// @param offset [in] is the zero-based offset for SLM buffer in bytes.
-/// @return is a xetla_vector of type T and size NElts.
-///
-template <
-    typename Ty,
-    uint8_t NElts = 1,
-    data_size DS = data_size::default_size>
+/// Loads a contiguous block of SLM memory referenced by the given byte
+/// offset \p offset, then returns the loaded data as a single \ref
+/// xetla_vector object.
+/// The generated code depends on the combination {Ty, NElts}.
+/// An alignment of 16 bytes or more produces more efficient code; a
+/// smaller alignment produces a less efficient gather. If the loaded
+/// vector is too long for one flat-load GPU instruction, a series of
+/// flat-loads and/or gathers may be generated.
+/// @tparam Ty is the element type.
+/// @tparam NElts is the number of elements to load.
+/// @tparam DS is the data size.
+/// @param offset [in] is the zero-based offset for the SLM buffer in
+/// bytes.
+/// @return is a xetla_vector of type Ty and size NElts.
+///
+template <typename Ty, int NElts = 1, data_size DS = data_size::default_size>
 __XETLA_API xetla_vector<Ty, NElts> xetla_load_local(uint32_t offset) {
   using T = native_type_t<Ty>;
-  DEBUG_INVOKE(
-      dbg_level::core,
-      core::general_1d<gpu_arch::XeHpc, Ty>::template check_restriction<NElts>(
-          (uint64_t)offset));
+  // DEBUG_INVOKE(
+  //     dbg_level::core,
+  //     core::general_1d<gpu_arch::XeHpc, Ty>::template
+  //     check_restriction<NElts>(
+  //         (uint64_t)offset));

-  return __ESIMD_ENS::
-      lsc_slm_block_load<T, NElts, gpu::xetla::detail::get_data_size(DS)>(
-          offset);
+  return __ESIMD_NS::slm_block_load<T, NElts>(offset);
 }

 /// @brief SLM scattered store.
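Note: a minimal usage sketch of the block-load path after this change; the element type, tile size, and surrounding kernel are illustrative assumptions, not part of the commit.

    // Hedged sketch: assumes this runs inside an XeTLA kernel after SLM
    // has been allocated (e.g. via xetla_local_init).
    using namespace gpu::xetla;

    constexpr int tile_elems = 16;  // NElts is 'int' now, not 'uint8_t'
    uint32_t byte_offset = 0;       // 16-byte aligned, so one block load

    // Loads 16 contiguous floats from SLM as a single vector; with this
    // patch it lowers to __ESIMD_NS::slm_block_load<float, 16>(byte_offset).
    xetla_vector<float, tile_elems> tile =
        xetla_load_local<float, tile_elems>(byte_offset);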
@@ -719,7 +715,7 @@ __XETLA_API xetla_vector<Ty, NElts> xetla_load_local(uint32_t offset) {
 ///
 template <
     typename Ty,
-    uint8_t NElts = 1,
+    int NElts = 1,
     data_size DS = data_size::default_size,
     int N>
 __XETLA_API void xetla_store_local(
@@ -737,36 +733,38 @@ __XETLA_API void xetla_store_local(
       offsets, vals, pred);
 }

-/// @brief SLM block store (transposed SLM scatter with 1 channel).
-/// Scatters elements located to slm.
-///
-/// Supported platforms: DG2, PVC
-///
-/// VISA instruction: lsc_store.slm
-///
-/// @tparam Ty is element type.
-/// @tparam NElts is the number of elements to store per address (i.e.
-/// vector_size per SIMD channel).
-/// @tparam DS is the data size.
-/// @param offset [in] is the zero-based offset for SLM buffer in bytes.
-/// @param vals [in] is values to store.
-///
-template <
-    typename Ty,
-    uint8_t NElts = 1,
-    data_size DS = data_size::default_size>
+/// Stores the elements of the vector \p vals to a contiguous block of
+/// SLM memory at the given byte offset \p offset, writing them as a
+/// single block rather than a per-channel scatter when possible.
+/// The generated code depends on the combination {Ty, NElts}.
+/// An alignment of 16 bytes or more produces more efficient code; a
+/// smaller alignment produces a less efficient scatter. If the stored
+/// vector is too long for one flat-store GPU instruction, a series of
+/// flat-stores and/or scatters may be generated.
+/// @tparam Ty is the element type.
+/// @tparam NElts is the number of elements to store.
+/// @tparam DS is the data size.
+/// @param offset [in] is the zero-based offset for the SLM buffer in
+/// bytes.
+/// @param vals [in] is the values to store.
+///
+template <typename Ty, int NElts = 1, data_size DS = data_size::default_size>
 __XETLA_API void xetla_store_local(
     uint32_t offset,
     xetla_vector<Ty, NElts> vals) {
-  using T = native_type_t<Ty>;
-  DEBUG_INVOKE(
-      dbg_level::core,
-      core::general_1d<gpu_arch::XeHpc, Ty>::template check_restriction<NElts>(
-          offset));
-
-  __ESIMD_ENS::
-      lsc_slm_block_store<T, NElts, gpu::xetla::detail::get_data_size(DS)>(
-          offset, vals);
+  // using T = native_type_t<Ty>;
+  // DEBUG_INVOKE(
+  //     dbg_level::core,
+  //     core::general_1d<gpu_arch::XeHpc, Ty>::template
+  //     check_restriction<NElts>(
+  //         offset));
+
+  // __ESIMD_ENS::
+  //     lsc_slm_block_store<T, NElts, gpu::xetla::detail::get_data_size(DS)>(
+  //         offset, vals);
+  // __ESIMD_NS::properties props{};
+
+  __ESIMD_NS::slm_block_store<Ty, NElts>(offset, vals);
 }

 /// @brief SLM scattered atomic (0 src).
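Note: a matching store-side sketch under the same assumptions (values are illustrative; the (base, step) constructor comes from the ESIMD simd type that xetla_vector aliases).

    using namespace gpu::xetla;

    constexpr int n = 32;
    xetla_vector<uint32_t, n> vals(0, 1);  // 0, 1, 2, ... (base, step)

    // Writes 32 contiguous uint32_t values at byte offset 0; with this
    // patch it lowers to __ESIMD_NS::slm_block_store<uint32_t, 32>(0, vals).
    xetla_store_local<uint32_t, n>(0, vals);

    // Round-trip check: read the block back with the updated block load.
    auto readback = xetla_load_local<uint32_t, n>(0);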