Proposal load/store masked #1162
base: master
```cpp
@@ -18,6 +18,7 @@
#include <type_traits>

#include "../types/xsimd_avx_register.hpp"
#include "../types/xsimd_batch_constant.hpp"

namespace xsimd
{
@@ -871,6 +872,134 @@ namespace xsimd
            return _mm256_loadu_pd(mem);
        }

        // load_masked
        template <class A, bool... Values, class Mode>
        XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx>) noexcept
        {
            XSIMD_IF_CONSTEXPR(mask.none())
            {
                return _mm256_setzero_ps();
            }
            else XSIMD_IF_CONSTEXPR(mask.all())
            {
                return load<A>(mem, Mode {});
            }
            // confined to lower 128-bit half (4 lanes) → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
            {
                constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
                const auto lo = load_masked(mem, mlo, convert<float> {}, Mode {}, sse4_2 {});
                return batch<float, A>(detail::merge_sse(lo, batch<float, sse4_2>(0.f)));
```
> Review comment: could be a call to `_mm256_castps128_ps256` instead of merge + batch of zero.
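As a hedged sketch of that suggestion (my addition, not part of the diff): `_mm256_castps128_ps256` only reinterprets the register and leaves the upper 128 bits undefined, so the zero-extending `_mm256_zextps128_ps256` is the closer drop-in when the upper half must be zero, as in the branch above.

```cpp
// Sketch of the suggested lower-half shortcut (not part of the diff):
// zero-extend the 128-bit masked load into a 256-bit register instead of
// merging with an explicit zero batch. _mm256_zextps128_ps256 zeroes the
// upper lanes; _mm256_castps128_ps256 alone would leave them undefined.
const auto lo = load_masked(mem, mlo, convert<float> {}, Mode {}, sse4_2 {});
return batch<float, A>(_mm256_zextps128_ps256(lo));
```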
```cpp
            }
            // confined to upper 128-bit half (4 lanes) → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= 4)
            {
                constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
                const auto hi = load_masked(mem + 4, mhi, convert<float> {}, Mode {}, sse4_2 {});
                return batch<float, A>(detail::merge_sse(batch<float, sse4_2>(0.f), hi));
```
> Review comment: I don't know if we could use a similar trick to […]
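If the truncated remark is about a cast-style shortcut for the upper-half case as well, one candidate (an assumption on my part, not something the thread spells out) is to insert the 128-bit result into the high lane of a zeroed 256-bit register:

```cpp
// Hypothetical upper-half counterpart (not part of the diff): place the
// 128-bit masked load into lanes 4..7 of a zeroed 256-bit register.
const auto hi = load_masked(mem + 4, mhi, convert<float> {}, Mode {}, sse4_2 {});
return batch<float, A>(_mm256_insertf128_ps(_mm256_setzero_ps(), hi, 1));
```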
```cpp
            }
            else
            {
                // crossing 128-bit boundary → use 256-bit masked load
                return _mm256_maskload_ps(mem, mask.as_batch());
            }
        }

        template <class A, bool... Values, class Mode>
        XSIMD_INLINE batch<double, A> load_masked(double const* mem, batch_bool_constant<double, A, Values...> mask, convert<double>, Mode, requires_arch<avx>) noexcept
        {
            XSIMD_IF_CONSTEXPR(mask.none())
            {
                return _mm256_setzero_pd();
            }
            else XSIMD_IF_CONSTEXPR(mask.all())
            {
                return load<A>(mem, Mode {});
            }
            // confined to lower 128-bit half (2 lanes) → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 2)
            {
                constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
                const auto lo = load_masked(mem, mlo, convert<double> {}, Mode {}, sse4_2 {});
                return batch<double, A>(detail::merge_sse(lo, batch<double, sse4_2>(0.0)));
            }
            // confined to upper 128-bit half (2 lanes) → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= 2)
            {
                constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
                const auto hi = load_masked(mem + 2, mhi, convert<double> {}, Mode {}, sse4_2 {});
                return batch<double, A>(detail::merge_sse(batch<double, sse4_2>(0.0), hi));
            }
            else
            {
                // crossing 128-bit boundary → use 256-bit masked load
                return _mm256_maskload_pd(mem, mask.as_batch());
            }
        }
```
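For orientation, a hedged usage sketch of how one of these kernels could be exercised. Calling the kernel overload directly and the exact tag arguments shown are assumptions for illustration; real user code would normally reach this through a higher-level load wrapper.

```cpp
#include <xsimd/xsimd.hpp>

// Hedged usage sketch (assumptions: the kernel overload is reachable as
// xsimd::kernel::load_masked and accepts the tag arguments shown below).
// The mask selects only the low four of the eight float lanes, so the AVX
// kernel above forwards to the 128-bit (sse4_2) path.
using low_half_mask = xsimd::batch_bool_constant<float, xsimd::avx,
                                                 true, true, true, true,
                                                 false, false, false, false>;

xsimd::batch<float, xsimd::avx> load_low_half(float const* mem)
{
    return xsimd::kernel::load_masked(mem, low_half_mask {},
                                      xsimd::kernel::convert<float> {},
                                      xsimd::unaligned_mode {},
                                      xsimd::avx {});
}
```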
```cpp
        // store_masked
        template <class A, bool... Values, class Mode>
        XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx>) noexcept
        {
            XSIMD_IF_CONSTEXPR(mask.none())
            {
                return;
            }
            else XSIMD_IF_CONSTEXPR(mask.all())
            {
                src.store(mem, Mode {});
            }
            // confined to lower 128-bit half (4 lanes) → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 4)
            {
                constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
                const batch<float, sse4_2> lo(_mm256_castps256_ps128(src));
                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
            }
            // confined to upper 128-bit half (4 lanes) → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= 4)
            {
                constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
                const batch<float, sse4_2> hi(_mm256_extractf128_ps(src, 1));
                store_masked<sse4_2>(mem + 4, hi, mhi, Mode {}, sse4_2 {});
            }
            else
            {
                _mm256_maskstore_ps(mem, mask.as_batch(), src);
            }
        }

        template <class A, bool... Values, class Mode>
        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx>) noexcept
        {
            XSIMD_IF_CONSTEXPR(mask.none())
            {
                return;
            }
            else XSIMD_IF_CONSTEXPR(mask.all())
            {
                src.store(mem, Mode {});
            }
            // confined to lower 128-bit half (2 lanes) → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= 2)
            {
                constexpr auto mlo = ::xsimd::detail::lower_half<sse2>(mask);
                const batch<double, sse2> lo(_mm256_castpd256_pd128(src));
```
> Review comment: that's a lower_half, right?
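For reference, an editorial snippet (not part of the thread) confirming what the cast does:

```cpp
#include <immintrin.h>

// _mm256_castpd256_pd128 keeps the low 128 bits (lanes 0..1).
// _mm256_set_pd takes its arguments high-to-low, so lane 0 holds 0.0 here.
__m128d lower_half_demo()
{
    return _mm256_castpd256_pd128(_mm256_set_pd(3.0, 2.0, 1.0, 0.0)); // {0.0, 1.0}
}
```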
```cpp
                store_masked<sse2>(mem, lo, mlo, Mode {}, sse2 {});
            }
            // confined to upper 128-bit half (2 lanes) → forward to SSE2
            else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= 2)
            {
                constexpr auto mhi = ::xsimd::detail::upper_half<sse2>(mask);
                const batch<double, sse2> hi(_mm256_extractf128_pd(src, 1));
```
> Review comment: that's an upper_half, right?
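Likewise, an editorial snippet (not part of the thread) for the extract:

```cpp
#include <immintrin.h>

// _mm256_extractf128_pd(x, 1) extracts the high 128 bits (lanes 2..3),
// so this is indeed the upper half.
__m128d upper_half_demo()
{
    return _mm256_extractf128_pd(_mm256_set_pd(3.0, 2.0, 1.0, 0.0), 1); // {2.0, 3.0}
}
```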
```cpp
                store_masked<sse2>(mem + 2, hi, mhi, Mode {}, sse2 {});
            }
            else
            {
                _mm256_maskstore_pd(mem, mask.as_batch(), src);
            }
        }
```
> Review comment: Epiphany: if you express everything in terms of lower/upper half, you're pretty close to a type-generic implementation that would avoid a lot of redundancy. You just need to wrap the actual […]

> Reply: I guess you are right. It might be possible to express things in terms of size and halves. This might be recursive though, and we might need to map the arch to a half arch, similarly to what happens in widen.

> Reply: I'll also have to think about whether the size of the element matters. I'll have a look next week.

> Reply: The reason why I kept the implementations separate is as follows. A general implementation in terms of upper/lower is possible. It halves the code in avx, but avx2 still has to provide a new implementation so that it can take advantage of the new intrinsics, so we go from 2 versions in avx down to 1, and from 2 in avx2 down to 1. Same for avx512. But then for merging we cannot use more optimized intrinsics than `_mm256_castps128_ps256`. Is it worth it? The solution here is to pass the MaskLoad/Store and the merge function as templates to this general implementation; then we could have 1 general implementation, maybe for all the kernels. But I think it might be too much templating. In general, wrapping `_mm256_maskstore*` also makes a clean way to expose this to the user if we wish to do so. What do you suggest?
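To make that direction concrete, a rough hedged sketch of a half-based generic helper (hypothetical name and signature, not how this PR is written): the arch-specific kernel would only supply the full-width masked intrinsic and the half-merge, while the none/all/lower/upper dispatch is shared. The sketch assumes it sits next to the existing kernels, so the xsimd internals it touches are available.

```cpp
// Hypothetical shared dispatch (not part of the PR). The arch-specific code
// passes in the full-width masked load and the half-merge as callables.
template <class T, class A, class HalfArch, bool... Values,
          class Mode, class MaskedLoad, class MergeHalves>
XSIMD_INLINE batch<T, A> load_masked_generic(T const* mem,
                                             batch_bool_constant<T, A, Values...> mask,
                                             Mode mode,
                                             MaskedLoad&& masked_load,     // (mem, mask) -> batch<T, A>
                                             MergeHalves&& merge_halves)   // (lo, hi)    -> batch<T, A>
{
    constexpr std::size_t half = batch<T, A>::size / 2;
    XSIMD_IF_CONSTEXPR(mask.none())
    {
        return batch<T, A>(T(0));
    }
    else XSIMD_IF_CONSTEXPR(mask.all())
    {
        return load<A>(mem, mode);
    }
    // confined to the lower half → forward to the half-width arch
    else XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half)
    {
        constexpr auto mlo = ::xsimd::detail::lower_half<HalfArch>(mask);
        const auto lo = load_masked(mem, mlo, convert<T> {}, mode, HalfArch {});
        return merge_halves(lo, batch<T, HalfArch>(T(0)));
    }
    // confined to the upper half → forward to the half-width arch
    else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half)
    {
        constexpr auto mhi = ::xsimd::detail::upper_half<HalfArch>(mask);
        const auto hi = load_masked(mem + half, mhi, convert<T> {}, mode, HalfArch {});
        return merge_halves(batch<T, HalfArch>(T(0)), hi);
    }
    else
    {
        // crossing the half boundary → full-width masked intrinsic
        return masked_load(mem, mask);
    }
}
```

Passing both `masked_load` and `merge_halves` in keeps the specialized intrinsics available per architecture, at the cost of the extra templating the author worries about.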
```cpp
        // lt
        template <class A>
        XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
```
> Review comment: Why do you need that change?

> Reply: I removed some includes from xsimd_common_fwd and replaced them with forward declarations, so now the concrete types need to be included in some of the other headers. This speeds up compilation a bit, and I thought that the fwd header should really contain only forward declarations.