 #include <type_traits>
 
 #include "../types/xsimd_avx_register.hpp"
+#include "../types/xsimd_batch_constant.hpp"
 
 namespace xsimd
 {
@@ -36,20 +37,35 @@ namespace xsimd
 
         namespace detail
         {
-            XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
+            XSIMD_INLINE __m128i lower_half(__m256i self) noexcept
             {
-                low = _mm256_castsi256_si128(val);
-                high = _mm256_extractf128_si256(val, 1);
+                return _mm256_castsi256_si128(self);
             }
-            XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
+            XSIMD_INLINE __m128 lower_half(__m256 self) noexcept
             {
-                low = _mm256_castps256_ps128(val);
-                high = _mm256_extractf128_ps(val, 1);
+                return _mm256_castps256_ps128(self);
             }
-            XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
+            XSIMD_INLINE __m128d lower_half(__m256d self) noexcept
             {
-                low = _mm256_castpd256_pd128(val);
-                high = _mm256_extractf128_pd(val, 1);
+                return _mm256_castpd256_pd128(self);
+            }
+            XSIMD_INLINE __m128i upper_half(__m256i self) noexcept
+            {
+                return _mm256_extractf128_si256(self, 1);
+            }
+            XSIMD_INLINE __m128 upper_half(__m256 self) noexcept
+            {
+                return _mm256_extractf128_ps(self, 1);
+            }
+            XSIMD_INLINE __m128d upper_half(__m256d self) noexcept
+            {
+                return _mm256_extractf128_pd(self, 1);
+            }
+            template <class Full, class Half>
+            XSIMD_INLINE void split_avx(Full val, Half& low, Half& high) noexcept
+            {
+                low = lower_half(val);
+                high = upper_half(val);
             }
             XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
             {
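The lower_half/upper_half/merge_sse helpers above are the standard way to emulate a 256-bit operation on AVX1 by working on two 128-bit halves. A minimal sketch, not taken from this diff and using raw intrinsics only, of what that pattern looks like for a 256-bit integer add (AVX1 has no 256-bit integer ALU):

// Sketch: emulate a 256-bit integer add on AVX1 by splitting into two SSE
// halves, operating on each, and merging back. Illustrative only.
#include <immintrin.h>

inline __m256i add_epi32_avx1(__m256i a, __m256i b)
{
    __m128i a_lo = _mm256_castsi256_si128(a);      // lower_half
    __m128i a_hi = _mm256_extractf128_si256(a, 1); // upper_half
    __m128i b_lo = _mm256_castsi256_si128(b);
    __m128i b_hi = _mm256_extractf128_si256(b, 1);
    __m128i r_lo = _mm_add_epi32(a_lo, b_lo);
    __m128i r_hi = _mm_add_epi32(a_hi, b_hi);
    // merge_sse: re-insert the high half into a 256-bit register
    return _mm256_insertf128_si256(_mm256_castsi128_si256(r_lo), r_hi, 1);
}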
@@ -865,6 +881,134 @@ namespace xsimd
             return _mm256_loadu_pd(mem);
         }
 
+        // load_masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (mask.none())
+            {
+                return _mm256_setzero_ps();
+            }
+            else XSIMD_IF_CONSTEXPR (mask.all())
+            {
+                return load<A>(mem, Mode {});
+            }
+            // confined to lower 128-bit half (4 lanes) → forward to SSE
+            else XSIMD_IF_CONSTEXPR (mask.countl_zero() >= 4)
+            {
+                constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
+                const auto lo = load_masked(mem, mlo, convert<float> {}, Mode {}, sse4_2 {});
+                return batch<float, A>(detail::merge_sse(lo, batch<float, sse4_2>(0.f)));
+            }
+            // confined to upper 128-bit half (4 lanes) → forward to SSE
+            else XSIMD_IF_CONSTEXPR (mask.countr_zero() >= 4)
+            {
+                constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
+                const auto hi = load_masked(mem + 4, mhi, convert<float> {}, Mode {}, sse4_2 {});
+                return batch<float, A>(detail::merge_sse(batch<float, sse4_2>(0.f), hi));
+            }
+            else
+            {
+                // crossing 128-bit boundary → use 256-bit masked load
+                return _mm256_maskload_ps(mem, mask.as_batch());
+            }
+        }
916+
917+ template <class A , bool ... Values, class Mode >
918+ XSIMD_INLINE batch<double , A> load_masked (double const * mem, batch_bool_constant<double , A, Values...> mask, convert<double >, Mode, requires_arch<avx>) noexcept
919+ {
920+ XSIMD_IF_CONSTEXPR (mask.none ())
921+ {
922+ return _mm256_setzero_pd ();
923+ }
924+ else XSIMD_IF_CONSTEXPR (mask.all ())
925+ {
926+ return load<A>(mem, Mode {});
927+ }
928+ // confined to lower 128-bit half (2 lanes) → forward to SSE2
929+ else XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 2 )
930+ {
931+ constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
932+ const auto lo = load_masked (mem, mlo, convert<double > {}, Mode {}, sse4_2 {});
933+ return batch<double , A>(detail::merge_sse (lo, batch<double , sse4_2>(0.0 )));
934+ }
935+ // confined to upper 128-bit half (2 lanes) → forward to SSE2
936+ else XSIMD_IF_CONSTEXPR (mask.countr_zero () >= 2 )
937+ {
938+ constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
939+ const auto hi = load_masked (mem + 2 , mhi, convert<double > {}, Mode {}, sse4_2 {});
940+ return batch<double , A>(detail::merge_sse (batch<double , sse4_2>(0.0 ), hi));
941+ }
942+ else
943+ {
944+ // crossing 128-bit boundary → use 256-bit masked load
945+ return _mm256_maskload_pd (mem, mask.as_batch ());
946+ }
947+ }
948+
949+ // store_masked
950+ template <class A , bool ... Values, class Mode >
951+ XSIMD_INLINE void store_masked (float * mem, batch<float , A> const & src, batch_bool_constant<float , A, Values...> mask, Mode, requires_arch<avx>) noexcept
952+ {
953+ XSIMD_IF_CONSTEXPR (mask.none ())
954+ {
955+ return ;
956+ }
957+ else XSIMD_IF_CONSTEXPR (mask.all ())
958+ {
959+ src.store (mem, Mode {});
960+ }
961+ // confined to lower 128-bit half (4 lanes) → forward to SSE2
962+ else XSIMD_IF_CONSTEXPR (mask.countl_zero () >= 4 )
963+ {
964+ constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
965+ const batch<float , sse4_2> lo (_mm256_castps256_ps128 (src));
966+ store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
967+ }
968+ // confined to upper 128-bit half (4 lanes) → forward to SSE2
969+ else XSIMD_IF_CONSTEXPR (mask.countr_zero () >= 4 )
970+ {
971+ constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
972+ const batch<float , sse4_2> hi (_mm256_extractf128_ps (src, 1 ));
973+ store_masked<sse4_2>(mem + 4 , hi, mhi, Mode {}, sse4_2 {});
974+ }
975+ else
976+ {
977+ _mm256_maskstore_ps (mem, mask.as_batch (), src);
978+ }
979+ }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (mask.none())
+            {
+                return;
+            }
+            else XSIMD_IF_CONSTEXPR (mask.all())
+            {
+                src.store(mem, Mode {});
+            }
+            // confined to lower 128-bit half (2 lanes) → forward to SSE
+            else XSIMD_IF_CONSTEXPR (mask.countl_zero() >= 2)
+            {
+                constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
+                const batch<double, sse4_2> lo(_mm256_castpd256_pd128(src));
+                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
+            }
+            // confined to upper 128-bit half (2 lanes) → forward to SSE
+            else XSIMD_IF_CONSTEXPR (mask.countr_zero() >= 2)
+            {
+                constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
+                const batch<double, sse4_2> hi(_mm256_extractf128_pd(src, 1));
+                store_masked<sse4_2>(mem + 2, hi, mhi, Mode {}, sse4_2 {});
+            }
+            else
+            {
+                _mm256_maskstore_pd(mem, mask.as_batch(), src);
+            }
+        }
+
         // lt
         template <class A>
         XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
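A minimal usage sketch for the new entry points, not taken from this diff: it drives the AVX load_masked/store_masked kernels above directly through the kernel layer. The xsimd::kernel qualification, the convert<float> and aligned_mode tags, and the direct kernel invocation follow xsimd's existing conventions but are assumptions here; the diff itself does not show a public dispatching wrapper.

// Sketch only: exercises the AVX load_masked/store_masked kernels added above.
// Calling the kernel layer directly and the tag types used below are assumptions
// based on xsimd's conventions, not part of this change.
#include <xsimd/xsimd.hpp>

int main()
{
    alignas(32) float in[8] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
    alignas(32) float out[8] = {};

    // Only the four low lanes are active, so the "confined to lower 128-bit
    // half" branch above should be taken and forwarded to the SSE kernel.
    constexpr xsimd::batch_bool_constant<float, xsimd::avx,
                                         true, true, true, true,
                                         false, false, false, false> mask {};

    auto v = xsimd::kernel::load_masked(in, mask, xsimd::kernel::convert<float> {},
                                        xsimd::aligned_mode {}, xsimd::avx {});
    xsimd::kernel::store_masked(out, v, mask, xsimd::aligned_mode {}, xsimd::avx {});
    // out[0..3] now mirror in[0..3]; out[4..7] are left untouched (0.f)
}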