@@ -36,20 +36,35 @@ namespace xsimd
 
         namespace detail
         {
-            XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
+            XSIMD_INLINE __m128i lower_half(__m256i self) noexcept
             {
-                low = _mm256_castsi256_si128(val);
-                high = _mm256_extractf128_si256(val, 1);
+                return _mm256_castsi256_si128(self);
             }
-            XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
+            XSIMD_INLINE __m128 lower_half(__m256 self) noexcept
             {
-                low = _mm256_castps256_ps128(val);
-                high = _mm256_extractf128_ps(val, 1);
+                return _mm256_castps256_ps128(self);
             }
-            XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
+            XSIMD_INLINE __m128d lower_half(__m256d self) noexcept
             {
-                low = _mm256_castpd256_pd128(val);
-                high = _mm256_extractf128_pd(val, 1);
+                return _mm256_castpd256_pd128(self);
+            }
+            XSIMD_INLINE __m128i upper_half(__m256i self) noexcept
+            {
+                return _mm256_extractf128_si256(self, 1);
+            }
+            XSIMD_INLINE __m128 upper_half(__m256 self) noexcept
+            {
+                return _mm256_extractf128_ps(self, 1);
+            }
+            XSIMD_INLINE __m128d upper_half(__m256d self) noexcept
+            {
+                return _mm256_extractf128_pd(self, 1);
+            }
+            template <class Full, class Half>
+            XSIMD_INLINE void split_avx(Full val, Half& low, Half& high) noexcept
+            {
+                low = lower_half(val);
+                high = upper_half(val);
             }
             XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
             {
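The hunk above replaces the three per-type split_avx overloads with single-register lower_half / upper_half helpers and rebuilds split_avx as a generic template on top of them; merge_sse (continuing below) stays the inverse operation. Plain AVX has no 256-bit integer arithmetic, which is exactly why integer kernels split work across the two 128-bit halves. A minimal stand-alone sketch of the split/compute/merge pattern these helpers serve, using only the AVX intrinsics shown in the diff (negate_epi32 is a made-up illustration, not part of the patch):

```cpp
#include <immintrin.h>

// Illustration only: negate eight packed int32 lanes by splitting the 256-bit
// register into two 128-bit halves, working on each half with SSE2, and merging back.
inline __m256i negate_epi32(__m256i self) noexcept
{
    __m128i lo = _mm256_castsi256_si128(self);      // lower_half: zero-cost cast, lanes 0-3
    __m128i hi = _mm256_extractf128_si256(self, 1); // upper_half: extract lanes 4-7
    lo = _mm_sub_epi32(_mm_setzero_si128(), lo);    // 128-bit work on each half
    hi = _mm_sub_epi32(_mm_setzero_si128(), hi);
    // merge_sse: reassemble the 256-bit result from the two halves
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
```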
@@ -63,6 +78,17 @@ namespace xsimd
         {
             return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1);
         }
+            template <class T>
+            XSIMD_INLINE batch<T, sse4_2> lower_half(batch<T, avx> const& self) noexcept
+            {
+                return lower_half(self.data);
+            }
+            template <class T>
+            XSIMD_INLINE batch<T, sse4_2> upper_half(batch<T, avx> const& self) noexcept
+            {
+                return upper_half(self.data);
+            }
+
             template <class F>
             XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept
             {
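The batch-level lower_half / upper_half overloads added here give kernels typed access to each 128-bit half of an AVX batch as a batch<T, sse4_2>, complementing the register-level fwd_to_sse helper shown in context. A hedged sketch of how a kernel could use them, written as if it sat next to the other kernels in this file so the unqualified names resolve as in the diff (sum_halves is hypothetical, not part of the patch):

```cpp
// Hypothetical kernel: add the upper 128-bit half of an AVX float batch to its
// lower half, returning the 4-lane SSE result. Illustration only.
template <class A>
XSIMD_INLINE batch<float, sse4_2> sum_halves(batch<float, A> const& self, requires_arch<avx>) noexcept
{
    batch<float, sse4_2> lo = detail::lower_half(self); // lanes 0-3
    batch<float, sse4_2> hi = detail::upper_half(self); // lanes 4-7
    return lo + hi;                                     // 128-bit addition on the two halves
}
```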
@@ -865,6 +891,134 @@ namespace xsimd
             return _mm256_loadu_pd(mem);
         }
 
+        // load_masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (mask.none())
+            {
+                return _mm256_setzero_ps();
+            }
+            else XSIMD_IF_CONSTEXPR (mask.all())
+            {
+                return load<A>(mem, Mode {});
+            }
+            // confined to lower 128-bit half (4 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countl_zero() >= 4)
+            {
+                constexpr auto mlo = mask.template lower_half<sse4_2>();
+                const auto lo = load_masked(mem, mlo, convert<float> {}, Mode {}, sse4_2 {});
+                return batch<float, A>(detail::merge_sse(lo, batch<float, sse4_2>(0.f)));
+            }
+            // confined to upper 128-bit half (4 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countr_zero() >= 4)
+            {
+                constexpr auto mhi = mask.template upper_half<sse4_2>();
+                const auto hi = load_masked(mem + 4, mhi, convert<float> {}, Mode {}, sse4_2 {});
+                return batch<float, A>(detail::merge_sse(batch<float, sse4_2>(0.f), hi));
+            }
+            else
+            {
+                // crossing 128-bit boundary → use 256-bit masked load
+                return _mm256_maskload_ps(mem, mask.as_batch());
+            }
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<double, A> load_masked(double const* mem, batch_bool_constant<double, A, Values...> mask, convert<double>, Mode, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (mask.none())
+            {
+                return _mm256_setzero_pd();
+            }
+            else XSIMD_IF_CONSTEXPR (mask.all())
+            {
+                return load<A>(mem, Mode {});
+            }
+            // confined to lower 128-bit half (2 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countl_zero() >= 2)
+            {
+                constexpr auto mlo = mask.template lower_half<sse4_2>();
+                const auto lo = load_masked(mem, mlo, convert<double> {}, Mode {}, sse4_2 {});
+                return batch<double, A>(detail::merge_sse(lo, batch<double, sse4_2>(0.0)));
+            }
+            // confined to upper 128-bit half (2 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countr_zero() >= 2)
+            {
+                constexpr auto mhi = mask.template upper_half<sse4_2>();
+                const auto hi = load_masked(mem + 2, mhi, convert<double> {}, Mode {}, sse4_2 {});
+                return batch<double, A>(detail::merge_sse(batch<double, sse4_2>(0.0), hi));
+            }
+            else
+            {
+                // crossing 128-bit boundary → use 256-bit masked load
+                return _mm256_maskload_pd(mem, mask.as_batch());
+            }
+        }
+
+        // store_masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (mask.none())
+            {
+                return;
+            }
+            else XSIMD_IF_CONSTEXPR (mask.all())
+            {
+                src.store(mem, Mode {});
+            }
+            // confined to lower 128-bit half (4 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countl_zero() >= 4)
+            {
+                constexpr auto mlo = mask.template lower_half<sse4_2>();
+                const batch<float, sse4_2> lo(_mm256_castps256_ps128(src));
+                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
+            }
+            // confined to upper 128-bit half (4 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countr_zero() >= 4)
+            {
+                constexpr auto mhi = mask.template upper_half<sse4_2>();
+                const batch<float, sse4_2> hi(_mm256_extractf128_ps(src, 1));
+                store_masked<sse4_2>(mem + 4, hi, mhi, Mode {}, sse4_2 {});
+            }
+            else
+            {
+                _mm256_maskstore_ps(mem, mask.as_batch(), src);
+            }
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR (mask.none())
+            {
+                return;
+            }
+            else XSIMD_IF_CONSTEXPR (mask.all())
+            {
+                src.store(mem, Mode {});
+            }
+            // confined to lower 128-bit half (2 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countl_zero() >= 2)
+            {
+                constexpr auto mlo = mask.template lower_half<sse4_2>();
+                const batch<double, sse4_2> lo(_mm256_castpd256_pd128(src));
+                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
+            }
+            // confined to upper 128-bit half (2 lanes) → forward to SSE2
+            else XSIMD_IF_CONSTEXPR (mask.countr_zero() >= 2)
+            {
+                constexpr auto mhi = mask.template upper_half<sse4_2>();
+                const batch<double, sse4_2> hi(_mm256_extractf128_pd(src, 1));
+                store_masked<sse4_2>(mem + 2, hi, mhi, Mode {}, sse4_2 {});
+            }
+            else
+            {
+                _mm256_maskstore_pd(mem, mask.as_batch(), src);
+            }
+        }
+
         // lt
         template <class A>
         XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
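Both masked kernels follow the same compile-time dispatch: an empty mask becomes a zero/no-op, a full mask becomes a plain load/store, a mask confined to one 128-bit half is forwarded to the corresponding SSE kernel on half the data, and only a mask that straddles the 128-bit boundary pays for a real vmaskmov. A hedged usage sketch, written as if it lived next to these kernels so the unqualified names (convert, unaligned_mode, the arch tags) resolve as in the diff; double_first_three is illustrative only:

```cpp
// Illustration only: load the first three floats of an 8-lane AVX batch
// (remaining lanes zero-filled), double them, and store them back through the
// same mask. The mask is confined to the lower 128-bit half, so both calls
// take the SSE forwarding path rather than a 256-bit vmaskmov.
XSIMD_INLINE void double_first_three(float const* in, float* out) noexcept
{
    constexpr batch_bool_constant<float, avx,
                                  true, true, true, false, false, false, false, false>
        mask {};
    auto v = load_masked(in, mask, convert<float> {}, unaligned_mode {}, avx {});
    auto r = v + v; // inactive lanes stay zero
    store_masked(out, r, mask, unaligned_mode {}, avx {});
}
```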