 #if defined(__i386__) || defined(__x86_64__)
 
 #include <array>
+#include <cstring>
 #include <limits>
 #include <type_traits>
 
@@ -332,11 +333,10 @@ template <> struct ConvertToFloat<8> {
     // from float
     static __m256 load (const float* ptr) { return _mm256_loadu_ps (ptr); }
     static __m256 load (mask_t m, const float* ptr) {
-        // AVX2 doesn't have native masked load, so we load and then blend
-        auto data = _mm256_loadu_ps (ptr);
-        auto zero = _mm256_setzero_ps ();
-        auto mask_vec = create_blend_mask_avx2 (m);
-        return _mm256_blendv_ps (zero, data, mask_vec);
+        // A full-width load followed by a blend can read past the end of the buffer (SEGV),
+        // so use _mm256_maskload_ps, which performs the masked load safely.
+        auto mask_vec = _mm256_castps_si256 (create_blend_mask_avx2 (m));
+        return _mm256_maskload_ps (ptr, mask_vec);
     }
 
     // from float16
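
Note on the float path: _mm256_maskload_ps only touches the memory of lanes whose 32-bit mask element has its most significant bit set, so the same all-ones/all-zeros lane mask that drove _mm256_blendv_ps also works here once it is reinterpreted as __m256i. create_blend_mask_avx2 itself is not shown in this diff; the following is only a hypothetical sketch of a helper with the required behaviour, assuming mask_t packs one bit per float lane:

#include <immintrin.h>
#include <cstdint>

using mask_t = uint8_t; // assumption: bit i of m corresponds to float lane i

static inline __m256 create_blend_mask_avx2_sketch (mask_t m) {
    // Lane i holds the value 1 << i (_mm256_set_epi32 takes elements e7..e0).
    const __m256i lane_bits = _mm256_set_epi32 (128, 64, 32, 16, 8, 4, 2, 1);
    const __m256i bcast     = _mm256_set1_epi32 (m);
    // A lane becomes all-ones (MSB set) exactly when its bit is present in m.
    const __m256i selected  = _mm256_cmpeq_epi32 (_mm256_and_si256 (bcast, lane_bits), lane_bits);
    return _mm256_castsi256_ps (selected);
}
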
@@ -345,10 +345,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load (mask_t m, const Float16* ptr) {
-        auto data = _mm256_cvtph_ps (_mm_loadu_si128 (reinterpret_cast<const __m128i*>(ptr)));
-        auto zero = _mm256_setzero_ps ();
-        auto mask_vec = create_blend_mask_avx2 (m);
-        return _mm256_blendv_ps (zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        __m128i buffer = _mm_setzero_si128 ();
+        std::memcpy (&buffer, ptr, __builtin_popcount (m) * sizeof (Float16));
+        return _mm256_cvtph_ps (buffer);
     }
 
     // from uint8
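
The __builtin_popcount (m) byte count only covers the right elements if the set bits of m form a contiguous run starting at bit 0, i.e. the mask describes a loop tail rather than an arbitrary lane selection. Under that assumption the pattern reduces to "copy the first n elements into a zeroed register", sketched here with uint16_t standing in for Float16 (hypothetical names, not from the file):

#include <immintrin.h>
#include <cstdint>
#include <cstring>

// Load the first n (1..8) half-precision values, zero-filling the remaining lanes.
static inline __m256 load_f16_tail_sketch (const uint16_t* ptr, int n) {
    __m128i buffer = _mm_setzero_si128 ();              // inactive lanes stay 0
    std::memcpy (&buffer, ptr, n * sizeof (uint16_t));  // copy only the valid prefix
    return _mm256_cvtph_ps (buffer);                    // F16C: 8 halves -> 8 floats
}
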
@@ -359,12 +359,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load (mask_t m, const uint8_t* ptr) {
-        auto data = _mm256_cvtepi32_ps (_mm256_cvtepu8_epi32 (
-            _mm_cvtsi64_si128 (*(reinterpret_cast<const int64_t*>(ptr)))
-        ));
-        auto zero = _mm256_setzero_ps ();
-        auto mask_vec = create_blend_mask_avx2 (m);
-        return _mm256_blendv_ps (zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        int64_t buffer = 0;
+        std::memcpy (&buffer, ptr, __builtin_popcount (m) * sizeof (uint8_t));
+        return _mm256_cvtepi32_ps (_mm256_cvtepu8_epi32 (_mm_cvtsi64_si128 (buffer)));
     }
 
     // from int8
@@ -375,12 +373,10 @@ template <> struct ConvertToFloat<8> {
     }
 
     static __m256 load (mask_t m, const int8_t* ptr) {
-        auto data = _mm256_cvtepi32_ps (_mm256_cvtepi8_epi32 (
-            _mm_cvtsi64_si128 (*(reinterpret_cast<const int64_t*>(ptr)))
-        ));
-        auto zero = _mm256_setzero_ps ();
-        auto mask_vec = create_blend_mask_avx2 (m);
-        return _mm256_blendv_ps (zero, data, mask_vec);
+        // Safe masked load using a temporary buffer to avoid SEGV
+        int64_t buffer = 0;
+        std::memcpy (&buffer, ptr, __builtin_popcount (m) * sizeof (int8_t));
+        return _mm256_cvtepi32_ps (_mm256_cvtepi8_epi32 (_mm_cvtsi64_si128 (buffer)));
     }
 
     // We do not need to treat the left or right-hand differently.
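
The uint8 and int8 overloads share the same buffer trick and differ only in the widening step: _mm256_cvtepu8_epi32 zero-extends while _mm256_cvtepi8_epi32 sign-extends, so a 0xFF byte becomes 255.0f in the unsigned path and -1.0f in the signed one. A hypothetical standalone sketch of the signed path (the names and the n parameter are illustrative, not from the file):

#include <immintrin.h>
#include <cstdint>
#include <cstring>

// Convert the first n (1..8) int8 values to floats, zero-filling the remaining lanes.
static inline __m256 load_i8_tail_sketch (const int8_t* ptr, int n) {
    int64_t buffer = 0;
    std::memcpy (&buffer, ptr, n);                         // at most 8 valid bytes
    const __m128i bytes   = _mm_cvtsi64_si128 (buffer);    // low 8 bytes hold the data
    const __m256i widened = _mm256_cvtepi8_epi32 (bytes);  // sign-extend to 32-bit ints
    return _mm256_cvtepi32_ps (widened);                   // ints -> floats
}
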