From c2810c9b9056dd5aeff2e2ae869e9343ff4d388d Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Thu, 20 Nov 2025 15:03:48 +0000 Subject: [PATCH 01/15] initial avx512 implementation --- src/implementation/algorithm.rs | 32 +++- src/implementation/helpers.rs | 13 ++ src/implementation/x86/avx512.rs | 271 +++++++++++++++++++++++++++++++ src/implementation/x86/mod.rs | 21 ++- 4 files changed, 334 insertions(+), 3 deletions(-) create mode 100644 src/implementation/x86/avx512.rs diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index b0381aa7..55424f5a 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -182,7 +182,10 @@ macro_rules! algorithm_simd { unsafe fn check_block(&mut self, input: SimdInput) { // WORKAROUND // necessary because the for loop is not unrolled on ARM64 - if input.vals.len() == 2 { + if input.vals.len() == 1 { + self.check_bytes(*input.vals.as_ptr()); + self.incomplete = Self::is_incomplete(*input.vals.as_ptr()); + } else if input.vals.len() == 2 { self.check_bytes(*input.vals.as_ptr()); self.check_bytes(*input.vals.as_ptr().add(1)); self.incomplete = Self::is_incomplete(*input.vals.as_ptr().add(1)); @@ -573,3 +576,30 @@ macro_rules! simd_input_256_bit { } }; } + +macro_rules! simd_input_512_bit { + ($(#[$feat:meta])*) => { + #[repr(C)] + struct SimdInput { + vals: [SimdU8Value; 1], + } + + impl SimdInput { + $(#[$feat])* + #[inline] + unsafe fn new(ptr: *const u8) -> Self { + Self { + vals: [ + SimdU8Value::load_from(ptr), + ], + } + } + + $(#[$feat])* + #[inline] + unsafe fn is_ascii(&self) -> bool { + self.vals[0].is_ascii() + } + } + }; +} diff --git a/src/implementation/helpers.rs b/src/implementation/helpers.rs index 6fa0f18b..7796750f 100644 --- a/src/implementation/helpers.rs +++ b/src/implementation/helpers.rs @@ -139,6 +139,10 @@ impl TempSimdChunkA16 { #[allow(dead_code)] // only used if a 256-bit SIMD implementation is used pub(crate) struct TempSimdChunkA32(pub(crate) [u8; SIMD_CHUNK_SIZE]); +#[repr(C, align(64))] +#[allow(dead_code)] // only used if a 256-bit SIMD implementation is used +pub(crate) struct TempSimdChunkA64(pub(crate) [u8; SIMD_CHUNK_SIZE]); + #[allow(dead_code)] // only used if there is a SIMD implementation impl TempSimdChunkA32 { #[flexpect::e(clippy::inline_always)] @@ -148,6 +152,15 @@ impl TempSimdChunkA32 { } } +#[allow(dead_code)] // only used if there is a SIMD implementation +impl TempSimdChunkA64 { + #[flexpect::e(clippy::inline_always)] + #[inline(always)] // needs to be forced because otherwise it is not inlined on armv7 neo + pub(crate) const fn new() -> Self { + Self([0; SIMD_CHUNK_SIZE]) + } +} + #[derive(Clone, Copy)] #[allow(dead_code)] // only used if there is a SIMD implementation pub(crate) struct SimdU8Value(pub(crate) T) diff --git a/src/implementation/x86/avx512.rs b/src/implementation/x86/avx512.rs new file mode 100644 index 00000000..a1380ebb --- /dev/null +++ b/src/implementation/x86/avx512.rs @@ -0,0 +1,271 @@ +//! Contains the x86-64 AVX512 UTF-8 validation implementation. + +use core::arch::x86_64::{ + __m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_cmpgt_epi8_mask, _mm512_loadu_si512, + _mm512_maskz_abs_epi8, _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8, + _mm512_set_epi64, _mm512_setzero_si512, _mm512_shuffle_epi8, _mm512_srli_epi16, + _mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, _MM_HINT_T0, +}; +use core::arch::x86_64::{_mm512_movepi8_mask, _mm512_set_epi8}; + +use crate::implementation::helpers::Utf8CheckAlgorithm; + +// AVX 2 SIMD primitives + +type SimdU8Value = crate::implementation::helpers::SimdU8Value<__m512i>; + +impl SimdU8Value { + #[flexpect::e(clippy::cast_possible_wrap)] + #[flexpect::e(clippy::too_many_arguments)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn from_32_cut_off_leading( + v0: u8, + v1: u8, + v2: u8, + v3: u8, + v4: u8, + v5: u8, + v6: u8, + v7: u8, + v8: u8, + v9: u8, + v10: u8, + v11: u8, + v12: u8, + v13: u8, + v14: u8, + v15: u8, + v16: u8, + v17: u8, + v18: u8, + v19: u8, + v20: u8, + v21: u8, + v22: u8, + v23: u8, + v24: u8, + v25: u8, + v26: u8, + v27: u8, + v28: u8, + v29: u8, + v30: u8, + v31: u8, + ) -> Self { + Self::from(_mm512_set_epi8( + v31 as i8, v30 as i8, v29 as i8, v28 as i8, v27 as i8, v26 as i8, v25 as i8, v24 as i8, + v23 as i8, v22 as i8, v21 as i8, v20 as i8, v19 as i8, v18 as i8, v17 as i8, v16 as i8, + v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8, + v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8, + v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, + v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, + v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, + v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, v0 as i8, + )) + } + + #[flexpect::e(clippy::too_many_arguments)] + #[flexpect::e(clippy::cast_possible_wrap)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn repeat_16( + v0: u8, + v1: u8, + v2: u8, + v3: u8, + v4: u8, + v5: u8, + v6: u8, + v7: u8, + v8: u8, + v9: u8, + v10: u8, + v11: u8, + v12: u8, + v13: u8, + v14: u8, + v15: u8, + ) -> Self { + Self::from(_mm512_set_epi8( + v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8, + v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8, + v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8, + v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8, + v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8, + v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8, + v15 as i8, v14 as i8, v13 as i8, v12 as i8, v11 as i8, v10 as i8, v9 as i8, v8 as i8, + v7 as i8, v6 as i8, v5 as i8, v4 as i8, v3 as i8, v2 as i8, v1 as i8, v0 as i8, + )) + } + + #[flexpect::e(clippy::cast_ptr_alignment)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn load_from(ptr: *const u8) -> Self { + Self::from(_mm512_loadu_si512(ptr.cast::<__m512i>())) + } + + #[flexpect::e(clippy::too_many_arguments)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn lookup_16( + self, + v0: u8, + v1: u8, + v2: u8, + v3: u8, + v4: u8, + v5: u8, + v6: u8, + v7: u8, + v8: u8, + v9: u8, + v10: u8, + v11: u8, + v12: u8, + v13: u8, + v14: u8, + v15: u8, + ) -> Self { + Self::from(_mm512_shuffle_epi8( + Self::repeat_16( + v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + ) + .0, + self.0, + )) + } + + #[flexpect::e(clippy::cast_possible_wrap)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn splat(val: u8) -> Self { + Self::from(_mm512_set1_epi8(val as i8)) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn splat0() -> Self { + Self::from(_mm512_setzero_si512()) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn or(self, b: Self) -> Self { + Self::from(_mm512_or_si512(self.0, b.0)) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn and(self, b: Self) -> Self { + Self::from(_mm512_and_si512(self.0, b.0)) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn xor(self, b: Self) -> Self { + Self::from(_mm512_xor_si512(self.0, b.0)) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn saturating_sub(self, b: Self) -> Self { + Self::from(_mm512_subs_epu8(self.0, b.0)) + } + + // ugly but shr requires const generics + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn shr4(self) -> Self { + Self::from(_mm512_srli_epi16(self.0, 4)).and(Self::splat(0xFF >> 4)) + } + + // ugly but prev requires const generics + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn prev1(self, prev: Self) -> Self { + const SHIFT: i32 = 16 - 1; + return Self::from(_mm512_alignr_epi8( + self.0, + _mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0), + SHIFT, + )); + } + // ugly but prev requires const generics + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn prev2(self, prev: Self) -> Self { + const SHIFT: i32 = 16 - 2; + return Self::from(_mm512_alignr_epi8( + self.0, + _mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0), + SHIFT, + )); + } + + // ugly but prev requires const generics + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn prev3(self, prev: Self) -> Self { + const SHIFT: i32 = 16 - 3; + return Self::from(_mm512_alignr_epi8( + self.0, + _mm512_permutex2var_epi64(prev.0, _mm512_set_epi64(13, 12, 11, 10, 9, 8, 7, 6), self.0), + SHIFT, + )); + } + + #[flexpect::e(clippy::cast_possible_wrap)] + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn signed_gt(self, other: Self) -> Self { + Self::from(_mm512_maskz_abs_epi8( + _mm512_cmpgt_epi8_mask(self.0, other.0), + _mm512_set1_epi8(0x80u8 as i8), + )) + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn any_bit_set(self) -> bool { + _mm512_test_epi8_mask(self.0, self.0) != 0 + } + + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn is_ascii(self) -> bool { + _mm512_movepi8_mask(self.0) == 0 + } +} + +impl From<__m512i> for SimdU8Value { + #[inline] + fn from(val: __m512i) -> Self { + Self(val) + } +} + +impl Utf8CheckAlgorithm { + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value { + let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0b1110_0000 - 1)); + let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0b1111_0000 - 1)); + + is_third_byte + .or(is_fourth_byte) + .signed_gt(SimdU8Value::splat0()) + } +} + +#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] +#[inline] +unsafe fn simd_prefetch(ptr: *const u8) { + _mm_prefetch(ptr.cast::(), _MM_HINT_T0); +} + +const PREFETCH: bool = true; +use crate::implementation::helpers::TempSimdChunkA64 as TempSimdChunk; +simd_input_512_bit!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]); +algorithm_simd!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]); diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index 23b7515e..6eaea03b 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -1,6 +1,10 @@ #[cfg(any(feature = "std", feature = "public_imp", target_feature = "avx2"))] pub(crate) mod avx2; +// TODO: require actually necessary AVX-512 features +#[cfg(any(feature = "std", feature = "public_imp", target_feature = "avx2"))] +pub(crate) mod avx512; + #[cfg(any( feature = "public_imp", all(feature = "std", not(target_feature = "avx2")), @@ -28,7 +32,14 @@ pub(crate) unsafe fn validate_utf8_basic( #[inline] fn get_fastest_available_implementation_basic() -> ValidateUtf8Fn { - if std::is_x86_feature_detected!("avx2") { + // Test for avx512vbmi2 to make sure we have a newer CPU with a non-throttling AVX-512 implementation + if std::is_x86_feature_detected!("avx512f") + && std::is_x86_feature_detected!("avx512bw") + && std::is_x86_feature_detected!("avx512vbmi") + && std::is_x86_feature_detected!("avx512vbmi2") + { + avx512::validate_utf8_basic + } else if std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_basic } else if std::is_x86_feature_detected!("sse4.2") { sse42::validate_utf8_basic @@ -124,7 +135,13 @@ pub(crate) unsafe fn validate_utf8_compat( #[inline] fn get_fastest_available_implementation_compat() -> ValidateUtf8CompatFn { - if std::is_x86_feature_detected!("avx2") { + if std::is_x86_feature_detected!("avx512f") + && std::is_x86_feature_detected!("avx512bw") + && std::is_x86_feature_detected!("avx512vbmi") + && std::is_x86_feature_detected!("avx512vbmi2") + { + avx512::validate_utf8_compat + } else if std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_compat } else if std::is_x86_feature_detected!("sse4.2") { sse42::validate_utf8_compat From 69e70ed099e9cd7f82f9a153cd162a0de1bf93ba Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 21 Nov 2025 13:34:15 +0000 Subject: [PATCH 02/15] avx512: use masked load for in complete last block --- src/implementation/algorithm.rs | 52 +++++++++++++++++++++++--------- src/implementation/x86/avx512.rs | 15 ++++++--- 2 files changed, 48 insertions(+), 19 deletions(-) diff --git a/src/implementation/algorithm.rs b/src/implementation/algorithm.rs index 55424f5a..68f7fad1 100644 --- a/src/implementation/algorithm.rs +++ b/src/implementation/algorithm.rs @@ -240,13 +240,7 @@ macro_rules! algorithm_simd { } if idx < len { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input.as_ptr().add(idx), - tmpbuf.0.as_mut_ptr(), - len - idx, - ); - let simd_input = SimdInput::new(tmpbuf.0.as_ptr()); + let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len-idx); algorithm.check_utf8(simd_input); } algorithm.check_incomplete_pending(); @@ -332,14 +326,7 @@ macro_rules! algorithm_simd { break; } if idx < len { - let mut tmpbuf = TempSimdChunk::new(); - crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( - input.as_ptr().add(idx), - tmpbuf.0.as_mut_ptr(), - len - idx, - ); - let simd_input = SimdInput::new(tmpbuf.0.as_ptr()); - + let simd_input = SimdInput::new_partial(input.as_ptr().add(idx), len-idx); algorithm.check_utf8(simd_input); } algorithm.check_incomplete_pending(); @@ -537,6 +524,18 @@ macro_rules! simd_input_128_bit { } } + $(#[$feat])* + #[inline] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = TempSimdChunk::new(); + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( + ptr, + tmpbuf.0.as_mut_ptr(), + len, + ); + Self::new(tmpbuf.0.as_ptr()) + } + $(#[$feat])* #[inline] unsafe fn is_ascii(&self) -> bool { @@ -568,6 +567,18 @@ macro_rules! simd_input_256_bit { } } + $(#[$feat])* + #[inline] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + let mut tmpbuf = TempSimdChunk::new(); + crate::implementation::helpers::memcpy_unaligned_nonoverlapping_inline_opt_lt_64( + ptr, + tmpbuf.0.as_mut_ptr(), + len, + ); + Self::new(tmpbuf.0.as_ptr()) + } + $(#[$feat])* #[inline] unsafe fn is_ascii(&self) -> bool { @@ -595,6 +606,17 @@ macro_rules! simd_input_512_bit { } } + + $(#[$feat])* + #[inline] + unsafe fn new_partial(ptr: *const u8, len: usize) -> Self { + Self { + vals: [ + SimdU8Value::load_from_partial(ptr, len), + ], + } + } + $(#[$feat])* #[inline] unsafe fn is_ascii(&self) -> bool { diff --git a/src/implementation/x86/avx512.rs b/src/implementation/x86/avx512.rs index a1380ebb..84de280f 100644 --- a/src/implementation/x86/avx512.rs +++ b/src/implementation/x86/avx512.rs @@ -2,9 +2,10 @@ use core::arch::x86_64::{ __m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_cmpgt_epi8_mask, _mm512_loadu_si512, - _mm512_maskz_abs_epi8, _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8, - _mm512_set_epi64, _mm512_setzero_si512, _mm512_shuffle_epi8, _mm512_srli_epi16, - _mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, _MM_HINT_T0, + _mm512_maskz_abs_epi8, _mm512_maskz_loadu_epi8, _mm512_or_si512, _mm512_permutex2var_epi64, + _mm512_set1_epi8, _mm512_set_epi64, _mm512_setzero_si512, _mm512_shuffle_epi8, + _mm512_srli_epi16, _mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, + _MM_HINT_T0, }; use core::arch::x86_64::{_mm512_movepi8_mask, _mm512_set_epi8}; @@ -106,6 +107,13 @@ impl SimdU8Value { Self::from(_mm512_loadu_si512(ptr.cast::<__m512i>())) } + #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] + #[inline] + unsafe fn load_from_partial(ptr: *const u8, len: usize) -> Self { + let res = _mm512_maskz_loadu_epi8(u64::MAX >> (64 - len), ptr.cast::()); + Self::from(res) + } + #[flexpect::e(clippy::too_many_arguments)] #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] #[inline] @@ -266,6 +274,5 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = true; -use crate::implementation::helpers::TempSimdChunkA64 as TempSimdChunk; simd_input_512_bit!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]); algorithm_simd!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]); From 9aa36da272f4921eac1d35e54a98706bc62bb143 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 21 Nov 2025 16:20:07 +0000 Subject: [PATCH 03/15] optimize must_be_2_3_continuation as in simdjson to avoid comparison and enable ternary logic optimization. Kudos to @Validark (see https://github.com/simdjson/simdjson/pull/2113) --- src/implementation/x86/avx512.rs | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/src/implementation/x86/avx512.rs b/src/implementation/x86/avx512.rs index 84de280f..a2f90ada 100644 --- a/src/implementation/x86/avx512.rs +++ b/src/implementation/x86/avx512.rs @@ -1,11 +1,10 @@ //! Contains the x86-64 AVX512 UTF-8 validation implementation. use core::arch::x86_64::{ - __m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_cmpgt_epi8_mask, _mm512_loadu_si512, - _mm512_maskz_abs_epi8, _mm512_maskz_loadu_epi8, _mm512_or_si512, _mm512_permutex2var_epi64, - _mm512_set1_epi8, _mm512_set_epi64, _mm512_setzero_si512, _mm512_shuffle_epi8, - _mm512_srli_epi16, _mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, - _MM_HINT_T0, + __m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_loadu_si512, _mm512_maskz_loadu_epi8, + _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8, _mm512_set_epi64, + _mm512_setzero_si512, _mm512_shuffle_epi8, _mm512_srli_epi16, _mm512_subs_epu8, + _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, _MM_HINT_T0, }; use core::arch::x86_64::{_mm512_movepi8_mask, _mm512_set_epi8}; @@ -224,16 +223,6 @@ impl SimdU8Value { )); } - #[flexpect::e(clippy::cast_possible_wrap)] - #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] - #[inline] - unsafe fn signed_gt(self, other: Self) -> Self { - Self::from(_mm512_maskz_abs_epi8( - _mm512_cmpgt_epi8_mask(self.0, other.0), - _mm512_set1_epi8(0x80u8 as i8), - )) - } - #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] #[inline] unsafe fn any_bit_set(self) -> bool { @@ -258,12 +247,9 @@ impl Utf8CheckAlgorithm { #[target_feature(enable = "avx512f,avx512bw,avx512vbmi")] #[inline] unsafe fn must_be_2_3_continuation(prev2: SimdU8Value, prev3: SimdU8Value) -> SimdU8Value { - let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0b1110_0000 - 1)); - let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0b1111_0000 - 1)); - - is_third_byte - .or(is_fourth_byte) - .signed_gt(SimdU8Value::splat0()) + let is_third_byte = prev2.saturating_sub(SimdU8Value::splat(0xe0 - 0x80)); + let is_fourth_byte = prev3.saturating_sub(SimdU8Value::splat(0xf0 - 0x80)); + is_third_byte.or(is_fourth_byte) } } From 50efe66476a3fc96db823e22121d0fbf439dc218 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Fri, 21 Nov 2025 16:55:44 +0000 Subject: [PATCH 04/15] readd TempSimdChunk for public imp --- src/implementation/x86/avx512.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/implementation/x86/avx512.rs b/src/implementation/x86/avx512.rs index a2f90ada..6943e554 100644 --- a/src/implementation/x86/avx512.rs +++ b/src/implementation/x86/avx512.rs @@ -260,5 +260,6 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = true; +use crate::implementation::helpers::TempSimdChunkA64 as TempSimdChunk; simd_input_512_bit!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]); algorithm_simd!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]); From 9b251aee2ed39f362fb7774733f15dda7205c6d9 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Mon, 24 Nov 2025 12:09:16 +0000 Subject: [PATCH 05/15] avx512 compile time detection --- Cargo.toml | 3 +++ build.rs | 17 +++++++++++++++++ src/basic.rs | 10 ++++++++++ src/compat.rs | 5 +++++ 4 files changed, 35 insertions(+) create mode 100644 build.rs diff --git a/Cargo.toml b/Cargo.toml index f3ae8206..eccd5c1a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,3 +53,6 @@ targets = ["aarch64-unknown-linux-gnu", "wasm32-unknown-unknown", "wasm32-wasip1 [dependencies] flexpect = "0.1.1" + +[build-dependencies] +rustversion = "1.0.22" diff --git a/build.rs b/build.rs new file mode 100644 index 00000000..ca6ed851 --- /dev/null +++ b/build.rs @@ -0,0 +1,17 @@ +fn main() { + println!("cargo::rustc-check-cfg=cfg(avx512_stable)"); + // `if rustversion::cfg!(...)` is not supported in older Rust versions + if avx512_stable() { + println!("cargo:rustc-cfg=avx512_stable"); + } +} + +#[rustversion::since(1.89)] +fn avx512_stable() -> bool { + true +} + +#[rustversion::before(1.89)] +fn avx512_stable() -> bool { + false +} diff --git a/src/basic.rs b/src/basic.rs index 23a72951..02623d8b 100644 --- a/src/basic.rs +++ b/src/basic.rs @@ -197,6 +197,16 @@ pub mod imp { /// Includes the x86/x86-64 SIMD implementations. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub mod x86 { + /// Includes the validation implementation for AVX 512-compatible CPUs. + /// + /// Using the provided functionality on CPUs which do not support AVX 512 is undefined + /// behavior and will very likely cause a crash. + #[cfg(avx512_stable)] + pub mod avx512 { + pub use crate::implementation::x86::avx512::validate_utf8_basic as validate_utf8; + pub use crate::implementation::x86::avx512::ChunkedUtf8ValidatorImp; + pub use crate::implementation::x86::avx512::Utf8ValidatorImp; + } /// Includes the validation implementation for AVX 2-compatible CPUs. /// /// Using the provided functionality on CPUs which do not support AVX 2 is undefined diff --git a/src/compat.rs b/src/compat.rs index dadb4378..d9482fc2 100644 --- a/src/compat.rs +++ b/src/compat.rs @@ -105,6 +105,11 @@ pub mod imp { /// Includes the x86/x86-64 SIMD implementations. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub mod x86 { + /// Includes the validation implementation for AVX 512-compatible CPUs. + #[cfg(avx512_stable)] + pub mod avx512 { + pub use crate::implementation::x86::avx512::validate_utf8_compat as validate_utf8; + } /// Includes the validation implementation for AVX 2-compatible CPUs. pub mod avx2 { pub use crate::implementation::x86::avx2::validate_utf8_compat as validate_utf8; From d54cffb8d0c00028ee345f3e66e8d5803f59aae0 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Mon, 24 Nov 2025 12:09:27 +0000 Subject: [PATCH 06/15] avx512 public imp --- src/implementation/x86/avx512.rs | 1 + src/implementation/x86/mod.rs | 293 +++++++++++++++++++++++++++++-- 2 files changed, 278 insertions(+), 16 deletions(-) diff --git a/src/implementation/x86/avx512.rs b/src/implementation/x86/avx512.rs index 6943e554..3cf23f79 100644 --- a/src/implementation/x86/avx512.rs +++ b/src/implementation/x86/avx512.rs @@ -260,6 +260,7 @@ unsafe fn simd_prefetch(ptr: *const u8) { } const PREFETCH: bool = true; +#[cfg(feature = "public_imp")] use crate::implementation::helpers::TempSimdChunkA64 as TempSimdChunk; simd_input_512_bit!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]); algorithm_simd!(#[target_feature(enable = "avx512f,avx512bw,avx512vbmi")]); diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index 6eaea03b..b6384930 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -1,15 +1,69 @@ -#[cfg(any(feature = "std", feature = "public_imp", target_feature = "avx2"))] -pub(crate) mod avx2; - -// TODO: require actually necessary AVX-512 features -#[cfg(any(feature = "std", feature = "public_imp", target_feature = "avx2"))] +#[cfg(all( + avx512_stable, + any( + feature = "public_imp", + // always availabe, except if no-std and no avx512 support + feature = "std", + all( + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + ) +)))] pub(crate) mod avx512; #[cfg(any( feature = "public_imp", - all(feature = "std", not(target_feature = "avx2")), + // std: avx2 is available for auto-selection if no avx512 is present + all( + feature = "std", + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )) + ), + // no-std: no avx512 -> select avx2 all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + target_feature = "avx2" + ) +))] +pub(crate) mod avx2; + +#[cfg(any( + feature = "public_imp", + // std: sse 4.2 is available for auto-selection if no avx512 is present + all( + feature = "std", + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + ), + // no-std: no avx512, no avx2 -> select sse4.2 + all( + not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), target_feature = "sse4.2" ) @@ -18,7 +72,17 @@ pub(crate) mod sse42; // validate_utf8_basic() std: implementation auto-selection -#[cfg(all(feature = "std", not(target_feature = "avx2")))] +#[cfg(all( + feature = "std", + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + not(all(not(avx512_stable), target_feature = "avx2")) +))] #[inline] pub(crate) unsafe fn validate_utf8_basic( input: &[u8], @@ -30,15 +94,30 @@ pub(crate) unsafe fn validate_utf8_basic( type FnRaw = *mut (); type ValidateUtf8Fn = unsafe fn(input: &[u8]) -> Result<(), crate::basic::Utf8Error>; + #[cfg(avx512_stable)] #[inline] - fn get_fastest_available_implementation_basic() -> ValidateUtf8Fn { + fn get_avx512_implementation() -> Option { // Test for avx512vbmi2 to make sure we have a newer CPU with a non-throttling AVX-512 implementation if std::is_x86_feature_detected!("avx512f") && std::is_x86_feature_detected!("avx512bw") && std::is_x86_feature_detected!("avx512vbmi") && std::is_x86_feature_detected!("avx512vbmi2") { - avx512::validate_utf8_basic + return Some(avx512::validate_utf8_basic); + } + None + } + + #[cfg(not(avx512_stable))] + #[inline] + fn get_avx512_implementation() -> Option { + None + } + + #[inline] + fn get_fastest_available_implementation_basic() -> ValidateUtf8Fn { + if let Some(fun) = get_avx512_implementation() { + fun } else if std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_basic } else if std::is_x86_feature_detected!("sse4.2") { @@ -66,7 +145,51 @@ pub(crate) unsafe fn validate_utf8_basic( // validate_utf8_basic() no-std: implementation selection by config -#[cfg(target_feature = "avx2")] +#[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" +))] +pub(crate) unsafe fn validate_utf8_basic( + input: &[u8], +) -> core::result::Result<(), crate::basic::Utf8Error> { + if input.len() < super::helpers::SIMD_CHUNK_SIZE { + return super::validate_utf8_basic_fallback(input); + } + + validate_utf8_basic_avx512(input) +} + +#[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" +))] +#[inline(never)] +unsafe fn validate_utf8_basic_avx512( + input: &[u8], +) -> core::result::Result<(), crate::basic::Utf8Error> { + avx512::validate_utf8_basic(input) +} + +#[cfg(any( + all( + not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + target_feature = "avx2" + ), + all(target_feature = "avx2", feature = "std", not(avx512_stable)) +))] pub(crate) unsafe fn validate_utf8_basic( input: &[u8], ) -> core::result::Result<(), crate::basic::Utf8Error> { @@ -77,7 +200,20 @@ pub(crate) unsafe fn validate_utf8_basic( validate_utf8_basic_avx2(input) } -#[cfg(target_feature = "avx2")] +#[cfg(any( + all( + not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + target_feature = "avx2" + ), + all(target_feature = "avx2", feature = "std", not(avx512_stable)) +))] #[inline(never)] unsafe fn validate_utf8_basic_avx2( input: &[u8], @@ -87,6 +223,13 @@ unsafe fn validate_utf8_basic_avx2( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), target_feature = "sse4.2" ))] @@ -102,6 +245,13 @@ pub(crate) unsafe fn validate_utf8_basic( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), target_feature = "sse4.2" ))] @@ -114,6 +264,13 @@ unsafe fn validate_utf8_basic_sse42( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), not(target_feature = "sse4.2") ))] @@ -121,7 +278,17 @@ pub(crate) use super::validate_utf8_basic_fallback as validate_utf8_basic; // validate_utf8_compat() std: implementation auto-selection -#[cfg(all(feature = "std", not(target_feature = "avx2")))] +#[cfg(all( + feature = "std", + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + not(all(not(avx512_stable), target_feature = "avx2")) +))] #[inline] pub(crate) unsafe fn validate_utf8_compat( input: &[u8], @@ -133,14 +300,30 @@ pub(crate) unsafe fn validate_utf8_compat( type FnRaw = *mut (); type ValidateUtf8CompatFn = unsafe fn(input: &[u8]) -> Result<(), crate::compat::Utf8Error>; + #[cfg(avx512_stable)] #[inline] - fn get_fastest_available_implementation_compat() -> ValidateUtf8CompatFn { + fn get_avx512_implementation() -> Option { + // Test for avx512vbmi2 to make sure we have a newer CPU with a non-throttling AVX-512 implementation if std::is_x86_feature_detected!("avx512f") && std::is_x86_feature_detected!("avx512bw") && std::is_x86_feature_detected!("avx512vbmi") && std::is_x86_feature_detected!("avx512vbmi2") { - avx512::validate_utf8_compat + return Some(avx512::validate_utf8_compat); + } + None + } + + #[cfg(not(avx512_stable))] + #[inline] + fn get_avx512_implementation() -> Option { + None + } + + #[inline] + fn get_fastest_available_implementation_compat() -> ValidateUtf8CompatFn { + if let Some(fun) = get_avx512_implementation() { + fun } else if std::is_x86_feature_detected!("avx2") { avx2::validate_utf8_compat } else if std::is_x86_feature_detected!("sse4.2") { @@ -168,7 +351,51 @@ pub(crate) unsafe fn validate_utf8_compat( // validate_utf8_basic() no-std: implementation selection by config -#[cfg(target_feature = "avx2")] +#[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" +))] +pub(crate) unsafe fn validate_utf8_compat( + input: &[u8], +) -> core::result::Result<(), crate::compat::Utf8Error> { + if input.len() < super::helpers::SIMD_CHUNK_SIZE { + return super::validate_utf8_compat_fallback(input); + } + + validate_utf8_compat_avx512(input) +} + +#[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" +))] +#[inline(never)] +unsafe fn validate_utf8_compat_avx512( + input: &[u8], +) -> core::result::Result<(), crate::compat::Utf8Error> { + avx512::validate_utf8_compat(input) +} + +#[cfg(any( + all( + not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + target_feature = "avx2" + ), + all(target_feature = "avx2", feature = "std", not(avx512_stable)) +))] pub(crate) unsafe fn validate_utf8_compat( input: &[u8], ) -> core::result::Result<(), crate::compat::Utf8Error> { @@ -179,7 +406,20 @@ pub(crate) unsafe fn validate_utf8_compat( validate_utf8_compat_avx2(input) } -#[cfg(target_feature = "avx2")] +#[cfg(any( + all( + not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), + target_feature = "avx2" + ), + all(target_feature = "avx2", feature = "std", not(avx512_stable)) +))] #[inline(never)] unsafe fn validate_utf8_compat_avx2( input: &[u8], @@ -189,6 +429,13 @@ unsafe fn validate_utf8_compat_avx2( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), target_feature = "sse4.2" ))] @@ -204,6 +451,13 @@ pub(crate) unsafe fn validate_utf8_compat( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), target_feature = "sse4.2" ))] @@ -216,6 +470,13 @@ pub(crate) unsafe fn validate_utf8_compat_sse42( #[cfg(all( not(feature = "std"), + not(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + )), not(target_feature = "avx2"), not(target_feature = "sse4.2") ))] From 40ac1a7f26b5dcf6720a2e2521cffe801c78165a Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Mon, 24 Nov 2025 12:17:55 +0000 Subject: [PATCH 07/15] clippy --- src/implementation/x86/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index b6384930..1c396085 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -114,6 +114,7 @@ pub(crate) unsafe fn validate_utf8_basic( None } + #[flexpect::e(clippy::option_if_let_else)] #[inline] fn get_fastest_available_implementation_basic() -> ValidateUtf8Fn { if let Some(fun) = get_avx512_implementation() { @@ -320,6 +321,7 @@ pub(crate) unsafe fn validate_utf8_compat( None } + #[flexpect::e(clippy::option_if_let_else)] #[inline] fn get_fastest_available_implementation_compat() -> ValidateUtf8CompatFn { if let Some(fun) = get_avx512_implementation() { From 08739b82202996abdc171882e24a33e8ed859ef3 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Mon, 24 Nov 2025 12:20:55 +0000 Subject: [PATCH 08/15] x86 (not _64) fix --- src/implementation/x86/avx512.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/implementation/x86/avx512.rs b/src/implementation/x86/avx512.rs index 3cf23f79..f3b69a9e 100644 --- a/src/implementation/x86/avx512.rs +++ b/src/implementation/x86/avx512.rs @@ -1,12 +1,22 @@ //! Contains the x86-64 AVX512 UTF-8 validation implementation. +#[cfg(target_arch = "x86")] +use core::arch::x86::{ + __m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_loadu_si512, _mm512_maskz_loadu_epi8, + _mm512_movepi8_mask, _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8, + _mm512_set_epi64, _mm512_set_epi8, _mm512_setzero_si512, _mm512_shuffle_epi8, + _mm512_srli_epi16, _mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, + _MM_HINT_T0, +}; + +#[cfg(target_arch = "x86_64")] use core::arch::x86_64::{ __m512i, _mm512_alignr_epi8, _mm512_and_si512, _mm512_loadu_si512, _mm512_maskz_loadu_epi8, - _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8, _mm512_set_epi64, - _mm512_setzero_si512, _mm512_shuffle_epi8, _mm512_srli_epi16, _mm512_subs_epu8, - _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, _MM_HINT_T0, + _mm512_movepi8_mask, _mm512_or_si512, _mm512_permutex2var_epi64, _mm512_set1_epi8, + _mm512_set_epi64, _mm512_set_epi8, _mm512_setzero_si512, _mm512_shuffle_epi8, + _mm512_srli_epi16, _mm512_subs_epu8, _mm512_test_epi8_mask, _mm512_xor_si512, _mm_prefetch, + _MM_HINT_T0, }; -use core::arch::x86_64::{_mm512_movepi8_mask, _mm512_set_epi8}; use crate::implementation::helpers::Utf8CheckAlgorithm; From 3743eeb5c31d167bc7abb5faec3d7427ee934af5 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Mon, 24 Nov 2025 12:36:15 +0000 Subject: [PATCH 09/15] fix: compile time implementation selection logic --- src/implementation/x86/mod.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/implementation/x86/mod.rs b/src/implementation/x86/mod.rs index 1c396085..cd279e37 100644 --- a/src/implementation/x86/mod.rs +++ b/src/implementation/x86/mod.rs @@ -15,7 +15,7 @@ pub(crate) mod avx512; #[cfg(any( feature = "public_imp", - // std: avx2 is available for auto-selection if no avx512 is present + // std: sse 4.2 is available for auto-selection unless avx512 is selected at compile time all( feature = "std", not(all( @@ -43,16 +43,16 @@ pub(crate) mod avx2; #[cfg(any( feature = "public_imp", - // std: sse 4.2 is available for auto-selection if no avx512 is present + // std: sse 4.2 is available for auto-selection unless avx512 or avx2 are selected at compile time all( feature = "std", - not(all( + not(any(all( avx512_stable, target_feature = "avx512f", target_feature = "avx512bw", target_feature = "avx512vbmi", target_feature = "avx512vbmi2" - )), + ),all(not(avx512_stable), target_feature = "avx2"))), ), // no-std: no avx512, no avx2 -> select sse4.2 all( From caaffe1021118f633d68e46c85fffb6a9e54ce8a Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Mon, 24 Nov 2025 16:59:05 +0000 Subject: [PATCH 10/15] inlining updates --- inlining/expected-methods-x86-nostd-avx512.txt | 5 +++++ inlining/expected-methods-x86-std-avx512.txt | 5 +++++ inlining/expected-methods-x86-std.txt | 2 ++ 3 files changed, 12 insertions(+) create mode 100644 inlining/expected-methods-x86-nostd-avx512.txt create mode 100644 inlining/expected-methods-x86-std-avx512.txt diff --git a/inlining/expected-methods-x86-nostd-avx512.txt b/inlining/expected-methods-x86-nostd-avx512.txt new file mode 100644 index 00000000..643c0462 --- /dev/null +++ b/inlining/expected-methods-x86-nostd-avx512.txt @@ -0,0 +1,5 @@ +simdutf8::implementation::helpers::get_compat_error +simdutf8::implementation::x86::validate_utf8_basic +simdutf8::implementation::x86::validate_utf8_basic_avx512 +simdutf8::implementation::x86::validate_utf8_compat +simdutf8::implementation::x86::validate_utf8_compat_avx512 diff --git a/inlining/expected-methods-x86-std-avx512.txt b/inlining/expected-methods-x86-std-avx512.txt new file mode 100644 index 00000000..643c0462 --- /dev/null +++ b/inlining/expected-methods-x86-std-avx512.txt @@ -0,0 +1,5 @@ +simdutf8::implementation::helpers::get_compat_error +simdutf8::implementation::x86::validate_utf8_basic +simdutf8::implementation::x86::validate_utf8_basic_avx512 +simdutf8::implementation::x86::validate_utf8_compat +simdutf8::implementation::x86::validate_utf8_compat_avx512 diff --git a/inlining/expected-methods-x86-std.txt b/inlining/expected-methods-x86-std.txt index 4cbccc66..6edecc76 100644 --- a/inlining/expected-methods-x86-std.txt +++ b/inlining/expected-methods-x86-std.txt @@ -3,6 +3,8 @@ simdutf8::implementation::validate_utf8_basic_fallback simdutf8::implementation::validate_utf8_compat_fallback simdutf8::implementation::x86::avx2::validate_utf8_basic simdutf8::implementation::x86::avx2::validate_utf8_compat +simdutf8::implementation::x86::avx512::validate_utf8_basic +simdutf8::implementation::x86::avx512::validate_utf8_compat simdutf8::implementation::x86::sse42::validate_utf8_basic simdutf8::implementation::x86::sse42::validate_utf8_compat simdutf8::implementation::x86::validate_utf8_basic::get_fastest From be6b51d849db3136025cce1ea0c02fed551d634f Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 25 Nov 2025 11:10:30 +0000 Subject: [PATCH 11/15] inlining tests --- .github/workflows/ci.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b7d1862b..c0217f1a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -140,9 +140,17 @@ jobs: run: | ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt "--features public_imp" - RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx2.txt RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-avx2.txt --no-default-features RUSTFLAGS="-C target-feature=+sse4.2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-sse42.txt --no-default-features + - name: Check x86_64 inlining with avx2 autoselection + run: | + RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx2.txt + if: ${{ matrix.toolchain == '1.38.0' }} + - name: Check x86_64 inlining with avx512 autoselection + run: | + RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx512.txt + RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-avx512.txt --no-default-features + if: ${{ matrix.toolchain != '1.38.0' }} - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ matrix.toolchain }} From 6b10e83aa46c5ac77142faa728342fc624b839ab Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 25 Nov 2025 11:17:37 +0000 Subject: [PATCH 12/15] inlining --- inlining/expected-methods-x86-std-old.txt | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 inlining/expected-methods-x86-std-old.txt diff --git a/inlining/expected-methods-x86-std-old.txt b/inlining/expected-methods-x86-std-old.txt new file mode 100644 index 00000000..4cbccc66 --- /dev/null +++ b/inlining/expected-methods-x86-std-old.txt @@ -0,0 +1,9 @@ +simdutf8::implementation::helpers::get_compat_error +simdutf8::implementation::validate_utf8_basic_fallback +simdutf8::implementation::validate_utf8_compat_fallback +simdutf8::implementation::x86::avx2::validate_utf8_basic +simdutf8::implementation::x86::avx2::validate_utf8_compat +simdutf8::implementation::x86::sse42::validate_utf8_basic +simdutf8::implementation::x86::sse42::validate_utf8_compat +simdutf8::implementation::x86::validate_utf8_basic::get_fastest +simdutf8::implementation::x86::validate_utf8_compat::get_fastest From 528713af407fe481bd6a8452c71c7b704500ad29 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 25 Nov 2025 11:20:59 +0000 Subject: [PATCH 13/15] inlining --- .github/workflows/ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c0217f1a..71ae8659 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -138,16 +138,18 @@ jobs: run: cargo +stable install rustfilt - name: Check x86_64 inlining run: | - ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt - ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt "--features public_imp" RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-avx2.txt --no-default-features RUSTFLAGS="-C target-feature=+sse4.2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-sse42.txt --no-default-features - name: Check x86_64 inlining with avx2 autoselection run: | + ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-old.txt + ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-old.txt "--features public_imp" RUSTFLAGS="-C target-feature=+avx2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx2.txt if: ${{ matrix.toolchain == '1.38.0' }} - name: Check x86_64 inlining with avx512 autoselection run: | + ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt + ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std.txt "--features public_imp" RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-std-avx512.txt RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2" ./check-inlining.sh x86_64-unknown-linux-gnu expected-methods-x86-nostd-avx512.txt --no-default-features if: ${{ matrix.toolchain != '1.38.0' }} From 3b27412dbada1a95a05c36cbb3a152cc40a12e68 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 25 Nov 2025 11:29:41 +0000 Subject: [PATCH 14/15] avx512 specific tests --- tests/tests.rs | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/tests.rs b/tests/tests.rs index 082746c9..57431651 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -69,6 +69,23 @@ mod public_imp { #[allow(unused_variables)] // nothing to do if not SIMD implementation is available pub(super) fn test_valid(input: &[u8]) { if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { + #[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + ))] + unsafe { + assert!(simdutf8::basic::imp::x86::avx512::validate_utf8(input).is_ok()); + assert!(simdutf8::compat::imp::x86::avx512::validate_utf8(input).is_ok()); + + test_streaming::(input, true); + test_chunked_streaming::( + input, true, + ); + } + #[cfg(target_feature = "avx2")] unsafe { assert!(simdutf8::basic::imp::x86::avx2::validate_utf8(input).is_ok()); @@ -138,6 +155,24 @@ mod public_imp { #[allow(unused_variables)] // nothing to do if not SIMD implementation is available pub(super) fn test_invalid(input: &[u8], valid_up_to: usize, error_len: Option) { if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { + #[cfg(all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + ))] + unsafe { + assert!(simdutf8::basic::imp::x86::avx512::validate_utf8(input).is_err()); + let err = simdutf8::compat::imp::x86::avx512::validate_utf8(input).unwrap_err(); + assert_eq!(err.valid_up_to(), valid_up_to); + assert_eq!(err.error_len(), error_len); + + test_streaming::(input, false); + test_chunked_streaming::( + input, false, + ); + } #[cfg(target_feature = "avx2")] unsafe { assert!(simdutf8::basic::imp::x86::avx2::validate_utf8(input).is_err()); @@ -268,6 +303,24 @@ mod public_imp { } } + #[test] + #[should_panic] + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + all( + avx512_stable, + target_feature = "avx512f", + target_feature = "avx512bw", + target_feature = "avx512vbmi", + target_feature = "avx512vbmi2" + ) + ))] + fn test_avx2_chunked_panic() { + test_chunked_streaming_with_chunk_size::< + simdutf8::basic::imp::x86::avx512::ChunkedUtf8ValidatorImp, + >(b"abcd", 1, true); + } + #[test] #[should_panic] #[cfg(all( From 61d34b18d94ba56d1ea4b1a20b64e61c421ac183 Mon Sep 17 00:00:00 2001 From: Hans Kratz Date: Tue, 25 Nov 2025 15:24:15 +0000 Subject: [PATCH 15/15] AVX 512 doc --- README.md | 16 ++++++++++------ src/lib.rs | 13 ++++++++----- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index bcdd9d59..ede39012 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,12 @@ This library has been thoroughly tested with sample data as well as fuzzing and ## Features * `basic` API for the fastest validation, optimized for valid UTF-8 * `compat` API as a fully compatible replacement for `std::str::from_utf8()` +* 🆕 AVX 512 support on modern x86/x86-64 CPUs since Rust 1.89 * Supports AVX 2 and SSE 4.2 implementations on x86 and x86-64 * ARM64 (aarch64) SIMD is supported since Rust 1.61 * WASM (wasm32) SIMD is supported * 🆕 armv7 NEON support with the `armv7_neon` feature on nightly Rust -* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCI +* x86-64: Up to 23 times faster than the std library on valid non-ASCII, up to four times faster on ASCII * aarch64: Up to eleven times faster than the std library on valid non-ASCII, up to four times faster on ASCII (Apple Silicon) * Faster than the original simdjson implementation * Selects the fastest implementation at runtime based on CPU support (on x86) @@ -71,14 +72,17 @@ This comes at a slight performance penalty compared to the `basic` API even if t ## Implementation selection ### X86 -The fastest implementation is selected at runtime using the `std::is_x86_feature_detected!` macro, unless the CPU -targeted by the compiler supports the fastest available implementation. -So if you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine, the AVX 2 implementation is selected at -compile-time and runtime selection is disabled. +The fastest implementation is usually selected at runtime using the `std::is_x86_feature_detected!` macro. The AVX 512 +implementation is however only selected if the CPU support the VBMI2 features to avoid throttling happening with CPUs before +Intels Ice Lake microarchitecture. + +If you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine whichs support AVX 512 with Rust 1.89 or later, +the AVX 512 implementation is selected at compile-time and runtime selection is disabled. For no-std support (compiled with `--no-default-features`) the implementation is always selected at compile time based on the targeted CPU. Use `RUSTFLAGS="-C target-feature=+avx2"` for the AVX 2 implementation or `RUSTFLAGS="-C target-feature=+sse4.2"` -for the SSE 4.2 implementation. +for the SSE 4.2 implementation. For AVX 512 use `RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2"` with +Rust 1.89 or later. ### ARM64 The SIMD implementation is used automatically since Rust 1.61. diff --git a/src/lib.rs b/src/lib.rs index 1f05450c..0ef442fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -95,14 +95,17 @@ //! ## Implementation selection //! //! ### X86 -//! The fastest implementation is selected at runtime using the `std::is_x86_feature_detected!` macro, unless the CPU -//! targeted by the compiler supports the fastest available implementation. -//! So if you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine, the AVX 2 implementation is selected at -//! compile-time and runtime selection is disabled. +//! The fastest implementation is usually selected at runtime using the `std::is_x86_feature_detected!` macro. The AVX 512 +//! implementation is however only selected if the CPU support the VBMI2 features to avoid throttling happening with CPUs before +//! Intels Ice Lake microarchitecture. +//! +//! If you compile with `RUSTFLAGS="-C target-cpu=native"` on a recent x86-64 machine whichs support AVX 512 with Rust 1.89 or later, +//! the AVX 512 implementation is selected at compile-time and runtime selection is disabled. //! //! For no-std support (compiled with `--no-default-features`) the implementation is always selected at compile time based on //! the targeted CPU. Use `RUSTFLAGS="-C target-feature=+avx2"` for the AVX 2 implementation or `RUSTFLAGS="-C target-feature=+sse4.2"` -//! for the SSE 4.2 implementation. +//! for the SSE 4.2 implementation. For AVX 512 use `RUSTFLAGS="-C target-feature=+avx512f,+avx512bw,+avx512vbmi,+avx512vbmi2"` with +//! Rust 1.89 or later. //! //! ### ARM64 //! The SIMD implementation is used automatically since Rust 1.61.