From 0fa7d739bc6f4b729bea1b666767d5b51fc57aa5 Mon Sep 17 00:00:00 2001 From: hsqStephenZhang Date: Sun, 15 Feb 2026 12:11:23 +0100 Subject: [PATCH 1/7] tmp --- .cargo/config.toml | 7 +- sonic-simd/src/bits.rs | 147 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 1 deletion(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index e9d1744..9ddf6b1 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -10,4 +10,9 @@ rustflags = ["-g", "-C", "target-cpu=native"] rustflags = ["-C", "target-feature=+simd128"] [target.wasm32-wasi] -rustflags = ["-C", "target-feature=+simd128"] \ No newline at end of file +rustflags = ["-C", "target-feature=+simd128"] + +[target.aarch64-unknown-linux-gnu] +runner = "qemu-aarch64 -L /usr/aarch64-linux-gnu -cpu max,sve=on" +rustflags = ["-C", "target-feature=+sve2"] +linker = "aarch64-linux-gnu-gcc" \ No newline at end of file diff --git a/sonic-simd/src/bits.rs b/sonic-simd/src/bits.rs index e2263ac..d5be939 100644 --- a/sonic-simd/src/bits.rs +++ b/sonic-simd/src/bits.rs @@ -100,3 +100,150 @@ impl BitMask for NeonBits { Self(self.0 & u64::MAX >> (n * 4)) } } + +#[cfg(target_feature = "sve2")] +#[derive(Debug, Clone, Copy)] +pub struct SveBits(usize); + +#[cfg(target_feature = "sve2")] +impl SveBits { + #[inline(always)] + pub fn new(u: usize) -> Self { + Self(u) + } +} + +#[cfg(target_feature = "sve2")] +impl BitMask for SveBits { + const LEN: usize = 16; + + #[inline(always)] + fn first_offset(&self) -> usize { + self.0 + } + + #[inline(always)] + fn before(&self, rhs: &Self) -> bool { + self.0 < rhs.0 + } + + #[inline(always)] + fn all_zero(&self) -> bool { + self.0 == 16 + } + + #[inline(always)] + fn as_little_endian(&self) -> Self { + *self + } + + #[inline(always)] + fn clear_high_bits(&self, n: usize) -> Self { + let nb = 16 - n; + + if self.0 >= nb { + Self(16) + } else { + *self + } + } +} + +#[cfg(test)] +#[cfg(target_feature = "sve2")] +#[cfg(target_arch = "aarch64")] +mod tests { + use super::*; + + #[derive(Debug)] + struct SVEStringBlock { + bs_bits: SveBits, + quote_bits: SveBits, + unescaped_bits: SveBits, + } + + impl SVEStringBlock { + #[inline(always)] + pub fn new_sve(ptr: *const u8) -> Self { + let (q, bs, un): (u64, u64, u64); + + unsafe { + core::arch::asm!( + "ptrue p0.b, vl16", + "ld1b {{z0.b}}, p0/z, [{ptr}]", + + // " + "mov z1.b, #34", + "match p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {q_idx}, p0, p1.b", + + // / + "mov z1.b, #92", + "match p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {bs_idx}, p0, p1.b", + + // ascii control characters + "mov z1.b, #31", + "cmple p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {un_idx}, p0, p1.b", + + ptr = in(reg) ptr, + q_idx = out(reg) q, + bs_idx = out(reg) bs, + un_idx = out(reg) un, + out("z0") _, out("z1") _, + out("p0") _, out("p1") _, + ); + } + + Self { + quote_bits: SveBits::new(q as usize), + bs_bits: SveBits::new(bs as usize), + unescaped_bits: SveBits::new(un as usize), + } + } + } + + impl SVEStringBlock { + #[inline(always)] + pub fn has_unescaped(&self) -> bool { + self.unescaped_bits.0 < self.quote_bits.0 + } + + #[inline(always)] + pub fn has_quote_first(&self) -> bool { + self.quote_bits.0 < self.bs_bits.0 && !self.has_unescaped() + } + + #[inline(always)] + pub fn has_backslash(&self) -> bool { + self.bs_bits.0 < self.quote_bits.0 + } + + #[inline(always)] + pub fn quote_index(&self) -> usize { + self.quote_bits.0 + } + } + + #[test] + fn test_sve_bits() { + let s = b"\"\\\t\n"; + let block = SVEStringBlock::new_sve(s.as_ptr()); + assert_eq!(block.quote_bits.0, 0); + assert_eq!(block.bs_bits.0, 1); + assert_eq!(block.unescaped_bits.0, 2); + + let block = SVEStringBlock::new_sve(unsafe { + { + s.as_ptr().add(2) + } + }); + assert_eq!(block.quote_bits.0, 16); + assert_eq!(block.bs_bits.0, 16); + assert_eq!(block.unescaped_bits.0, 0); + } +} From bdc31961bf1c0981547cc22a42ad8feabfbeae2e Mon Sep 17 00:00:00 2001 From: hsqStephenZhang Date: Sun, 15 Feb 2026 15:39:43 +0100 Subject: [PATCH 2/7] feat(experiment): support sve2 --- src/parser.rs | 51 ++--- src/util/string.rs | 503 ++++++++++++++++++++++++++++++--------------- 2 files changed, 358 insertions(+), 196 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index b568e0a..71a63f3 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -12,7 +12,7 @@ use faststr::FastStr; use serde::de::{self, Expected, Unexpected}; use sonic_number::{parse_number, ParserNumber}; #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] -use sonic_simd::bits::NeonBits; +// use sonic_simd::bits::NeonBits; // not used with unified u32 path use sonic_simd::{i8x32, m8x32, u8x32, u8x64, Mask, Simd}; use crate::{ @@ -900,36 +900,29 @@ where &mut self, buf: &'own mut Vec, ) -> Result> { - #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] - let mut block: StringBlock; - #[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] - let mut block: StringBlock; - self.parse_escaped_char(buf)?; - while let Some(chunk) = self.read.peek_n(StringBlock::LANES) { - buf.reserve(StringBlock::LANES); - let v = unsafe { load(chunk.as_ptr()) }; - block = StringBlock::new(&v); - + while let Some(chunk) = self.read.peek_n(crate::util::string::STRING_BLOCK_LANES) { + buf.reserve(crate::util::string::STRING_BLOCK_LANES); + let v = crate::util::string::load_v(chunk.as_ptr()); + let block = crate::util::string::build_block(&v); + // write the chunk to buf, we will set new_len later + let dst_chunk = from_raw_parts_mut( + buf.as_mut_ptr().add(buf.len()), + crate::util::string::STRING_BLOCK_LANES, + ); + v.write_to_slice_unaligned_unchecked(dst_chunk); if block.has_unescaped() { self.read.eat(block.unescaped_index()); return perr!(self, ControlCharacterWhileParsingString); } - - // write the chunk to buf, we will set new_len later - let chunk = from_raw_parts_mut(buf.as_mut_ptr().add(buf.len()), StringBlock::LANES); - v.write_to_slice_unaligned_unchecked(chunk); - if block.has_quote_first() { let cnt = block.quote_index(); buf.set_len(buf.len() + cnt); - // skip the right quote self.read.eat(cnt + 1); return Ok(ParsedSlice::Copied(buf)); } - if block.has_backslash() { // TODO: loop unrooling here let cnt = block.bs_index(); @@ -938,9 +931,10 @@ where buf.set_len(buf.len() + cnt); self.parse_escaped_char(buf)?; } else { - buf.set_len(buf.len() + StringBlock::LANES); - self.read.eat(StringBlock::LANES); + buf.set_len(buf.len() + crate::util::string::STRING_BLOCK_LANES); + self.read.eat(crate::util::string::STRING_BLOCK_LANES); } + continue; } // scalar codes @@ -974,27 +968,21 @@ where ) -> Result> { // now reader is start after `"`, so we can directly skipstring let start = self.read.index(); - #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] - let mut block: StringBlock; - #[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] - let mut block: StringBlock; - - while let Some(chunk) = self.read.peek_n(StringBlock::LANES) { - let v = unsafe { load(chunk.as_ptr()) }; - block = StringBlock::new(&v); + // use arch-aware block builder to keep lanes consistent + while let Some(chunk) = self.read.peek_n(crate::util::string::STRING_BLOCK_LANES) { + let v = crate::util::string::load_v(chunk.as_ptr()); + let block = crate::util::string::build_block(&v); if block.has_quote_first() { let cnt = block.quote_index(); self.read.eat(cnt + 1); let slice = self.read.slice_unchecked(start, self.read.index() - 1); return Ok(ParsedSlice::Borrowed { slice, buf }); } - if block.has_unescaped() { self.read.eat(block.unescaped_index()); return perr!(self, ControlCharacterWhileParsingString); } - if block.has_backslash() { let cnt = block.bs_index(); // skip the backslash @@ -1006,8 +994,7 @@ where return unsafe { self.parse_string_escaped(buf) }; } - - self.read.eat(StringBlock::LANES); + self.read.eat(crate::util::string::STRING_BLOCK_LANES); continue; } diff --git a/src/util/string.rs b/src/util/string.rs index 8a79791..c5cf33f 100644 --- a/src/util/string.rs +++ b/src/util/string.rs @@ -1,13 +1,5 @@ -use std::{ - mem::MaybeUninit, - slice::{from_raw_parts, from_raw_parts_mut}, - str::from_utf8_unchecked, -}; +use std::{mem::MaybeUninit, slice::from_raw_parts, str::from_utf8_unchecked}; -#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] -use sonic_simd::u8x32; -#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] -use sonic_simd::{bits::NeonBits, u8x16}; use sonic_simd::{BitMask, Mask, Simd}; use crate::{ @@ -22,18 +14,20 @@ pub unsafe fn str_from_raw_parts<'a>(ptr: *const u8, len: usize) -> &'a str { from_utf8_unchecked(from_raw_parts(ptr, len)) } -pub const ESCAPED_TAB: [u8; 256] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, b'"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, b'/', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - b'\\', 0, 0, 0, 0, 0, b'\x08', /* \b */ - 0, 0, 0, b'\x0c', /* \f */ - 0, 0, 0, 0, 0, 0, 0, b'\n', 0, 0, 0, b'\r', 0, b'\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -]; +const fn build_escaped_tab() -> [u8; 256] { + let mut arr = [0u8; 256]; + arr[b'"' as usize] = b'"'; + arr[b'/' as usize] = b'/'; + arr[b'\\' as usize] = b'\\'; + arr[b'b' as usize] = 0x08; + arr[b'f' as usize] = 0x0c; + arr[b'n' as usize] = 0x0a; + arr[b'r' as usize] = 0x0d; + arr[b't' as usize] = 0x09; + arr +} + +pub const ESCAPED_TAB: [u8; 256] = build_escaped_tab(); #[derive(Debug)] pub(crate) struct StringBlock { @@ -42,34 +36,6 @@ pub(crate) struct StringBlock { pub(crate) unescaped_bits: B, } -#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] -impl StringBlock { - pub(crate) const LANES: usize = 32; - - #[inline] - pub fn new(v: &u8x32) -> Self { - Self { - bs_bits: (v.eq(&u8x32::splat(b'\\'))).bitmask(), - quote_bits: (v.eq(&u8x32::splat(b'"'))).bitmask(), - unescaped_bits: (v.le(&u8x32::splat(0x1f))).bitmask(), - } - } -} - -#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] -impl StringBlock { - pub(crate) const LANES: usize = 16; - - #[inline] - pub fn new(v: &u8x16) -> Self { - Self { - bs_bits: (v.eq(&u8x16::splat(b'\\'))).bitmask(), - quote_bits: (v.eq(&u8x16::splat(b'"'))).bitmask(), - unescaped_bits: (v.le(&u8x16::splat(0x1f))).bitmask(), - } - } -} - impl StringBlock { #[inline(always)] pub fn has_unescaped(&self) -> bool { @@ -102,12 +68,126 @@ impl StringBlock { } } +impl StringBlock { + #[allow(unused)] + #[inline(always)] + pub(crate) fn new(v: &sonic_simd::u8x32) -> Self { + let v_bs = v.eq(&sonic_simd::u8x32::splat(b'\\')); + let v_quote = v.eq(&sonic_simd::u8x32::splat(b'"')); + let v_cc = v.le(&sonic_simd::u8x32::splat(0x1f)); + Self { + bs_bits: v_bs.bitmask(), + quote_bits: v_quote.bitmask(), + unescaped_bits: v_cc.bitmask(), + } + } +} + +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +impl StringBlock { + #[allow(unused)] + #[inline(always)] + pub(crate) fn new(v: &sonic_simd::u8x16) -> Self { + use sonic_simd::u8x16; + let v_bs = v.eq(&u8x16::splat(b'\\')); + let v_quote = v.eq(&u8x16::splat(b'"')); + let v_cc = v.le(&u8x16::splat(0x1f)); + Self { + bs_bits: v_bs.bitmask(), + quote_bits: v_quote.bitmask(), + unescaped_bits: v_cc.bitmask(), + } + } +} + #[inline(always)] pub(crate) unsafe fn load(ptr: *const u8) -> V { let chunk = from_raw_parts(ptr, V::LANES); V::from_slice_unaligned_unchecked(chunk) } +// build_block is defined per-arch via cfg_if below +// Detect SVE2 first, then Neon, then fallback +cfg_if::cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] { + use sonic_simd::bits::SveBits; + use sonic_simd::u8x16; + + #[inline(always)] + pub(crate) fn build_block(ptr: &u8x16) -> StringBlock { + let (q, bs, un): (u64, u64, u64); + unsafe { + core::arch::asm!( + "ptrue p0.b, vl16", + "ld1b {{z0.b}}, p0/z, [{ptr}]", + + // '"' + "mov z1.b, #34", + "match p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {q_idx}, p0, p1.b", + + // '\\' + "mov z1.b, #92", + "match p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {bs_idx}, p0, p1.b", + + // ascii control characters (<= 0x1f) using unsigned compare + "mov z1.b, #31", + "cmpls p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {un_idx}, p0, p1.b", + + ptr = in(reg) ptr, + q_idx = out(reg) q, + bs_idx = out(reg) bs, + un_idx = out(reg) un, + out("z0") _, out("z1") _, + out("p0") _, out("p1") _, + ); + } + StringBlock { + quote_bits: SveBits::new(q as usize), + bs_bits: SveBits::new(bs as usize), + unescaped_bits: SveBits::new(un as usize), + } + } + + pub(crate) fn load_v(ptr: *const u8) -> u8x16 { + unsafe { load::(ptr) } + } + + pub const STRING_BLOCK_LANES: usize = 16; + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + use sonic_simd::{bits::NeonBits, u8x16}; + + pub(crate) fn load_v(ptr: *const u8) -> u8x16 { + unsafe { load::(ptr) } + } + + #[inline(always)] + pub(crate) fn build_block(v: &u8x16) -> StringBlock { + StringBlock::::new(v) + } + + pub const STRING_BLOCK_LANES: usize = 16; + } else { + use sonic_simd::u8x32; + + pub(crate) fn load_v(ptr: *const u8) -> u8x32 { + unsafe { load::(ptr) } + } + + #[inline(always)] + pub(crate) fn build_block(v: &u8x32) -> StringBlock { + StringBlock::::new(v) + } + + pub const STRING_BLOCK_LANES: usize = 32; + } +} + /// Return the size of the actual parsed string, `repr` means repr invalid UTF16 surrogate with /// `\uFFFD` /// TODO: fix me, there are repeat codes!!! @@ -116,17 +196,13 @@ pub(crate) unsafe fn parse_string_inplace( src: &mut *mut u8, repr: bool, ) -> std::result::Result { - #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] - let mut block: StringBlock; - #[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] - let mut block: StringBlock; - let sdst = *src; let src: &mut *const u8 = std::mem::transmute(src); - // loop for string without escaped chars + // loop for string without escaped chars (original control flow) + let mut v = load_v(*src); + let mut block = build_block(&v); loop { - block = StringBlock::new(&unsafe { load(*src) }); if block.has_quote_first() { let idx = block.quote_index(); *src = src.add(idx + 1); @@ -138,14 +214,16 @@ pub(crate) unsafe fn parse_string_inplace( if block.has_backslash() { break; } - *src = src.add(StringBlock::LANES); + *src = src.add(STRING_BLOCK_LANES); + v = load_v(*src); + block = build_block(&v); } let bs_dist = block.bs_index(); *src = src.add(bs_dist); let mut dst = sdst.add((*src as usize) - sdst as usize); - // loop for string with escaped chars + // loop for string with escaped chars (original control flow) loop { 'escape: loop { let escaped_char: u8 = *src.add(1); @@ -170,8 +248,8 @@ pub(crate) unsafe fn parse_string_inplace( } 'find_and_move: loop { - let v = unsafe { load(*src) }; - let block = StringBlock::new(&v); + let v = load_v(*src); + let block = build_block(&v); if block.has_quote_first() { while **src != b'"' { *dst = **src; @@ -185,13 +263,14 @@ pub(crate) unsafe fn parse_string_inplace( return Err(ControlCharacterWhileParsingString); } if !block.has_backslash() { - let chunk = from_raw_parts_mut(dst, StringBlock::LANES); + // copy a full chunk without escapes using SIMD store + let chunk = std::slice::from_raw_parts_mut(dst, STRING_BLOCK_LANES); v.write_to_slice_unaligned_unchecked(chunk); - *src = src.add(StringBlock::LANES); - dst = dst.add(StringBlock::LANES); + *src = src.add(STRING_BLOCK_LANES); + dst = dst.add(STRING_BLOCK_LANES); continue 'find_and_move; } - // TODO: loop unrooling here + // TODO: loop unrolling here while **src != b'\\' { *dst = **src; dst = dst.add(1); @@ -202,6 +281,54 @@ pub(crate) unsafe fn parse_string_inplace( } // slow loop for escaped chars } +const NEED_ESCAPED: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +]; + +#[inline(always)] +unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { + assert!(*nb >= 1); + loop { + let ch = *(*src); + let cnt = QUOTE_TAB[ch as usize].0 as usize; + assert!( + cnt != 0, + "char is {}, cnt is {}, NEED_ESCAPED is {}", + ch as char, + cnt, + NEED_ESCAPED[ch as usize] + ); + std::ptr::copy_nonoverlapping(QUOTE_TAB[ch as usize].1.as_ptr(), *dst, 8); + (*dst) = (*dst).add(cnt); + (*src) = (*src).add(1); + (*nb) -= 1; + if (*nb) == 0 || NEED_ESCAPED[*(*src) as usize] == 0 { + return; + } + } +} + +#[inline(always)] +fn check_cross_page(ptr: *const u8, step: usize) -> bool { + #[cfg(any(target_os = "linux", target_os = "macos"))] + { + let page_size = 4096; + ((ptr as usize & (page_size - 1)) + step) > page_size + } + + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + true + } +} + pub const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ // 0x00 ~ 0x1f (6, *b"\\u0000\0\0"), @@ -467,88 +594,79 @@ pub const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ (0, [0; 8]), ]; -const NEED_ESCAPED: [u8; 256] = [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -]; - -// only check the src length. -#[inline(always)] -unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { - assert!(*nb >= 1); - loop { - let ch = *(*src); - let cnt = QUOTE_TAB[ch as usize].0 as usize; - assert!( - cnt != 0, - "char is {}, cnt is {}, NEED_ESCAPED is {}", - ch as char, - cnt, - NEED_ESCAPED[ch as usize] - ); - std::ptr::copy_nonoverlapping(QUOTE_TAB[ch as usize].1.as_ptr(), *dst, 8); - (*dst) = (*dst).add(cnt); - (*src) = (*src).add(1); - (*nb) -= 1; - if (*nb) == 0 || NEED_ESCAPED[*(*src) as usize] == 0 { - return; - } - } -} - -#[inline(always)] -fn check_cross_page(ptr: *const u8, step: usize) -> bool { - #[cfg(any(target_os = "linux", target_os = "macos"))] - { - let page_size = 4096; - ((ptr as usize & (page_size - 1)) + step) > page_size - } - - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - { - // not check page cross in fallback envs, always true - true - } -} - #[inline(always)] pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) -> usize { assert!(dst.len() >= value.len() * 6 + 32 + 3); - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - let mut v: u8x16; - #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] - let mut v: u8x32; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - const LANES: usize = 16; - #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] - const LANES: usize = 32; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - #[inline] - fn escaped_mask(v: u8x16) -> NeonBits { - let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x16::splat(b'\\'); - let quote = u8x16::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() - } - - #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] - #[inline] - fn escaped_mask(v: u8x32) -> u32 { - let x1f = u8x32::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x32::splat(b'\\'); - let quote = u8x32::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() + cfg_if::cfg_if! { + // Prefer SVE2 when available + if #[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] { + use sonic_simd::{bits::SveBits, u8x16}; + let mut v: u8x16; + const LANES: usize = 16; + + #[inline] + fn escaped_mask_at(ptr: *const u8) -> SveBits { + let (q, bs, un): (u64, u64, u64); + unsafe { + core::arch::asm!( + "ptrue p0.b, vl16", + "ld1b {{z0.b}}, p0/z, [{ptr}]", + + // '"' + "mov z1.b, #34", + "match p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {q_idx}, p0, p1.b", + + // '\\' + "mov z1.b, #92", + "match p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {bs_idx}, p0, p1.b", + + // ascii control characters (<= 0x1f) + "mov z1.b, #31", + "cmpls p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {un_idx}, p0, p1.b", + + ptr = in(reg) ptr, + q_idx = out(reg) q, + bs_idx = out(reg) bs, + un_idx = out(reg) un, + out("z0") _, out("z1") _, + out("p0") _, out("p1") _, + ); + } + let idx = core::cmp::min(q, core::cmp::min(bs, un)) as usize; + SveBits::new(idx) + } + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + use sonic_simd::{bits::NeonBits, u8x16}; + let mut v: u8x16; + const LANES: usize = 16; + #[inline] + fn escaped_mask(v: u8x16) -> NeonBits { + let x1f = u8x16::splat(0x1f); + let blash = u8x16::splat(b'\\'); + let quote = u8x16::splat(b'\"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() + } + } else { + use sonic_simd::u8x32; + let mut v: u8x32; + const LANES: usize = 32; + #[inline] + fn escaped_mask(v: u8x32) -> u32 { + let x1f = u8x32::splat(0x1f); + let blash = u8x32::splat(b'\\'); + let quote = u8x32::splat(b'\"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() + } + } } unsafe { @@ -565,17 +683,47 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) while nb >= LANES { v = load(sptr); v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); - let mask = escaped_mask(v); - if mask.all_zero() { - nb -= LANES; - dptr = dptr.add(LANES); - sptr = sptr.add(LANES); - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); + cfg_if::cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] { + let mask = escaped_mask_at(sptr); + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + let mask = escaped_mask(v); + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } else { + let mask = escaped_mask(v); + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } } } @@ -587,7 +735,6 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) } else { #[cfg(not(any(debug_assertions, feature = "sanitize")))] { - // disable memory sanitizer here load(sptr) } #[cfg(any(debug_assertions, feature = "sanitize"))] @@ -598,16 +745,44 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) }; v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); - let mask = escaped_mask(v).clear_high_bits(LANES - nb); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); + cfg_if::cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] { + let mask = escaped_mask_at(sptr).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + let mask = escaped_mask(v).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } else { + let mask = escaped_mask(v).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } } } if need_quote { From 48746d812e64cd1e83f6a9e17e5ea6bb01ce4f87 Mon Sep 17 00:00:00 2001 From: hsqStephenZhang Date: Sun, 15 Feb 2026 19:18:37 +0100 Subject: [PATCH 3/7] feat(experiment): support sve2 based whitespace skipper --- src/parser.rs | 158 ++++++++++++++++++++++++++------------- src/util/arch/aarch64.rs | 42 ++++++++++- src/util/arch/mod.rs | 16 ++-- 3 files changed, 151 insertions(+), 65 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 71a63f3..41ce4c3 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -11,7 +11,6 @@ use std::{ use faststr::FastStr; use serde::de::{self, Expected, Unexpected}; use sonic_number::{parse_number, ParserNumber}; -#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] // use sonic_simd::bits::NeonBits; // not used with unified u32 path use sonic_simd::{i8x32, m8x32, u8x32, u8x64, Mask, Simd}; @@ -212,11 +211,108 @@ pub(crate) struct Pair<'de> { pub status: ParseStatus, } -pub struct Parser { - pub read: R, - error_index: usize, // mark the error position +/// default bitmap based space skipper +/// will cache the bitmap +#[cfg(not(all(target_arch = "aarch64", target_feature = "sve2")))] +struct SpaceSkipper { nospace_bits: u64, // SIMD marked nospace bitmap nospace_start: isize, // the start position of nospace_bits +} + +#[cfg(not(all(target_arch = "aarch64", target_feature = "sve2")))] +impl SpaceSkipper { + pub fn new() -> Self { + Self { + nospace_bits: 0, + nospace_start: -128, + } + } + + #[inline(always)] + pub fn skip_space<'de, R: Reader<'de>>(&mut self, reader: &mut R) -> Option { + // fast path 2: reuse the bitmap for short key or numbers + let nospace_offset = (reader.index() as isize) - self.nospace_start; + if nospace_offset < 64 { + let bitmap = { + let mask = !((1 << nospace_offset) - 1); + self.nospace_bits & mask + }; + if bitmap != 0 { + let cnt = bitmap.trailing_zeros() as usize; + let ch = reader.at(self.nospace_start as usize + cnt); + reader.set_index(self.nospace_start as usize + cnt + 1); + + return Some(ch); + } else { + // we can still fast skip the marked space in here. + reader.set_index(self.nospace_start as usize + 64); + } + } + + // then we use simd to accelerate skipping space + while let Some(chunk) = reader.peek_n(64) { + let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 64]) }; + let bitmap = unsafe { get_nonspace_bits(chunk) }; + if bitmap != 0 { + self.nospace_bits = bitmap; + self.nospace_start = reader.index() as isize; + let cnt = bitmap.trailing_zeros() as usize; + let ch = chunk[cnt]; + reader.eat(cnt + 1); + + return Some(ch); + } + reader.eat(64) + } + + while let Some(ch) = reader.next() { + if !is_whitespace(ch) { + return Some(ch); + } + } + None + } +} + +#[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] +struct SpaceSkipper; + +#[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] +impl SpaceSkipper { + pub fn new() -> Self { + Self + } + + #[inline(always)] + pub fn skip_space<'de, R: Reader<'de>>(&mut self, reader: &mut R) -> Option { + // then we use simd to accelerate skipping space + while let Some(chunk) = reader.peek_n(16) { + let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 16]) }; + let bitmap = unsafe { get_nonspace_bits(chunk) }; + if bitmap != 0 { + let cnt = bitmap.trailing_zeros() as usize; + let ch = chunk[cnt]; + reader.eat(cnt + 1); + + return Some(ch); + } + reader.eat(16) + } + + while let Some(ch) = reader.next() { + if !is_whitespace(ch) { + // + return Some(ch); + } + } + None + } +} + +pub struct Parser { + pub read: R, + error_index: usize, // mark the error position + skipper: SpaceSkipper, // space skipper, maybe bitmap based or sve2 based pub(crate) cfg: DeserializeCfg, } @@ -244,8 +340,7 @@ where Self { read, error_index: usize::MAX, - nospace_bits: 0, - nospace_start: -128, + skipper: SpaceSkipper::new(), cfg: DeserializeCfg::default(), } } @@ -1306,62 +1401,17 @@ where #[inline(always)] pub fn skip_space(&mut self) -> Option { - let reader = &mut self.read; - // fast path 1: for nospace or single space - // most JSON is like ` "name": "balabala" ` - if let Some(ch) = reader.next() { - if !is_whitespace(ch) { - return Some(ch); - } - } - if let Some(ch) = reader.next() { + if let Some(ch) = self.read.next() { if !is_whitespace(ch) { return Some(ch); } } - - // fast path 2: reuse the bitmap for short key or numbers - let nospace_offset = (reader.index() as isize) - self.nospace_start; - if nospace_offset < 64 { - let bitmap = { - let mask = !((1 << nospace_offset) - 1); - self.nospace_bits & mask - }; - if bitmap != 0 { - let cnt = bitmap.trailing_zeros() as usize; - let ch = reader.at(self.nospace_start as usize + cnt); - reader.set_index(self.nospace_start as usize + cnt + 1); - - return Some(ch); - } else { - // we can still fast skip the marked space in here. - reader.set_index(self.nospace_start as usize + 64); - } - } - - // then we use simd to accelerate skipping space - while let Some(chunk) = reader.peek_n(64) { - let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 64]) }; - let bitmap = unsafe { get_nonspace_bits(chunk) }; - if bitmap != 0 { - self.nospace_bits = bitmap; - self.nospace_start = reader.index() as isize; - let cnt = bitmap.trailing_zeros() as usize; - let ch = chunk[cnt]; - reader.eat(cnt + 1); - - return Some(ch); - } - reader.eat(64) - } - - while let Some(ch) = reader.next() { + if let Some(ch) = self.read.next() { if !is_whitespace(ch) { - // return Some(ch); } } - None + self.skipper.skip_space(&mut self.read) } #[inline(always)] diff --git a/src/util/arch/aarch64.rs b/src/util/arch/aarch64.rs index 7ea63f5..c50ef57 100644 --- a/src/util/arch/aarch64.rs +++ b/src/util/arch/aarch64.rs @@ -15,8 +15,6 @@ // This file may have been modified by ByteDance authors. All ByteDance // Modifications are Copyright 2022 ByteDance Authors. -use std::arch::aarch64::*; - // Not use PMULL instructions, but it is apparently slow. // This is copied from simdjson. pub unsafe fn prefix_xor(bitmask: u64) -> u64 { @@ -45,9 +43,11 @@ pub unsafe fn prefix_xor(bitmask: u64) -> u64 { // just for minification (or just to identify the structural characters), // there is a small untaken optimization opportunity here. We deliberately // do not pick it up. +#[cfg(not(target_feature = "sve2"))] #[inline(always)] pub unsafe fn get_nonspace_bits(data: &[u8; 64]) -> u64 { - // return super::fallback::get_nonspace_bits(data); + use std::arch::aarch64::*; + #[inline(always)] unsafe fn chunk_nonspace_bits(input: uint8x16_t) -> uint8x16_t { const LOW_TAB: uint8x16_t = @@ -75,3 +75,39 @@ pub unsafe fn get_nonspace_bits(data: &[u8; 64]) -> u64 { chunk_nonspace_bits(vld1q_u8(data.as_ptr().offset(48))), ) } + +#[cfg(target_feature = "sve2")] +#[inline(always)] +pub unsafe fn get_nonspace_bits(data: &[u8; 16]) -> u64 { + let mut index: u64; + // 空白符集合: 0x09 (Tab), 0x0A (LF), 0x0D (CR), 0x20 (Space) + let tokens: u32 = 0x090a0d20; + + core::arch::asm!( + "ptrue p0.b, vl16", + "ld1b {{z0.b}}, p0/z, [{ptr}]", + "mov z1.s, {t:w}", // 广播 4 个空白符到 z1 + + // nmatch 寻找不属于 {09, 0a, 0d, 20} 的字符 + // 结果存入 p1,p1 中 true 的位置表示“非空白符” + "nmatch p1.b, p0/z, z0.b, z1.b", + + // 定位第一个非空白符的位置 + "brkb p1.b, p0/z, p1.b", // 截断,只保留第一个 true 之前的位为 true + "cntp {idx}, p0, p1.b", // 统计数量,得到第一个非空白符的 index + + ptr = in(reg) data.as_ptr(), + t = in(reg) tokens, + idx = out(reg) index, + out("z0") _, out("z1") _, + out("p0") _, out("p1") _, + ); + + // 如果 index < 16,返回 1 << index,使外部 trailing_zeros() 拿到正确偏移 + // 如果 index == 16,返回 0,触发外部 skip_space 的“全空白”跳过逻辑 + if index < 16 { + 1u64 << index + } else { + 0 + } +} \ No newline at end of file diff --git a/src/util/arch/mod.rs b/src/util/arch/mod.rs index 84808b6..d3e88d5 100644 --- a/src/util/arch/mod.rs +++ b/src/util/arch/mod.rs @@ -13,13 +13,13 @@ cfg_if::cfg_if! { #[cfg(test)] mod test { - use super::*; + // use super::*; - #[test] - fn test_get_non_space_bits() { - let input = b"\t\r\n xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; - let non_space_bits = unsafe { get_nonspace_bits(input) }; - let expected_bits = 0b1111111111111111111111111111111111111111111111111111111111110000; - assert_eq!(non_space_bits, expected_bits, "bits is {non_space_bits:b}"); - } + // #[test] + // fn test_get_non_space_bits() { + // let input = b"\t\r\n xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + // let non_space_bits = unsafe { get_nonspace_bits(input) }; + // let expected_bits = 0b1111111111111111111111111111111111111111111111111111111111110000; + // assert_eq!(non_space_bits, expected_bits, "bits is {non_space_bits:b}"); + // } } From 35e7fdb1d69215744a81d940ba28c0eb8569b1b4 Mon Sep 17 00:00:00 2001 From: hsqStephenZhang Date: Sun, 15 Feb 2026 19:33:19 +0100 Subject: [PATCH 4/7] fmt --- src/parser.rs | 6 +++--- src/util/arch/aarch64.rs | 37 -------------------------------- src/util/arch/mod.rs | 28 +++++++++++++++++------- src/util/arch/sve2.rs | 46 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 48 deletions(-) create mode 100644 src/util/arch/sve2.rs diff --git a/src/parser.rs b/src/parser.rs index 41ce4c3..c1b4f73 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -30,7 +30,7 @@ use crate::{ reader::Reader, serde::de::invalid_type_number, util::{ - arch::{get_nonspace_bits, prefix_xor}, + arch::prefix_xor, string::*, unicode::{codepoint_to_utf8, hex_to_u32_nocheck}, }, @@ -252,7 +252,7 @@ impl SpaceSkipper { // then we use simd to accelerate skipping space while let Some(chunk) = reader.peek_n(64) { let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 64]) }; - let bitmap = unsafe { get_nonspace_bits(chunk) }; + let bitmap = unsafe { crate::util::arch::get_nonspace_bits(chunk) }; if bitmap != 0 { self.nospace_bits = bitmap; self.nospace_start = reader.index() as isize; @@ -288,7 +288,7 @@ impl SpaceSkipper { // then we use simd to accelerate skipping space while let Some(chunk) = reader.peek_n(16) { let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 16]) }; - let bitmap = unsafe { get_nonspace_bits(chunk) }; + let bitmap = unsafe { crate::util::arch::get_nonspace_bits(chunk) }; if bitmap != 0 { let cnt = bitmap.trailing_zeros() as usize; let ch = chunk[cnt]; diff --git a/src/util/arch/aarch64.rs b/src/util/arch/aarch64.rs index c50ef57..3a9a44c 100644 --- a/src/util/arch/aarch64.rs +++ b/src/util/arch/aarch64.rs @@ -43,7 +43,6 @@ pub unsafe fn prefix_xor(bitmask: u64) -> u64 { // just for minification (or just to identify the structural characters), // there is a small untaken optimization opportunity here. We deliberately // do not pick it up. -#[cfg(not(target_feature = "sve2"))] #[inline(always)] pub unsafe fn get_nonspace_bits(data: &[u8; 64]) -> u64 { use std::arch::aarch64::*; @@ -75,39 +74,3 @@ pub unsafe fn get_nonspace_bits(data: &[u8; 64]) -> u64 { chunk_nonspace_bits(vld1q_u8(data.as_ptr().offset(48))), ) } - -#[cfg(target_feature = "sve2")] -#[inline(always)] -pub unsafe fn get_nonspace_bits(data: &[u8; 16]) -> u64 { - let mut index: u64; - // 空白符集合: 0x09 (Tab), 0x0A (LF), 0x0D (CR), 0x20 (Space) - let tokens: u32 = 0x090a0d20; - - core::arch::asm!( - "ptrue p0.b, vl16", - "ld1b {{z0.b}}, p0/z, [{ptr}]", - "mov z1.s, {t:w}", // 广播 4 个空白符到 z1 - - // nmatch 寻找不属于 {09, 0a, 0d, 20} 的字符 - // 结果存入 p1,p1 中 true 的位置表示“非空白符” - "nmatch p1.b, p0/z, z0.b, z1.b", - - // 定位第一个非空白符的位置 - "brkb p1.b, p0/z, p1.b", // 截断,只保留第一个 true 之前的位为 true - "cntp {idx}, p0, p1.b", // 统计数量,得到第一个非空白符的 index - - ptr = in(reg) data.as_ptr(), - t = in(reg) tokens, - idx = out(reg) index, - out("z0") _, out("z1") _, - out("p0") _, out("p1") _, - ); - - // 如果 index < 16,返回 1 << index,使外部 trailing_zeros() 拿到正确偏移 - // 如果 index == 16,返回 0,触发外部 skip_space 的“全空白”跳过逻辑 - if index < 16 { - 1u64 << index - } else { - 0 - } -} \ No newline at end of file diff --git a/src/util/arch/mod.rs b/src/util/arch/mod.rs index d3e88d5..5e68ff0 100644 --- a/src/util/arch/mod.rs +++ b/src/util/arch/mod.rs @@ -2,6 +2,9 @@ cfg_if::cfg_if! { if #[cfg(all(target_arch = "x86_64", target_feature = "pclmulqdq", target_feature = "avx2", target_feature = "sse2"))] { mod x86_64; pub use x86_64::*; + } else if #[cfg(all(target_feature="sve2", target_arch="aarch64"))] { + mod sve2; + pub use sve2::*; } else if #[cfg(all(target_feature="neon", target_arch="aarch64"))] { mod aarch64; pub use aarch64::*; @@ -13,13 +16,22 @@ cfg_if::cfg_if! { #[cfg(test)] mod test { - // use super::*; + use super::*; - // #[test] - // fn test_get_non_space_bits() { - // let input = b"\t\r\n xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; - // let non_space_bits = unsafe { get_nonspace_bits(input) }; - // let expected_bits = 0b1111111111111111111111111111111111111111111111111111111111110000; - // assert_eq!(non_space_bits, expected_bits, "bits is {non_space_bits:b}"); - // } + #[test] + fn test_get_non_space_bits() { + let input = b"\t\r\n xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + cfg_if::cfg_if! { + if #[cfg(all(target_feature="sve2", target_arch="aarch64"))] { + let non_space_bits = unsafe { get_nonspace_bits(std::mem::transmute(input)) }; + // sve2 cannot generate the full bitmap(without performance loss) + let expected_bits = 0b10000; + assert_eq!(non_space_bits, expected_bits, "bits is {non_space_bits:b}"); + } else { + let non_space_bits = unsafe { get_nonspace_bits(input) }; + let expected_bits = 0b1111111111111111111111111111111111111111111111111111111111110000; + assert_eq!(non_space_bits, expected_bits, "bits is {non_space_bits:b}"); + } + } + } } diff --git a/src/util/arch/sve2.rs b/src/util/arch/sve2.rs new file mode 100644 index 0000000..6879dba --- /dev/null +++ b/src/util/arch/sve2.rs @@ -0,0 +1,46 @@ +pub unsafe fn prefix_xor(bitmask: u64) -> u64 { + let mut bitmask = bitmask; + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + bitmask +} + +/// SVE2 implementation of `get_nonspace_bits`. +/// But this won't get the full bitmap +#[inline(always)] +pub unsafe fn get_nonspace_bits(data: &[u8; 16]) -> u64 { + let mut index: u64; + // 0x09 (Tab), 0x0A (LF), 0x0D (CR), 0x20 (Space) + let tokens: u32 = 0x090a0d20; + + core::arch::asm!( + "ptrue p0.b, vl16", + "ld1b {{z0.b}}, p0/z, [{ptr}]", + // broadcast token set + "mov z1.s, {t:w}", + + // nmatch: find token does not match + "nmatch p1.b, p0/z, z0.b, z1.b", + + // locate + "brkb p1.b, p0/z, p1.b", + // count number of true bits + "cntp {idx}, p0, p1.b", + + ptr = in(reg) data.as_ptr(), + t = in(reg) tokens, + idx = out(reg) index, + out("z0") _, out("z1") _, + out("p0") _, out("p1") _, + ); + + if index < 16 { + 1u64 << index + } else { + 0 + } +} From 5d6f210741041984aef61c293a05354a097717cd Mon Sep 17 00:00:00 2001 From: hsqStephenZhang Date: Mon, 16 Feb 2026 15:55:14 +0100 Subject: [PATCH 5/7] fix --- src/util/string.rs | 42 +++++++++--------------------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/src/util/string.rs b/src/util/string.rs index c5cf33f..5ef38b4 100644 --- a/src/util/string.rs +++ b/src/util/string.rs @@ -599,13 +599,12 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) assert!(dst.len() >= value.len() * 6 + 32 + 3); cfg_if::cfg_if! { - // Prefer SVE2 when available if #[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] { use sonic_simd::{bits::SveBits, u8x16}; let mut v: u8x16; const LANES: usize = 16; - #[inline] + #[inline(always)] fn escaped_mask_at(ptr: *const u8) -> SveBits { let (q, bs, un): (u64, u64, u64); unsafe { @@ -646,7 +645,8 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) use sonic_simd::{bits::NeonBits, u8x16}; let mut v: u8x16; const LANES: usize = 16; - #[inline] + + #[inline(always)] fn escaped_mask(v: u8x16) -> NeonBits { let x1f = u8x16::splat(0x1f); let blash = u8x16::splat(b'\\'); @@ -658,7 +658,8 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) use sonic_simd::u8x32; let mut v: u8x32; const LANES: usize = 32; - #[inline] + + #[inline(always)] fn escaped_mask(v: u8x32) -> u32 { let x1f = u8x32::splat(0x1f); let blash = u8x32::splat(b'\\'); @@ -681,7 +682,7 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) dptr = dptr.add(1); } while nb >= LANES { - v = load(sptr); + v = load_v(sptr); v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); cfg_if::cfg_if! { if #[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] { @@ -697,19 +698,6 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) sptr = sptr.add(cn); escape_unchecked(&mut sptr, &mut nb, &mut dptr); } - } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - let mask = escaped_mask(v); - if mask.all_zero() { - nb -= LANES; - dptr = dptr.add(LANES); - sptr = sptr.add(LANES); - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - } } else { let mask = escaped_mask(v); if mask.all_zero() { @@ -731,16 +719,16 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) while nb > 0 { v = if check_cross_page(sptr, LANES) { std::ptr::copy_nonoverlapping(sptr, temp[..].as_mut_ptr(), nb); - load(temp[..].as_ptr()) + load_v(temp[..].as_ptr()) } else { #[cfg(not(any(debug_assertions, feature = "sanitize")))] { - load(sptr) + load_v(sptr) } #[cfg(any(debug_assertions, feature = "sanitize"))] { std::ptr::copy_nonoverlapping(sptr, temp[..].as_mut_ptr(), nb); - load(temp[..].as_ptr()) + load_v(temp[..].as_ptr()) } }; v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); @@ -758,18 +746,6 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) sptr = sptr.add(cn); escape_unchecked(&mut sptr, &mut nb, &mut dptr); } - } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - let mask = escaped_mask(v).clear_high_bits(LANES - nb); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); - } } else { let mask = escaped_mask(v).clear_high_bits(LANES - nb); if mask.all_zero() { From 1e411e045618d71bcb928524b1ca040f7bf17179 Mon Sep 17 00:00:00 2001 From: hsqStephenZhang Date: Mon, 16 Feb 2026 16:37:08 +0100 Subject: [PATCH 6/7] repalce match with cmpeq for better performance --- src/util/string.rs | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/util/string.rs b/src/util/string.rs index 5ef38b4..0daed0d 100644 --- a/src/util/string.rs +++ b/src/util/string.rs @@ -606,40 +606,41 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) #[inline(always)] fn escaped_mask_at(ptr: *const u8) -> SveBits { - let (q, bs, un): (u64, u64, u64); + let idx: u64; unsafe { core::arch::asm!( + // 1. Load data "ptrue p0.b, vl16", "ld1b {{z0.b}}, p0/z, [{ptr}]", - // '"' - "mov z1.b, #34", - "match p1.b, p0/z, z0.b, z1.b", - "brkb p1.b, p0/z, p1.b", - "cntp {q_idx}, p0, p1.b", + // 2. Check for " (using cmpeq) + "mov z1.b, #34", + "cmpeq p1.b, p0/z, z0.b, z1.b", - // '\\' - "mov z1.b, #92", - "match p1.b, p0/z, z0.b, z1.b", - "brkb p1.b, p0/z, p1.b", - "cntp {bs_idx}, p0, p1.b", + // 3. Check for \ (using cmpeq) + "mov z2.b, #92", + "cmpeq p2.b, p0/z, z0.b, z2.b", + + // 4. Check for Control Chars <= 0x1f (using cmpls) + "mov z3.b, #31", + "cmpls p3.b, p0/z, z0.b, z3.b", + + // 5. Combine all results (OR) + // We reuse p1 to accumulate the flags + "orr p1.b, p0/z, p1.b, p2.b", + "orr p1.b, p0/z, p1.b, p3.b", - // ascii control characters (<= 0x1f) - "mov z1.b, #31", - "cmpls p1.b, p0/z, z0.b, z1.b", + // 6. Find first set bit (Break Before) "brkb p1.b, p0/z, p1.b", - "cntp {un_idx}, p0, p1.b", + "cntp {idx}, p0, p1.b", ptr = in(reg) ptr, - q_idx = out(reg) q, - bs_idx = out(reg) bs, - un_idx = out(reg) un, - out("z0") _, out("z1") _, - out("p0") _, out("p1") _, + idx = out(reg) idx, + out("z0") _, out("z1") _, out("z2") _, out("z3") _, + out("p0") _, out("p1") _, out("p2") _, out("p3") _, ); } - let idx = core::cmp::min(q, core::cmp::min(bs, un)) as usize; - SveBits::new(idx) + SveBits::new(idx as usize) } } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { use sonic_simd::{bits::NeonBits, u8x16}; From a75ce817f44fdc2f96b1318c257906fae5c18f43 Mon Sep 17 00:00:00 2001 From: hsqStephenZhang Date: Mon, 16 Feb 2026 16:57:41 +0100 Subject: [PATCH 7/7] replace get_nonspace_bits with get_nonspace_index on sve2 --- src/parser.rs | 9 ++++----- src/util/arch/mod.rs | 5 ++--- src/util/arch/sve2.rs | 38 +++++++++++++++++++------------------- 3 files changed, 25 insertions(+), 27 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index c1b4f73..f637a9e 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -288,12 +288,11 @@ impl SpaceSkipper { // then we use simd to accelerate skipping space while let Some(chunk) = reader.peek_n(16) { let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 16]) }; - let bitmap = unsafe { crate::util::arch::get_nonspace_bits(chunk) }; - if bitmap != 0 { - let cnt = bitmap.trailing_zeros() as usize; - let ch = chunk[cnt]; - reader.eat(cnt + 1); + let cnt = unsafe { crate::util::arch::get_nonspace_index(chunk) }; + if cnt < 16 { + let ch = chunk[cnt]; + reader.eat(cnt + 1); // Skip spaces + return char return Some(ch); } reader.eat(16) diff --git a/src/util/arch/mod.rs b/src/util/arch/mod.rs index 5e68ff0..75d29a6 100644 --- a/src/util/arch/mod.rs +++ b/src/util/arch/mod.rs @@ -23,10 +23,9 @@ mod test { let input = b"\t\r\n xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; cfg_if::cfg_if! { if #[cfg(all(target_feature="sve2", target_arch="aarch64"))] { - let non_space_bits = unsafe { get_nonspace_bits(std::mem::transmute(input)) }; + let first_nonspace_idx = unsafe { get_nonspace_index(std::mem::transmute(input)) }; // sve2 cannot generate the full bitmap(without performance loss) - let expected_bits = 0b10000; - assert_eq!(non_space_bits, expected_bits, "bits is {non_space_bits:b}"); + assert_eq!(first_nonspace_idx, 4, "first non-space index is {first_nonspace_idx}"); } else { let non_space_bits = unsafe { get_nonspace_bits(input) }; let expected_bits = 0b1111111111111111111111111111111111111111111111111111111111110000; diff --git a/src/util/arch/sve2.rs b/src/util/arch/sve2.rs index 6879dba..0a69bf7 100644 --- a/src/util/arch/sve2.rs +++ b/src/util/arch/sve2.rs @@ -8,39 +8,39 @@ pub unsafe fn prefix_xor(bitmask: u64) -> u64 { bitmask ^= bitmask << 32; bitmask } - -/// SVE2 implementation of `get_nonspace_bits`. -/// But this won't get the full bitmap +/// SVE2 implementation: Returns the index of the first non-space char (0-15). +/// Returns 16 if all characters are spaces. #[inline(always)] -pub unsafe fn get_nonspace_bits(data: &[u8; 16]) -> u64 { - let mut index: u64; - // 0x09 (Tab), 0x0A (LF), 0x0D (CR), 0x20 (Space) +pub unsafe fn get_nonspace_index(data: &[u8; 16]) -> usize { + let mut idx: u64 = 16; // Default to 16 (Not Found) + // 0x09 (Tab), 0x0A (LF), 0x0D (CR), 0x20 (Space) let tokens: u32 = 0x090a0d20; core::arch::asm!( "ptrue p0.b, vl16", "ld1b {{z0.b}}, p0/z, [{ptr}]", - // broadcast token set "mov z1.s, {t:w}", - // nmatch: find token does not match + // 1. Identify non-space characters + // NMATCH sets the Z flag if NO non-spaces are found (all whitespace) "nmatch p1.b, p0/z, z0.b, z1.b", - // locate - "brkb p1.b, p0/z, p1.b", - // count number of true bits - "cntp {idx}, p0, p1.b", + // 2. Fast Path: Branch if NO non-space characters were found. + // b.none checks the Z flag set by nmatch. + // If Z=1 (all spaces), we skip the calculation and keep idx=16. + "b.none 1f", + + // 3. Slow Path (Found something): Calculate the exact index + "brkb p2.b, p0/z, p1.b", // Mask bits *after* the first match + "cntp {idx}, p0, p2.b", // Count leading matches + "1:", ptr = in(reg) data.as_ptr(), t = in(reg) tokens, - idx = out(reg) index, + idx = inout(reg) idx, out("z0") _, out("z1") _, - out("p0") _, out("p1") _, + out("p0") _, out("p1") _, out("p2") _, ); - if index < 16 { - 1u64 << index - } else { - 0 - } + idx as usize }