diff --git a/.cargo/config.toml b/.cargo/config.toml index e9d1744..9ddf6b1 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -10,4 +10,9 @@ rustflags = ["-g", "-C", "target-cpu=native"] rustflags = ["-C", "target-feature=+simd128"] [target.wasm32-wasi] -rustflags = ["-C", "target-feature=+simd128"] \ No newline at end of file +rustflags = ["-C", "target-feature=+simd128"] + +[target.aarch64-unknown-linux-gnu] +runner = "qemu-aarch64 -L /usr/aarch64-linux-gnu -cpu max,sve=on" +rustflags = ["-C", "target-feature=+sve2"] +linker = "aarch64-linux-gnu-gcc" \ No newline at end of file diff --git a/sonic-simd/src/bits.rs b/sonic-simd/src/bits.rs index e2263ac..d5be939 100644 --- a/sonic-simd/src/bits.rs +++ b/sonic-simd/src/bits.rs @@ -100,3 +100,150 @@ impl BitMask for NeonBits { Self(self.0 & u64::MAX >> (n * 4)) } } + +#[cfg(target_feature = "sve2")] +#[derive(Debug, Clone, Copy)] +pub struct SveBits(usize); + +#[cfg(target_feature = "sve2")] +impl SveBits { + #[inline(always)] + pub fn new(u: usize) -> Self { + Self(u) + } +} + +#[cfg(target_feature = "sve2")] +impl BitMask for SveBits { + const LEN: usize = 16; + + #[inline(always)] + fn first_offset(&self) -> usize { + self.0 + } + + #[inline(always)] + fn before(&self, rhs: &Self) -> bool { + self.0 < rhs.0 + } + + #[inline(always)] + fn all_zero(&self) -> bool { + self.0 == 16 + } + + #[inline(always)] + fn as_little_endian(&self) -> Self { + *self + } + + #[inline(always)] + fn clear_high_bits(&self, n: usize) -> Self { + let nb = 16 - n; + + if self.0 >= nb { + Self(16) + } else { + *self + } + } +} + +#[cfg(test)] +#[cfg(target_feature = "sve2")] +#[cfg(target_arch = "aarch64")] +mod tests { + use super::*; + + #[derive(Debug)] + struct SVEStringBlock { + bs_bits: SveBits, + quote_bits: SveBits, + unescaped_bits: SveBits, + } + + impl SVEStringBlock { + #[inline(always)] + pub fn new_sve(ptr: *const u8) -> Self { + let (q, bs, un): (u64, u64, u64); + + unsafe { + core::arch::asm!( + "ptrue p0.b, 
vl16", + "ld1b {{z0.b}}, p0/z, [{ptr}]", + + // " + "mov z1.b, #34", + "match p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {q_idx}, p0, p1.b", + + // \ (backslash, 0x5c) + "mov z1.b, #92", + "match p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {bs_idx}, p0, p1.b", + + // ascii control characters (unsigned <= 0x1f; cmpls, not signed cmple, so bytes >= 0x80 are not flagged) + "mov z1.b, #31", + "cmpls p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {un_idx}, p0, p1.b", + + ptr = in(reg) ptr, + q_idx = out(reg) q, + bs_idx = out(reg) bs, + un_idx = out(reg) un, + out("z0") _, out("z1") _, + out("p0") _, out("p1") _, + ); + } + + Self { + quote_bits: SveBits::new(q as usize), + bs_bits: SveBits::new(bs as usize), + unescaped_bits: SveBits::new(un as usize), + } + } + } + + impl SVEStringBlock { + #[inline(always)] + pub fn has_unescaped(&self) -> bool { + self.unescaped_bits.0 < self.quote_bits.0 + } + + #[inline(always)] + pub fn has_quote_first(&self) -> bool { + self.quote_bits.0 < self.bs_bits.0 && !self.has_unescaped() + } + + #[inline(always)] + pub fn has_backslash(&self) -> bool { + self.bs_bits.0 < self.quote_bits.0 + } + + #[inline(always)] + pub fn quote_index(&self) -> usize { + self.quote_bits.0 + } + } + + #[test] + fn test_sve_bits() { + let s = b"\"\\\t\n"; + let block = SVEStringBlock::new_sve(s.as_ptr()); + assert_eq!(block.quote_bits.0, 0); + assert_eq!(block.bs_bits.0, 1); + assert_eq!(block.unescaped_bits.0, 2); + + let block = SVEStringBlock::new_sve(unsafe { + { + s.as_ptr().add(2) + } + }); + assert_eq!(block.quote_bits.0, 16); + assert_eq!(block.bs_bits.0, 16); + assert_eq!(block.unescaped_bits.0, 0); + } +} diff --git a/src/parser.rs b/src/parser.rs index b568e0a..f637a9e 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -11,8 +11,7 @@ use std::{ use faststr::FastStr; use serde::de::{self, Expected, Unexpected}; use sonic_number::{parse_number, ParserNumber}; -#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] -use sonic_simd::bits::NeonBits; +// use sonic_simd::bits::NeonBits; // not 
used with unified u32 path use sonic_simd::{i8x32, m8x32, u8x32, u8x64, Mask, Simd}; use crate::{ @@ -31,7 +30,7 @@ use crate::{ reader::Reader, serde::de::invalid_type_number, util::{ - arch::{get_nonspace_bits, prefix_xor}, + arch::prefix_xor, string::*, unicode::{codepoint_to_utf8, hex_to_u32_nocheck}, }, @@ -212,11 +211,107 @@ pub(crate) struct Pair<'de> { pub status: ParseStatus, } -pub struct Parser { - pub read: R, - error_index: usize, // mark the error position +/// default bitmap based space skipper +/// will cache the bitmap +#[cfg(not(all(target_arch = "aarch64", target_feature = "sve2")))] +struct SpaceSkipper { nospace_bits: u64, // SIMD marked nospace bitmap nospace_start: isize, // the start position of nospace_bits +} + +#[cfg(not(all(target_arch = "aarch64", target_feature = "sve2")))] +impl SpaceSkipper { + pub fn new() -> Self { + Self { + nospace_bits: 0, + nospace_start: -128, + } + } + + #[inline(always)] + pub fn skip_space<'de, R: Reader<'de>>(&mut self, reader: &mut R) -> Option { + // fast path 2: reuse the bitmap for short key or numbers + let nospace_offset = (reader.index() as isize) - self.nospace_start; + if nospace_offset < 64 { + let bitmap = { + let mask = !((1 << nospace_offset) - 1); + self.nospace_bits & mask + }; + if bitmap != 0 { + let cnt = bitmap.trailing_zeros() as usize; + let ch = reader.at(self.nospace_start as usize + cnt); + reader.set_index(self.nospace_start as usize + cnt + 1); + + return Some(ch); + } else { + // we can still fast skip the marked space in here. 
+ reader.set_index(self.nospace_start as usize + 64); + } + } + + // then we use simd to accelerate skipping space + while let Some(chunk) = reader.peek_n(64) { + let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 64]) }; + let bitmap = unsafe { crate::util::arch::get_nonspace_bits(chunk) }; + if bitmap != 0 { + self.nospace_bits = bitmap; + self.nospace_start = reader.index() as isize; + let cnt = bitmap.trailing_zeros() as usize; + let ch = chunk[cnt]; + reader.eat(cnt + 1); + + return Some(ch); + } + reader.eat(64) + } + + while let Some(ch) = reader.next() { + if !is_whitespace(ch) { + return Some(ch); + } + } + None + } +} + +#[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] +struct SpaceSkipper; + +#[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] +impl SpaceSkipper { + pub fn new() -> Self { + Self + } + + #[inline(always)] + pub fn skip_space<'de, R: Reader<'de>>(&mut self, reader: &mut R) -> Option { + // then we use simd to accelerate skipping space + while let Some(chunk) = reader.peek_n(16) { + let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 16]) }; + let cnt = unsafe { crate::util::arch::get_nonspace_index(chunk) }; + + if cnt < 16 { + let ch = chunk[cnt]; + reader.eat(cnt + 1); // Skip spaces + return char + return Some(ch); + } + reader.eat(16) + } + + while let Some(ch) = reader.next() { + if !is_whitespace(ch) { + // + return Some(ch); + } + } + None + } +} + +pub struct Parser { + pub read: R, + error_index: usize, // mark the error position + skipper: SpaceSkipper, // space skipper, maybe bitmap based or sve2 based pub(crate) cfg: DeserializeCfg, } @@ -244,8 +339,7 @@ where Self { read, error_index: usize::MAX, - nospace_bits: 0, - nospace_start: -128, + skipper: SpaceSkipper::new(), cfg: DeserializeCfg::default(), } } @@ -900,36 +994,29 @@ where &mut self, buf: &'own mut Vec, ) -> Result> { - #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] - let mut block: StringBlock; - #[cfg(not(all(target_feature = 
"neon", target_arch = "aarch64")))] - let mut block: StringBlock; - self.parse_escaped_char(buf)?; - while let Some(chunk) = self.read.peek_n(StringBlock::LANES) { - buf.reserve(StringBlock::LANES); - let v = unsafe { load(chunk.as_ptr()) }; - block = StringBlock::new(&v); - + while let Some(chunk) = self.read.peek_n(crate::util::string::STRING_BLOCK_LANES) { + buf.reserve(crate::util::string::STRING_BLOCK_LANES); + let v = crate::util::string::load_v(chunk.as_ptr()); + let block = crate::util::string::build_block(&v); + // write the chunk to buf, we will set new_len later + let dst_chunk = from_raw_parts_mut( + buf.as_mut_ptr().add(buf.len()), + crate::util::string::STRING_BLOCK_LANES, + ); + v.write_to_slice_unaligned_unchecked(dst_chunk); if block.has_unescaped() { self.read.eat(block.unescaped_index()); return perr!(self, ControlCharacterWhileParsingString); } - - // write the chunk to buf, we will set new_len later - let chunk = from_raw_parts_mut(buf.as_mut_ptr().add(buf.len()), StringBlock::LANES); - v.write_to_slice_unaligned_unchecked(chunk); - if block.has_quote_first() { let cnt = block.quote_index(); buf.set_len(buf.len() + cnt); - // skip the right quote self.read.eat(cnt + 1); return Ok(ParsedSlice::Copied(buf)); } - if block.has_backslash() { // TODO: loop unrooling here let cnt = block.bs_index(); @@ -938,9 +1025,10 @@ where buf.set_len(buf.len() + cnt); self.parse_escaped_char(buf)?; } else { - buf.set_len(buf.len() + StringBlock::LANES); - self.read.eat(StringBlock::LANES); + buf.set_len(buf.len() + crate::util::string::STRING_BLOCK_LANES); + self.read.eat(crate::util::string::STRING_BLOCK_LANES); } + continue; } // scalar codes @@ -974,27 +1062,21 @@ where ) -> Result> { // now reader is start after `"`, so we can directly skipstring let start = self.read.index(); - #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] - let mut block: StringBlock; - #[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] - let mut block: 
StringBlock; - - while let Some(chunk) = self.read.peek_n(StringBlock::LANES) { - let v = unsafe { load(chunk.as_ptr()) }; - block = StringBlock::new(&v); + // use arch-aware block builder to keep lanes consistent + while let Some(chunk) = self.read.peek_n(crate::util::string::STRING_BLOCK_LANES) { + let v = crate::util::string::load_v(chunk.as_ptr()); + let block = crate::util::string::build_block(&v); if block.has_quote_first() { let cnt = block.quote_index(); self.read.eat(cnt + 1); let slice = self.read.slice_unchecked(start, self.read.index() - 1); return Ok(ParsedSlice::Borrowed { slice, buf }); } - if block.has_unescaped() { self.read.eat(block.unescaped_index()); return perr!(self, ControlCharacterWhileParsingString); } - if block.has_backslash() { let cnt = block.bs_index(); // skip the backslash @@ -1006,8 +1088,7 @@ where return unsafe { self.parse_string_escaped(buf) }; } - - self.read.eat(StringBlock::LANES); + self.read.eat(crate::util::string::STRING_BLOCK_LANES); continue; } @@ -1319,62 +1400,17 @@ where #[inline(always)] pub fn skip_space(&mut self) -> Option { - let reader = &mut self.read; - // fast path 1: for nospace or single space - // most JSON is like ` "name": "balabala" ` - if let Some(ch) = reader.next() { - if !is_whitespace(ch) { - return Some(ch); - } - } - if let Some(ch) = reader.next() { + if let Some(ch) = self.read.next() { if !is_whitespace(ch) { return Some(ch); } } - - // fast path 2: reuse the bitmap for short key or numbers - let nospace_offset = (reader.index() as isize) - self.nospace_start; - if nospace_offset < 64 { - let bitmap = { - let mask = !((1 << nospace_offset) - 1); - self.nospace_bits & mask - }; - if bitmap != 0 { - let cnt = bitmap.trailing_zeros() as usize; - let ch = reader.at(self.nospace_start as usize + cnt); - reader.set_index(self.nospace_start as usize + cnt + 1); - - return Some(ch); - } else { - // we can still fast skip the marked space in here. 
- reader.set_index(self.nospace_start as usize + 64); - } - } - - // then we use simd to accelerate skipping space - while let Some(chunk) = reader.peek_n(64) { - let chunk = unsafe { &*(chunk.as_ptr() as *const [_; 64]) }; - let bitmap = unsafe { get_nonspace_bits(chunk) }; - if bitmap != 0 { - self.nospace_bits = bitmap; - self.nospace_start = reader.index() as isize; - let cnt = bitmap.trailing_zeros() as usize; - let ch = chunk[cnt]; - reader.eat(cnt + 1); - - return Some(ch); - } - reader.eat(64) - } - - while let Some(ch) = reader.next() { + if let Some(ch) = self.read.next() { if !is_whitespace(ch) { - // return Some(ch); } } - None + self.skipper.skip_space(&mut self.read) } #[inline(always)] diff --git a/src/util/arch/aarch64.rs b/src/util/arch/aarch64.rs index 7ea63f5..3a9a44c 100644 --- a/src/util/arch/aarch64.rs +++ b/src/util/arch/aarch64.rs @@ -15,8 +15,6 @@ // This file may have been modified by ByteDance authors. All ByteDance // Modifications are Copyright 2022 ByteDance Authors. -use std::arch::aarch64::*; - // Not use PMULL instructions, but it is apparently slow. // This is copied from simdjson. pub unsafe fn prefix_xor(bitmask: u64) -> u64 { @@ -47,7 +45,8 @@ pub unsafe fn prefix_xor(bitmask: u64) -> u64 { // do not pick it up. #[inline(always)] pub unsafe fn get_nonspace_bits(data: &[u8; 64]) -> u64 { - // return super::fallback::get_nonspace_bits(data); + use std::arch::aarch64::*; + #[inline(always)] unsafe fn chunk_nonspace_bits(input: uint8x16_t) -> uint8x16_t { const LOW_TAB: uint8x16_t = diff --git a/src/util/arch/mod.rs b/src/util/arch/mod.rs index 84808b6..75d29a6 100644 --- a/src/util/arch/mod.rs +++ b/src/util/arch/mod.rs @@ -2,6 +2,9 @@ cfg_if::cfg_if! 
{ if #[cfg(all(target_arch = "x86_64", target_feature = "pclmulqdq", target_feature = "avx2", target_feature = "sse2"))] { mod x86_64; pub use x86_64::*; + } else if #[cfg(all(target_feature="sve2", target_arch="aarch64"))] { + mod sve2; + pub use sve2::*; } else if #[cfg(all(target_feature="neon", target_arch="aarch64"))] { mod aarch64; pub use aarch64::*; @@ -18,8 +21,16 @@ mod test { #[test] fn test_get_non_space_bits() { let input = b"\t\r\n xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; - let non_space_bits = unsafe { get_nonspace_bits(input) }; - let expected_bits = 0b1111111111111111111111111111111111111111111111111111111111110000; - assert_eq!(non_space_bits, expected_bits, "bits is {non_space_bits:b}"); + cfg_if::cfg_if! { + if #[cfg(all(target_feature="sve2", target_arch="aarch64"))] { + let first_nonspace_idx = unsafe { get_nonspace_index(std::mem::transmute(input)) }; + // sve2 cannot generate the full bitmap(without performance loss) + assert_eq!(first_nonspace_idx, 4, "first non-space index is {first_nonspace_idx}"); + } else { + let non_space_bits = unsafe { get_nonspace_bits(input) }; + let expected_bits = 0b1111111111111111111111111111111111111111111111111111111111110000; + assert_eq!(non_space_bits, expected_bits, "bits is {non_space_bits:b}"); + } + } } } diff --git a/src/util/arch/sve2.rs b/src/util/arch/sve2.rs new file mode 100644 index 0000000..0a69bf7 --- /dev/null +++ b/src/util/arch/sve2.rs @@ -0,0 +1,46 @@ +pub unsafe fn prefix_xor(bitmask: u64) -> u64 { + let mut bitmask = bitmask; + bitmask ^= bitmask << 1; + bitmask ^= bitmask << 2; + bitmask ^= bitmask << 4; + bitmask ^= bitmask << 8; + bitmask ^= bitmask << 16; + bitmask ^= bitmask << 32; + bitmask +} +/// SVE2 implementation: Returns the index of the first non-space char (0-15). +/// Returns 16 if all characters are spaces. 
+#[inline(always)] +pub unsafe fn get_nonspace_index(data: &[u8; 16]) -> usize { + let mut idx: u64 = 16; // Default to 16 (Not Found) + // 0x09 (Tab), 0x0A (LF), 0x0D (CR), 0x20 (Space) + let tokens: u32 = 0x090a0d20; + + core::arch::asm!( + "ptrue p0.b, vl16", + "ld1b {{z0.b}}, p0/z, [{ptr}]", + "mov z1.s, {t:w}", + + // 1. Identify non-space characters + // NMATCH sets the Z flag if NO non-spaces are found (all whitespace) + "nmatch p1.b, p0/z, z0.b, z1.b", + + // 2. Fast Path: Branch if NO non-space characters were found. + // b.none checks the Z flag set by nmatch. + // If Z=1 (all spaces), we skip the calculation and keep idx=16. + "b.none 1f", + + // 3. Slow Path (Found something): Calculate the exact index + "brkb p2.b, p0/z, p1.b", // Mask bits *after* the first match + "cntp {idx}, p0, p2.b", // Count leading matches + + "1:", + ptr = in(reg) data.as_ptr(), + t = in(reg) tokens, + idx = inout(reg) idx, + out("z0") _, out("z1") _, + out("p0") _, out("p1") _, out("p2") _, + ); + + idx as usize +} diff --git a/src/util/string.rs b/src/util/string.rs index 8a79791..0daed0d 100644 --- a/src/util/string.rs +++ b/src/util/string.rs @@ -1,13 +1,5 @@ -use std::{ - mem::MaybeUninit, - slice::{from_raw_parts, from_raw_parts_mut}, - str::from_utf8_unchecked, -}; +use std::{mem::MaybeUninit, slice::from_raw_parts, str::from_utf8_unchecked}; -#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] -use sonic_simd::u8x32; -#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] -use sonic_simd::{bits::NeonBits, u8x16}; use sonic_simd::{BitMask, Mask, Simd}; use crate::{ @@ -22,18 +14,20 @@ pub unsafe fn str_from_raw_parts<'a>(ptr: *const u8, len: usize) -> &'a str { from_utf8_unchecked(from_raw_parts(ptr, len)) } -pub const ESCAPED_TAB: [u8; 256] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, b'"', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, b'/', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - b'\\', 0, 0, 0, 0, 0, b'\x08', /* \b */ - 0, 0, 0, b'\x0c', /* \f */ - 0, 0, 0, 0, 0, 0, 0, b'\n', 0, 0, 0, b'\r', 0, b'\t', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -]; +const fn build_escaped_tab() -> [u8; 256] { + let mut arr = [0u8; 256]; + arr[b'"' as usize] = b'"'; + arr[b'/' as usize] = b'/'; + arr[b'\\' as usize] = b'\\'; + arr[b'b' as usize] = 0x08; + arr[b'f' as usize] = 0x0c; + arr[b'n' as usize] = 0x0a; + arr[b'r' as usize] = 0x0d; + arr[b't' as usize] = 0x09; + arr +} + +pub const ESCAPED_TAB: [u8; 256] = build_escaped_tab(); #[derive(Debug)] pub(crate) struct StringBlock { @@ -42,34 +36,6 @@ pub(crate) struct StringBlock { pub(crate) unescaped_bits: B, } -#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] -impl StringBlock { - pub(crate) const LANES: usize = 32; - - #[inline] - pub fn new(v: &u8x32) -> Self { - Self { - bs_bits: (v.eq(&u8x32::splat(b'\\'))).bitmask(), - quote_bits: (v.eq(&u8x32::splat(b'"'))).bitmask(), - unescaped_bits: (v.le(&u8x32::splat(0x1f))).bitmask(), - } - } -} - -#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] -impl StringBlock { - pub(crate) const LANES: usize = 16; - - #[inline] - pub fn new(v: &u8x16) -> Self { - Self { - bs_bits: (v.eq(&u8x16::splat(b'\\'))).bitmask(), - quote_bits: (v.eq(&u8x16::splat(b'"'))).bitmask(), - unescaped_bits: (v.le(&u8x16::splat(0x1f))).bitmask(), - } - } -} - impl StringBlock { #[inline(always)] pub fn has_unescaped(&self) -> bool { @@ -102,12 +68,126 @@ impl StringBlock { } } +impl StringBlock { + 
#[allow(unused)] + #[inline(always)] + pub(crate) fn new(v: &sonic_simd::u8x32) -> Self { + let v_bs = v.eq(&sonic_simd::u8x32::splat(b'\\')); + let v_quote = v.eq(&sonic_simd::u8x32::splat(b'"')); + let v_cc = v.le(&sonic_simd::u8x32::splat(0x1f)); + Self { + bs_bits: v_bs.bitmask(), + quote_bits: v_quote.bitmask(), + unescaped_bits: v_cc.bitmask(), + } + } +} + +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +impl StringBlock { + #[allow(unused)] + #[inline(always)] + pub(crate) fn new(v: &sonic_simd::u8x16) -> Self { + use sonic_simd::u8x16; + let v_bs = v.eq(&u8x16::splat(b'\\')); + let v_quote = v.eq(&u8x16::splat(b'"')); + let v_cc = v.le(&u8x16::splat(0x1f)); + Self { + bs_bits: v_bs.bitmask(), + quote_bits: v_quote.bitmask(), + unescaped_bits: v_cc.bitmask(), + } + } +} + #[inline(always)] pub(crate) unsafe fn load(ptr: *const u8) -> V { let chunk = from_raw_parts(ptr, V::LANES); V::from_slice_unaligned_unchecked(chunk) } +// build_block is defined per-arch via cfg_if below +// Detect SVE2 first, then Neon, then fallback +cfg_if::cfg_if! 
{ + if #[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] { + use sonic_simd::bits::SveBits; + use sonic_simd::u8x16; + + #[inline(always)] + pub(crate) fn build_block(ptr: &u8x16) -> StringBlock { + let (q, bs, un): (u64, u64, u64); + unsafe { + core::arch::asm!( + "ptrue p0.b, vl16", + "ld1b {{z0.b}}, p0/z, [{ptr}]", + + // '"' + "mov z1.b, #34", + "match p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {q_idx}, p0, p1.b", + + // '\\' + "mov z1.b, #92", + "match p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {bs_idx}, p0, p1.b", + + // ascii control characters (<= 0x1f) using unsigned compare + "mov z1.b, #31", + "cmpls p1.b, p0/z, z0.b, z1.b", + "brkb p1.b, p0/z, p1.b", + "cntp {un_idx}, p0, p1.b", + + ptr = in(reg) ptr, + q_idx = out(reg) q, + bs_idx = out(reg) bs, + un_idx = out(reg) un, + out("z0") _, out("z1") _, + out("p0") _, out("p1") _, + ); + } + StringBlock { + quote_bits: SveBits::new(q as usize), + bs_bits: SveBits::new(bs as usize), + unescaped_bits: SveBits::new(un as usize), + } + } + + pub(crate) fn load_v(ptr: *const u8) -> u8x16 { + unsafe { load::(ptr) } + } + + pub const STRING_BLOCK_LANES: usize = 16; + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + use sonic_simd::{bits::NeonBits, u8x16}; + + pub(crate) fn load_v(ptr: *const u8) -> u8x16 { + unsafe { load::(ptr) } + } + + #[inline(always)] + pub(crate) fn build_block(v: &u8x16) -> StringBlock { + StringBlock::::new(v) + } + + pub const STRING_BLOCK_LANES: usize = 16; + } else { + use sonic_simd::u8x32; + + pub(crate) fn load_v(ptr: *const u8) -> u8x32 { + unsafe { load::(ptr) } + } + + #[inline(always)] + pub(crate) fn build_block(v: &u8x32) -> StringBlock { + StringBlock::::new(v) + } + + pub const STRING_BLOCK_LANES: usize = 32; + } +} + /// Return the size of the actual parsed string, `repr` means repr invalid UTF16 surrogate with /// `\uFFFD` /// TODO: fix me, there are repeat codes!!! 
@@ -116,17 +196,13 @@ pub(crate) unsafe fn parse_string_inplace( src: &mut *mut u8, repr: bool, ) -> std::result::Result { - #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] - let mut block: StringBlock; - #[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))] - let mut block: StringBlock; - let sdst = *src; let src: &mut *const u8 = std::mem::transmute(src); - // loop for string without escaped chars + // loop for string without escaped chars (original control flow) + let mut v = load_v(*src); + let mut block = build_block(&v); loop { - block = StringBlock::new(&unsafe { load(*src) }); if block.has_quote_first() { let idx = block.quote_index(); *src = src.add(idx + 1); @@ -138,14 +214,16 @@ pub(crate) unsafe fn parse_string_inplace( if block.has_backslash() { break; } - *src = src.add(StringBlock::LANES); + *src = src.add(STRING_BLOCK_LANES); + v = load_v(*src); + block = build_block(&v); } let bs_dist = block.bs_index(); *src = src.add(bs_dist); let mut dst = sdst.add((*src as usize) - sdst as usize); - // loop for string with escaped chars + // loop for string with escaped chars (original control flow) loop { 'escape: loop { let escaped_char: u8 = *src.add(1); @@ -170,8 +248,8 @@ pub(crate) unsafe fn parse_string_inplace( } 'find_and_move: loop { - let v = unsafe { load(*src) }; - let block = StringBlock::new(&v); + let v = load_v(*src); + let block = build_block(&v); if block.has_quote_first() { while **src != b'"' { *dst = **src; @@ -185,13 +263,14 @@ pub(crate) unsafe fn parse_string_inplace( return Err(ControlCharacterWhileParsingString); } if !block.has_backslash() { - let chunk = from_raw_parts_mut(dst, StringBlock::LANES); + // copy a full chunk without escapes using SIMD store + let chunk = std::slice::from_raw_parts_mut(dst, STRING_BLOCK_LANES); v.write_to_slice_unaligned_unchecked(chunk); - *src = src.add(StringBlock::LANES); - dst = dst.add(StringBlock::LANES); + *src = src.add(STRING_BLOCK_LANES); + dst = 
dst.add(STRING_BLOCK_LANES); continue 'find_and_move; } - // TODO: loop unrooling here + // TODO: loop unrolling here while **src != b'\\' { *dst = **src; dst = dst.add(1); @@ -202,6 +281,54 @@ pub(crate) unsafe fn parse_string_inplace( } // slow loop for escaped chars } +const NEED_ESCAPED: [u8; 256] = [ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +]; + +#[inline(always)] +unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { + assert!(*nb >= 1); + loop { + let ch = *(*src); + let cnt = QUOTE_TAB[ch as usize].0 as usize; + assert!( + cnt != 0, + "char is {}, cnt is {}, NEED_ESCAPED is {}", + ch as char, + cnt, + NEED_ESCAPED[ch as usize] + ); + std::ptr::copy_nonoverlapping(QUOTE_TAB[ch as usize].1.as_ptr(), *dst, 8); + (*dst) = (*dst).add(cnt); + (*src) = (*src).add(1); + (*nb) -= 1; + if (*nb) == 0 || NEED_ESCAPED[*(*src) as usize] == 0 { + return; + } + } +} + +#[inline(always)] +fn check_cross_page(ptr: *const u8, step: usize) -> bool { + #[cfg(any(target_os = "linux", target_os = "macos"))] + { + let page_size = 4096; + ((ptr as usize & (page_size - 1)) + step) > page_size + } + + #[cfg(not(any(target_os = "linux", target_os = "macos")))] + { + true + } +} + pub const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ 
// 0x00 ~ 0x1f (6, *b"\\u0000\0\0"), @@ -467,88 +594,81 @@ pub const QUOTE_TAB: [(u8, [u8; 8]); 256] = [ (0, [0; 8]), ]; -const NEED_ESCAPED: [u8; 256] = [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -]; - -// only check the src length. -#[inline(always)] -unsafe fn escape_unchecked(src: &mut *const u8, nb: &mut usize, dst: &mut *mut u8) { - assert!(*nb >= 1); - loop { - let ch = *(*src); - let cnt = QUOTE_TAB[ch as usize].0 as usize; - assert!( - cnt != 0, - "char is {}, cnt is {}, NEED_ESCAPED is {}", - ch as char, - cnt, - NEED_ESCAPED[ch as usize] - ); - std::ptr::copy_nonoverlapping(QUOTE_TAB[ch as usize].1.as_ptr(), *dst, 8); - (*dst) = (*dst).add(cnt); - (*src) = (*src).add(1); - (*nb) -= 1; - if (*nb) == 0 || NEED_ESCAPED[*(*src) as usize] == 0 { - return; - } - } -} - -#[inline(always)] -fn check_cross_page(ptr: *const u8, step: usize) -> bool { - #[cfg(any(target_os = "linux", target_os = "macos"))] - { - let page_size = 4096; - ((ptr as usize & (page_size - 1)) + step) > page_size - } - - #[cfg(not(any(target_os = "linux", target_os = "macos")))] - { - // not check page cross in fallback envs, always true - true - } -} - #[inline(always)] pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) -> usize { 
assert!(dst.len() >= value.len() * 6 + 32 + 3); - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - let mut v: u8x16; - #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] - let mut v: u8x32; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - const LANES: usize = 16; - #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] - const LANES: usize = 32; - - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - #[inline] - fn escaped_mask(v: u8x16) -> NeonBits { - let x1f = u8x16::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x16::splat(b'\\'); - let quote = u8x16::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() - } - - #[cfg(not(all(target_arch = "aarch64", target_feature = "neon")))] - #[inline] - fn escaped_mask(v: u8x32) -> u32 { - let x1f = u8x32::splat(0x1f); // 0x00 ~ 0x20 - let blash = u8x32::splat(b'\\'); - let quote = u8x32::splat(b'"'); - let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); - v.bitmask() + cfg_if::cfg_if! { + if #[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] { + use sonic_simd::{bits::SveBits, u8x16}; + let mut v: u8x16; + const LANES: usize = 16; + + #[inline(always)] + fn escaped_mask_at(ptr: *const u8) -> SveBits { + let idx: u64; + unsafe { + core::arch::asm!( + // 1. Load data + "ptrue p0.b, vl16", + "ld1b {{z0.b}}, p0/z, [{ptr}]", + + // 2. Check for " (using cmpeq) + "mov z1.b, #34", + "cmpeq p1.b, p0/z, z0.b, z1.b", + + // 3. Check for \ (using cmpeq) + "mov z2.b, #92", + "cmpeq p2.b, p0/z, z0.b, z2.b", + + // 4. Check for Control Chars <= 0x1f (using cmpls) + "mov z3.b, #31", + "cmpls p3.b, p0/z, z0.b, z3.b", + + // 5. Combine all results (OR) + // We reuse p1 to accumulate the flags + "orr p1.b, p0/z, p1.b, p2.b", + "orr p1.b, p0/z, p1.b, p3.b", + + // 6. 
Find first set bit (Break Before) + "brkb p1.b, p0/z, p1.b", + "cntp {idx}, p0, p1.b", + + ptr = in(reg) ptr, + idx = out(reg) idx, + out("z0") _, out("z1") _, out("z2") _, out("z3") _, + out("p0") _, out("p1") _, out("p2") _, out("p3") _, + ); + } + SveBits::new(idx as usize) + } + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + use sonic_simd::{bits::NeonBits, u8x16}; + let mut v: u8x16; + const LANES: usize = 16; + + #[inline(always)] + fn escaped_mask(v: u8x16) -> NeonBits { + let x1f = u8x16::splat(0x1f); + let blash = u8x16::splat(b'\\'); + let quote = u8x16::splat(b'\"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() + } + } else { + use sonic_simd::u8x32; + let mut v: u8x32; + const LANES: usize = 32; + + #[inline(always)] + fn escaped_mask(v: u8x32) -> u32 { + let x1f = u8x32::splat(0x1f); + let blash = u8x32::splat(b'\\'); + let quote = u8x32::splat(b'\"'); + let v = v.le(&x1f) | v.eq(&blash) | v.eq("e); + v.bitmask() + } + } } unsafe { @@ -563,19 +683,36 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) dptr = dptr.add(1); } while nb >= LANES { - v = load(sptr); + v = load_v(sptr); v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); - let mask = escaped_mask(v); - if mask.all_zero() { - nb -= LANES; - dptr = dptr.add(LANES); - sptr = sptr.add(LANES); - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); + cfg_if::cfg_if! 
{ + if #[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] { + let mask = escaped_mask_at(sptr); + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } else { + let mask = escaped_mask(v); + if mask.all_zero() { + nb -= LANES; + dptr = dptr.add(LANES); + sptr = sptr.add(LANES); + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } } } @@ -583,31 +720,46 @@ pub fn format_string(value: &str, dst: &mut [MaybeUninit], need_quote: bool) while nb > 0 { v = if check_cross_page(sptr, LANES) { std::ptr::copy_nonoverlapping(sptr, temp[..].as_mut_ptr(), nb); - load(temp[..].as_ptr()) + load_v(temp[..].as_ptr()) } else { #[cfg(not(any(debug_assertions, feature = "sanitize")))] { - // disable memory sanitizer here - load(sptr) + load_v(sptr) } #[cfg(any(debug_assertions, feature = "sanitize"))] { std::ptr::copy_nonoverlapping(sptr, temp[..].as_mut_ptr(), nb); - load(temp[..].as_ptr()) + load_v(temp[..].as_ptr()) } }; v.write_to_slice_unaligned_unchecked(std::slice::from_raw_parts_mut(dptr, LANES)); - let mask = escaped_mask(v).clear_high_bits(LANES - nb); - if mask.all_zero() { - dptr = dptr.add(nb); - break; - } else { - let cn = mask.first_offset(); - nb -= cn; - dptr = dptr.add(cn); - sptr = sptr.add(cn); - escape_unchecked(&mut sptr, &mut nb, &mut dptr); + cfg_if::cfg_if! 
{ + if #[cfg(all(target_arch = "aarch64", target_feature = "sve2"))] { + // NOTE(review): when check_cross_page was true, the bytes were copied into `temp` to avoid reading past the page; probe `temp` in that case instead of re-reading 16 bytes from `sptr`, which would reintroduce the cross-page access. + let mask = if check_cross_page(sptr, LANES) { escaped_mask_at(temp[..].as_ptr()) } else { escaped_mask_at(sptr) }.clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } else { + let mask = escaped_mask(v).clear_high_bits(LANES - nb); + if mask.all_zero() { + dptr = dptr.add(nb); + break; + } else { + let cn = mask.first_offset(); + nb -= cn; + dptr = dptr.add(cn); + sptr = sptr.add(cn); + escape_unchecked(&mut sptr, &mut nb, &mut dptr); + } + } + } } if need_quote {