Skip to content

Commit a881c1c

Browse files
committed
Faster escape routines
1 parent 0febc2b commit a881c1c

File tree

2 files changed

+50
-5
lines changed

2 files changed

+50
-5
lines changed

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ document-features = { version = "0.2", optional = true }
1616
encoding_rs = { version = "0.8", optional = true }
1717
serde = { version = "1.0", optional = true }
1818
memchr = "2.5"
19+
jetscii = "0.5.2"
20+
once_cell = "1.12.0"
1921

2022
[dev-dependencies]
2123
criterion = "0.3"

src/escapei.rs

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,20 @@
11
//! Manage xml character escapes
22
3-
use memchr;
43
use std::borrow::Cow;
54
use std::collections::HashMap;
65
use std::ops::Range;
76

7+
use jetscii::bytes;
8+
use memchr;
9+
use once_cell::sync::Lazy;
10+
811
#[cfg(test)]
912
use pretty_assertions::assert_eq;
1013

14+
/// Byte matcher for the full set of XML special characters: `<`, `>`, `&`, `'` and `"`.
///
/// Wrapped in `Lazy`, so the matcher is built by the closure on first access.
static XML_ESCAPE_BYTES: Lazy<jetscii::BytesConst> =
15+
Lazy::new(|| bytes!(b'<', b'>', b'&', b'\'', b'"'));
16+
/// Byte matcher for the reduced set of XML special characters: `<`, `>` and `&`
/// (quotes and apostrophes are not matched).
///
/// Wrapped in `Lazy`, so the matcher is built by the closure on first access.
static XML_PARTIAL_ESCAPE_BYTES: Lazy<jetscii::BytesConst> = Lazy::new(|| bytes!(b'<', b'>', b'&'));
17+
1118
/// Error for XML escape/unescape.
1219
#[derive(Debug)]
1320
pub enum EscapeError {
@@ -73,8 +80,8 @@ pub fn escape(raw: &[u8]) -> Cow<[u8]> {
7380
_ => false,
7481
}
7582
}
76-
77-
_escape(raw, to_escape)
83+
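// NOTE(review): the matcher below looks swapped with the one in `partial_escape`.
// `escape` is the full variant, so it presumably should pass `XML_ESCAPE_BYTES`
// (<, >, &, ', ") rather than the three-byte partial set — confirm and fix.
// Previous implementation: _escape(raw, to_escape)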
84+
simd_escape(raw, &XML_PARTIAL_ESCAPE_BYTES)
7885
}
7986

8087
/// Should only be used for escaping text content. In xml text content, it is allowed
@@ -89,8 +96,8 @@ pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
8996
_ => false,
9097
}
9198
}
92-
93-
_escape(raw, to_escape)
99+
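// NOTE(review): the matcher below looks swapped with the one in `escape`.
// `partial_escape` is meant for text content, so it presumably should pass
// `XML_PARTIAL_ESCAPE_BYTES` (<, >, &) rather than the full five-byte set — confirm and fix.
// Previous implementation: _escape(raw, to_escape)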
100+
simd_escape(raw, &XML_ESCAPE_BYTES)
94101
}
95102

96103
/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
@@ -127,6 +134,42 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
127134
}
128135
}
129136

137+
/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
138+
/// corresponding xml escaped value.
139+
pub fn simd_escape<'a>(raw: &'a [u8], escape_matcher: &jetscii::BytesConst) -> Cow<'a, [u8]> {
140+
let mut escaped = None;
141+
let mut pos = 0;
142+
while let Some(i) = escape_matcher.find(&raw[pos..]) {
143+
if escaped.is_none() {
144+
escaped = Some(Vec::with_capacity(raw.len()));
145+
}
146+
let escaped = escaped.as_mut().expect("initialized");
147+
let new_pos = pos + i;
148+
escaped.extend_from_slice(&raw[pos..new_pos]);
149+
match raw[new_pos] {
150+
b'<' => escaped.extend_from_slice(b"&lt;"),
151+
b'>' => escaped.extend_from_slice(b"&gt;"),
152+
b'\'' => escaped.extend_from_slice(b"&apos;"),
153+
b'&' => escaped.extend_from_slice(b"&amp;"),
154+
b'"' => escaped.extend_from_slice(b"&quot;"),
155+
c @ _ => unreachable!(
156+
"Found {} but only '<', '>', ', '&' and '\"' are escaped",
157+
c as char
158+
),
159+
}
160+
pos = new_pos + 1;
161+
}
162+
163+
if let Some(mut escaped) = escaped {
164+
if let Some(raw) = raw.get(pos..) {
165+
escaped.extend_from_slice(raw);
166+
}
167+
Cow::Owned(escaped)
168+
} else {
169+
Cow::Borrowed(raw)
170+
}
171+
}
172+
130173
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
131174
/// value
132175
pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {

0 commit comments

Comments
 (0)