Skip to content

Commit 4c85384

Browse files
authored
Merge pull request #261 from pchampin/custom-entities
add `*_with_custom_entities` versions of all `unescape_*` methods
2 parents 3a1e02a + 4855fae commit 4c85384

File tree

5 files changed

+470
-57
lines changed

5 files changed

+470
-57
lines changed

examples/custom_entities.rs

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
//! This example demonstrates how custom entities can be extracted from the DOCTYPE,
2+
//! and later used to decode text and attribute values.
3+
//!
4+
//! NB: this example is deliberately kept simple:
5+
//! * it assumes that the XML file is UTF-8 encoded (custom_entities must only contain UTF-8 data)
6+
//! * it only handles internal entities;
7+
//! * the regex in this example is simple but brittle;
8+
//! * it does not support the use of entities in entity declarations.
9+
10+
extern crate quick_xml;
11+
extern crate regex;
12+
13+
use quick_xml::events::Event;
14+
use quick_xml::Reader;
15+
use regex::bytes::Regex;
16+
use std::collections::HashMap;
17+
18+
const DATA: &str = r#"
19+
20+
<?xml version="1.0"?>
21+
<!DOCTYPE test [
22+
<!ENTITY msg "hello world" >
23+
]>
24+
<test label="&msg;">&msg;</test>
25+
26+
"#;
27+
28+
fn main() -> Result<(), Box<dyn std::error::Error>> {
29+
let mut reader = Reader::from_str(DATA);
30+
reader.trim_text(true);
31+
32+
let mut buf = Vec::new();
33+
let mut custom_entities = HashMap::new();
34+
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;
35+
36+
loop {
37+
match reader.read_event(&mut buf) {
38+
Ok(Event::DocType(ref e)) => {
39+
for cap in entity_re.captures_iter(&e) {
40+
custom_entities.insert(cap[1].to_vec(), cap[2].to_vec());
41+
}
42+
}
43+
Ok(Event::Start(ref e)) => match e.name() {
44+
b"test" => println!(
45+
"attributes values: {:?}",
46+
e.attributes()
47+
.map(|a| a
48+
.unwrap()
49+
.unescape_and_decode_value_with_custom_entities(
50+
&reader,
51+
&custom_entities
52+
)
53+
.unwrap())
54+
.collect::<Vec<_>>()
55+
),
56+
_ => (),
57+
},
58+
Ok(Event::Text(ref e)) => {
59+
println!(
60+
"text value: {}",
61+
e.unescape_and_decode_with_custom_entities(&reader, &custom_entities)
62+
.unwrap()
63+
);
64+
}
65+
Ok(Event::Eof) => break,
66+
Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
67+
_ => (),
68+
}
69+
}
70+
Ok(())
71+
}

src/escapei.rs

Lines changed: 72 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
33
use memchr;
44
use std::borrow::Cow;
5+
use std::collections::HashMap;
56

67
#[derive(Debug)]
78
pub enum EscapeError {
@@ -107,6 +108,32 @@ pub fn escape(raw: &[u8]) -> Cow<[u8]> {
107108
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
108109
/// value
109110
pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
111+
do_unescape(raw, None)
112+
}
113+
114+
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
115+
/// value, using a dictionary of custom entities.
116+
///
117+
/// # Pre-condition
118+
///
119+
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
120+
pub fn unescape_with<'a>(
121+
raw: &'a [u8],
122+
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
123+
) -> Result<Cow<'a, [u8]>, EscapeError> {
124+
do_unescape(raw, Some(custom_entities))
125+
}
126+
127+
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
128+
/// value, using an optional dictionary of custom entities.
129+
///
130+
/// # Pre-condition
131+
///
132+
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
133+
pub fn do_unescape<'a>(
134+
raw: &'a [u8],
135+
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
136+
) -> Result<Cow<'a, [u8]>, EscapeError> {
110137
let mut unescaped = None;
111138
let mut last_end = 0;
112139
let mut iter = memchr::memchr2_iter(b'&', b';', raw);
@@ -128,22 +155,27 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
128155
b"amp" => unescaped.push(b'&'),
129156
b"apos" => unescaped.push(b'\''),
130157
b"quot" => unescaped.push(b'\"'),
131-
bytes => {
132-
let code = if bytes.starts_with(b"#x") {
133-
parse_hexadecimal(&bytes[2..])
134-
} else if bytes.starts_with(b"#") {
135-
parse_decimal(&bytes[1..])
158+
bytes if bytes.starts_with(b"#") => {
159+
let bytes = &bytes[1..];
160+
let code = if bytes.starts_with(b"x") {
161+
parse_hexadecimal(&bytes[1..])
136162
} else {
137-
Err(EscapeError::UnrecognizedSymbol(
138-
start + 1..end,
139-
String::from_utf8(bytes.to_vec()),
140-
))
163+
parse_decimal(&bytes)
141164
}?;
142165
if code == 0 {
143166
return Err(EscapeError::EntityWithNull(start..end));
144167
}
145168
push_utf8(unescaped, code);
146169
}
170+
bytes => match custom_entities.and_then(|hm| hm.get(bytes)) {
171+
Some(value) => unescaped.extend_from_slice(&value),
172+
None => {
173+
return Err(EscapeError::UnrecognizedSymbol(
174+
start + 1..end,
175+
String::from_utf8(bytes.to_vec()),
176+
))
177+
}
178+
},
147179
}
148180

149181
#[cfg(feature = "escape-html")]
@@ -5532,22 +5564,27 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
55325564
unescaped.push(b'\x1D');
55335565
unescaped.push(b'\x56');
55345566
}
5535-
bytes => {
5536-
let code = if bytes.starts_with(b"#x") {
5537-
parse_hexadecimal(&bytes[2..])
5538-
} else if bytes.starts_with(b"#") {
5539-
parse_decimal(&bytes[1..])
5567+
bytes if bytes.starts_with(b"#") => {
5568+
let bytes = &bytes[1..];
5569+
let code = if bytes.starts_with(b"x") {
5570+
parse_hexadecimal(&bytes[1..])
55405571
} else {
5541-
Err(EscapeError::UnrecognizedSymbol(
5542-
start + 1..end,
5543-
String::from_utf8(bytes.to_vec()),
5544-
))
5572+
parse_decimal(&bytes)
55455573
}?;
55465574
if code == 0 {
55475575
return Err(EscapeError::EntityWithNull(start..end));
55485576
}
55495577
push_utf8(unescaped, code);
55505578
}
5579+
bytes => match custom_entities.and_then(|hm| hm.get(bytes)) {
5580+
Some(value) => unescaped.extend_from_slice(&value),
5581+
None => {
5582+
return Err(EscapeError::UnrecognizedSymbol(
5583+
start + 1..end,
5584+
String::from_utf8(bytes.to_vec()),
5585+
))
5586+
}
5587+
},
55515588
}
55525589
last_end = end + 1;
55535590
}
@@ -5623,6 +5660,23 @@ fn test_unescape() {
56235660
assert_eq!(&*unescape(b"&lt;test&gt;").unwrap(), b"<test>");
56245661
assert_eq!(&*unescape(b"&#x30;").unwrap(), b"0");
56255662
assert_eq!(&*unescape(b"&#48;").unwrap(), b"0");
5663+
assert!(unescape(b"&foo;").is_err());
5664+
}
5665+
5666+
#[test]
5667+
fn test_unescape_with() {
5668+
let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())]
5669+
.into_iter()
5670+
.collect();
5671+
assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test");
5672+
assert_eq!(
5673+
&*unescape_with(b"&lt;test&gt;", &custom_entities).unwrap(),
5674+
b"<test>"
5675+
);
5676+
assert_eq!(&*unescape_with(b"&#x30;", &custom_entities).unwrap(), b"0");
5677+
assert_eq!(&*unescape_with(b"&#48;", &custom_entities).unwrap(), b"0");
5678+
assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR");
5679+
assert!(unescape_with(b"&fop;", &custom_entities).is_err());
56265680
}
56275681

56285682
#[test]

0 commit comments

Comments
 (0)