From cf623245cc204edfc19845fc3d5cb1cebb19541f Mon Sep 17 00:00:00 2001 From: tompro Date: Sat, 20 Dec 2025 23:19:23 +0100 Subject: [PATCH] Split SyntaxError::UnclosedPIOrXmlDecl into separate enum Variants --- Changelog.md | 4 ++ src/errors.rs | 14 ++++-- src/parser/element.rs | 2 +- src/parser/mod.rs | 6 ++- src/parser/pi.rs | 16 +++++- src/reader/buffered_reader.rs | 2 +- src/reader/slice_reader.rs | 2 +- src/reader/state.rs | 15 ++++-- tests/reader-errors.rs | 95 +++++++++++++++++++++++++++++------ 9 files changed, 129 insertions(+), 27 deletions(-) diff --git a/Changelog.md b/Changelog.md index f25ef68d6..e07e15c88 100644 --- a/Changelog.md +++ b/Changelog.md @@ -29,6 +29,9 @@ struct and can be applied at once. When `serde-types` feature is enabled, config ### Misc Changes +- [#924]: (breaking change) Split `SyntaxError::UnclosedPIOrXmlDecl` into `UnclosedPI` and + `UnclosedXmlDecl` for more precise error reporting. +- [#924]: (breaking change) `Parser::eof_error` now takes `&self` and content `&[u8]` parameters. - [#908]: Increase minimal supported `serde` version from 1.0.139 to 1.0.180. - [#913]: Deprecate `.prefixes()`, `.resolve()`, `.resolve_attribute()`, and `.resolve_element()` of `NsReader`. Use `.resolver().bindings()` and `.resolver().resolve()` methods instead. @@ -37,6 +40,7 @@ struct and can be applied at once. When `serde-types` feature is enabled, config [#846]: https://github.com/tafia/quick-xml/issues/846 [#908]: https://github.com/tafia/quick-xml/pull/908 [#913]: https://github.com/tafia/quick-xml/pull/913 +[#924]: https://github.com/tafia/quick-xml/pull/924 ## 0.38.4 -- 2025-11-11 diff --git a/src/errors.rs b/src/errors.rs index f7c7a8c69..cc69c91aa 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -17,9 +17,12 @@ pub enum SyntaxError { /// The parser started to parse `` sequence was found. - UnclosedPIOrXmlDecl, + UnclosedPI, + /// The parser started to parse XML declaration (`` sequence was found. + UnclosedXmlDecl, /// The parser started to parse comment (`` sequence was found. UnclosedComment, @@ -38,8 +41,11 @@ impl fmt::Display for SyntaxError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::InvalidBangMarkup => f.write_str("unknown or missed symbol in markup"), - Self::UnclosedPIOrXmlDecl => { - f.write_str("processing instruction or xml declaration not closed: `?>` not found before end of input") + Self::UnclosedPI => { + f.write_str("processing instruction not closed: `?>` not found before end of input") + } + Self::UnclosedXmlDecl => { + f.write_str("XML declaration not closed: `?>` not found before end of input") } Self::UnclosedComment => { f.write_str("comment not closed: `-->` not found before end of input") diff --git a/src/parser/element.rs b/src/parser/element.rs index e257c5acd..3013b77b5 100644 --- a/src/parser/element.rs +++ b/src/parser/element.rs @@ -73,7 +73,7 @@ impl Parser for ElementParser { } #[inline] - fn eof_error() -> SyntaxError { + fn eof_error(&self, _content: &[u8]) -> SyntaxError { SyntaxError::UnclosedTag } } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 90f97c9f5..c554e7a0b 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -25,5 +25,9 @@ pub trait Parser { /// Returns parse error produced by this parser in case of reaching end of /// input without finding the end of a parsed thing. - fn eof_error() -> SyntaxError; + /// + /// # Parameters + /// - `content`: the content that was read before EOF. Some parsers may use + /// this to provide more specific error messages. + fn eof_error(&self, content: &[u8]) -> SyntaxError; } diff --git a/src/parser/pi.rs b/src/parser/pi.rs index ee5535795..78924aa96 100644 --- a/src/parser/pi.rs +++ b/src/parser/pi.rs @@ -2,6 +2,7 @@ use crate::errors::SyntaxError; use crate::parser::Parser; +use crate::utils::is_whitespace; /// A parser that search a `?>` sequence in the slice. /// @@ -72,8 +73,19 @@ impl Parser for PiParser { } #[inline] - fn eof_error() -> SyntaxError { - SyntaxError::UnclosedPIOrXmlDecl + fn eof_error(&self, content: &[u8]) -> SyntaxError { + // Check if content starts with "?xml" followed by whitespace, '?' or end. + // This determines whether to report an unclosed XML declaration or PI. + // FIXME: Add support for UTF-8/ASCII incompatible encodings (UTF-16) + let is_xml_decl = content.starts_with(b"?xml") + && content + .get(4) + .map_or(true, |&b| is_whitespace(b) || b == b'?'); + if is_xml_decl { + SyntaxError::UnclosedXmlDecl + } else { + SyntaxError::UnclosedPI + } } } diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 9b47da349..27aa74c06 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -234,7 +234,7 @@ macro_rules! impl_buffered_source { } *position += read; - Err(Error::Syntax(P::eof_error())) + Err(Error::Syntax(parser.eof_error(&buf[start..]))) } #[inline] diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 311edf6a6..45ec32b7b 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -350,7 +350,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } *position += self.len() as u64; - Err(Error::Syntax(P::eof_error())) + Err(Error::Syntax(parser.eof_error(self))) } #[inline] diff --git a/src/reader/state.rs b/src/reader/state.rs index 656e57cce..c1e58ad04 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -270,11 +270,20 @@ impl ReaderState { ))) } } else { - // + // ^^^^^ - `buf` does not contain `<`, but we want to report error at `<`, // so we move offset to it (-2 for `<` and `>`) self.last_error_offset = self.offset - len as u64 - 2; - Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl)) + + // Check if this is an XML declaration (starts with "?xml" followed by whitespace or "?") + // FIXME: Add support for UTF-8/ASCII incompatible encodings (UTF-16) + let is_xml_decl = buf.starts_with(b"?xml") + && buf.get(4).map_or(true, |&b| is_whitespace(b) || b == b'?'); + if is_xml_decl { + Err(Error::Syntax(SyntaxError::UnclosedXmlDecl)) + } else { + Err(Error::Syntax(SyntaxError::UnclosedPI)) + } } } diff --git a/tests/reader-errors.rs b/tests/reader-errors.rs index 0eecca7d9..4d7ec2cc1 100644 --- a/tests/reader-errors.rs +++ b/tests/reader-errors.rs @@ -430,16 +430,16 @@ mod syntax { mod pi { use super::*; - err!(unclosed01(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed02(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed03(".") => SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed04(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed05(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed06(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed07(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed08(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed09(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed10(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed01(". SyntaxError::UnclosedPI); + err!(unclosed02(". SyntaxError::UnclosedPI); + err!(unclosed03(".") => SyntaxError::UnclosedPI); + err!(unclosed04(". SyntaxError::UnclosedPI); + err!(unclosed05(". SyntaxError::UnclosedPI); + err!(unclosed06(". SyntaxError::UnclosedPI); + err!(unclosed07(". SyntaxError::UnclosedPI); + err!(unclosed08(". SyntaxError::UnclosedPI); + err!(unclosed09(". SyntaxError::UnclosedPI); + err!(unclosed10(". SyntaxError::UnclosedPI); // According to the grammar, processing instruction MUST contain a non-empty // target name, but we do not consider this as a _syntax_ error. @@ -453,10 +453,16 @@ mod syntax { mod decl { use super::*; - err!(unclosed1(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed2(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed3(". SyntaxError::UnclosedPIOrXmlDecl); - err!(unclosed4(". SyntaxError::UnclosedPIOrXmlDecl); + err!(unclosed1(". SyntaxError::UnclosedPI); + err!(unclosed2(". SyntaxError::UnclosedPI); + err!(unclosed3(". SyntaxError::UnclosedXmlDecl); + err!(unclosed4(". SyntaxError::UnclosedXmlDecl); + err!(unclosed5(". SyntaxError::UnclosedXmlDecl); + err!(unclosed6(". SyntaxError::UnclosedXmlDecl); + err!(unclosed7(". SyntaxError::UnclosedXmlDecl); + err!(unclosed8(". SyntaxError::UnclosedXmlDecl); + // "xmls" is a PI target, not an XML declaration + err!(unclosed9(". SyntaxError::UnclosedPI); // According to the grammar, XML declaration MUST contain at least one space // and `version` attribute, but we do not consider this as a _syntax_ error. @@ -467,6 +473,67 @@ mod syntax { ok!(normal5("") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml\n", 3)))); ok!(normal6("rest") => 8: Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml\n", 3)))); } + + /// Tests for UTF-16 encoded XML declarations. + /// FIXME: Add support for UTF-8/ASCII incompatible encodings (UTF-16) + mod decl_utf16 { + use super::*; + use pretty_assertions::assert_eq; + + /// UTF-16 LE encoded ` { + assert_eq!(cause, SyntaxError::UnclosedXmlDecl); + } + x => panic!( + "Expected `Err(Syntax(UnclosedXmlDecl))`, but got {:?}", + x + ), + } + } + + #[test] + #[ignore = "UTF-16 support not yet implemented for XML declaration detection"] + fn utf16_be_unclosed_xml_decl() { + let mut reader = Reader::from_reader(UTF16_BE_XML_DECL); + match reader.read_event() { + Err(Error::Syntax(cause)) => { + assert_eq!(cause, SyntaxError::UnclosedXmlDecl); + } + x => panic!( + "Expected `Err(Syntax(UnclosedXmlDecl))`, but got {:?}", + x + ), + } + } + } } mod ill_formed {