Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 1 addition & 8 deletions crates/wit-parser/src/ast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1843,14 +1843,7 @@ impl SourceMap {
}

if let Some(lex) = err.downcast_ref::<lex::Error>() {
let pos = match lex {
lex::Error::Unexpected(at, _)
| lex::Error::UnterminatedComment(at)
| lex::Error::Wanted { at, .. }
| lex::Error::InvalidCharInId(at, _)
| lex::Error::IdPartEmpty(at)
| lex::Error::InvalidEscape(at, _) => *at,
};
let pos = lex.position();
let msg = self.highlight_err(pos, None, lex);
bail!("{msg}")
}
Expand Down
68 changes: 46 additions & 22 deletions crates/wit-parser/src/ast/lex.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#[cfg(test)]
use alloc::{vec, vec::Vec};
use anyhow::{Result, bail};
use core::char;
use core::fmt;
use core::result::Result;
use core::str;
use unicode_xid::UnicodeXID;

Expand Down Expand Up @@ -166,6 +166,9 @@ pub enum Token {
#[derive(Eq, PartialEq, Debug)]
#[allow(dead_code)]
pub enum Error {
ControlCodepoint(u32, char),
DeprecatedCodepoint(u32, char),
ForbiddenCodepoint(u32, char),
InvalidCharInId(u32, char),
IdPartEmpty(u32),
InvalidEscape(u32, char),
Expand All @@ -179,7 +182,7 @@ pub enum Error {
}

impl<'a> Tokenizer<'a> {
pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>> {
pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>, Error> {
detect_invalid_input(input)?;

let mut t = Tokenizer {
Expand All @@ -194,7 +197,7 @@ impl<'a> Tokenizer<'a> {
Ok(t)
}

pub fn expect_semicolon(&mut self) -> Result<()> {
pub fn expect_semicolon(&mut self) -> Result<(), Error> {
self.expect(Token::Semicolon)?;
Ok(())
}
Expand All @@ -205,13 +208,13 @@ impl<'a> Tokenizer<'a> {
&self.input[start..end]
}

pub fn parse_id(&self, span: Span) -> Result<&'a str> {
pub fn parse_id(&self, span: Span) -> Result<&'a str, Error> {
let ret = self.get_span(span);
validate_id(span.start(), &ret)?;
Ok(ret)
}

pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> {
pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str, Error> {
let token = self.get_span(span);
let id_part = token.strip_prefix('%').unwrap();
validate_id(span.start(), id_part)?;
Expand Down Expand Up @@ -456,13 +459,11 @@ impl<'a> Iterator for CrlfFold<'a> {
}
}

fn detect_invalid_input(input: &str) -> Result<()> {
fn detect_invalid_input(input: &str) -> Result<(), Error> {
// Disallow specific codepoints.
let mut line = 1;
for ch in input.chars() {
for (pos, ch) in input.char_indices() {
match ch {
'\n' => line += 1,
'\r' | '\t' => {}
'\n' | '\r' | '\t' => {}

// Bidirectional override codepoints can be used to craft source code that
// appears to have a different meaning than its actual meaning. See
Expand All @@ -471,11 +472,7 @@ fn detect_invalid_input(input: &str) -> Result<()> {
// [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
'\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
| '\u{2067}' | '\u{2068}' | '\u{2069}' => {
bail!(
"Input contains bidirectional override codepoint {:?} at line {}",
ch.escape_unicode(),
line
);
return Err(Error::ForbiddenCodepoint(u32::try_from(pos).unwrap(), ch));
}

// Disallow several characters which are deprecated or discouraged in Unicode.
Expand All @@ -487,18 +484,14 @@ fn detect_invalid_input(input: &str) -> Result<()> {
// Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
'\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
| '\u{17b4}' | '\u{17b5}' => {
bail!(
"Codepoint {:?} at line {} is discouraged by Unicode",
ch.escape_unicode(),
line
);
return Err(Error::DeprecatedCodepoint(u32::try_from(pos).unwrap(), ch));
}

// Disallow control codes other than the ones explicitly recognized above,
// so that viewing a wit file on a terminal doesn't have surprising side
// effects or appear to have a different meaning than its actual meaning.
ch if ch.is_control() => {
bail!("Control code '{}' at line {}", ch.escape_unicode(), line);
return Err(Error::ControlCodepoint(u32::try_from(pos).unwrap(), ch));
}

_ => {}
Expand Down Expand Up @@ -635,9 +628,40 @@ impl Token {

// Empty impl on purpose: `core::error::Error` only requires `Debug` (derived
// on the enum) and `Display` (implemented for `Error` in this file), and all
// of the trait's own methods have default implementations.
impl core::error::Error for Error {}

impl Error {
pub fn position(&self) -> u32 {
match self {
Error::ControlCodepoint(at, _)
| Error::DeprecatedCodepoint(at, _)
| Error::ForbiddenCodepoint(at, _)
| Error::InvalidCharInId(at, _)
| Error::IdPartEmpty(at)
| Error::InvalidEscape(at, _)
| Error::Unexpected(at, _)
| Error::UnterminatedComment(at) => *at,
Error::Wanted { at, .. } => *at,
}
}
}

impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Error::ControlCodepoint(_, ch) => write!(f, "Control code '{}'", ch.escape_unicode()),
Error::DeprecatedCodepoint(_, ch) => {
write!(
f,
"Codepoint {:?} is discouraged by Unicode",
ch.escape_unicode()
)
}
Error::ForbiddenCodepoint(_, ch) => {
write!(
f,
"Input contains bidirectional override codepoint {:?}",
ch.escape_unicode()
)
}
Error::Unexpected(_, ch) => write!(f, "unexpected character {ch:?}"),
Error::UnterminatedComment(_) => write!(f, "unterminated block comment"),
Error::Wanted {
Expand Down Expand Up @@ -712,7 +736,7 @@ fn test_validate_id() {

#[test]
fn test_tokenizer() {
fn collect(s: &str) -> Result<Vec<Token>> {
fn collect(s: &str) -> Result<Vec<Token>, Error> {
let mut t = Tokenizer::new(s, 0)?;
let mut tokens = Vec::new();
while let Some(token) = t.next()? {
Expand Down