Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
435 changes: 286 additions & 149 deletions crates/wit-parser/src/ast.rs

Large diffs are not rendered by default.

68 changes: 47 additions & 21 deletions crates/wit-parser/src/ast/lex.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#[cfg(test)]
use alloc::{vec, vec::Vec};
use anyhow::{Result, bail};
use core::char;
use core::fmt;
use core::result::Result;
use core::str;
use unicode_xid::UnicodeXID;

Expand Down Expand Up @@ -171,6 +171,9 @@ pub enum Error {
InvalidEscape(u32, char),
Unexpected(u32, char),
UnterminatedComment(u32),
ForbiddenCodepoint(u32, char),
DeprecatedCodepoint(u32, char),
ControlCodepoint(u32, char),
Wanted {
at: u32,
expected: &'static str,
Expand All @@ -179,7 +182,7 @@ pub enum Error {
}

impl<'a> Tokenizer<'a> {
pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>> {
pub fn new(input: &'a str, span_offset: u32) -> Result<Tokenizer<'a>, Error> {
detect_invalid_input(input)?;

let mut t = Tokenizer {
Expand All @@ -194,7 +197,7 @@ impl<'a> Tokenizer<'a> {
Ok(t)
}

pub fn expect_semicolon(&mut self) -> Result<()> {
pub fn expect_semicolon(&mut self) -> Result<(), Error> {
self.expect(Token::Semicolon)?;
Ok(())
}
Expand All @@ -205,13 +208,13 @@ impl<'a> Tokenizer<'a> {
&self.input[start..end]
}

pub fn parse_id(&self, span: Span) -> Result<&'a str> {
pub fn parse_id(&self, span: Span) -> Result<&'a str, Error> {
let ret = self.get_span(span);
validate_id(span.start(), &ret)?;
Ok(ret)
}

pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str> {
pub fn parse_explicit_id(&self, span: Span) -> Result<&'a str, Error> {
let token = self.get_span(span);
let id_part = token.strip_prefix('%').unwrap();
validate_id(span.start(), id_part)?;
Expand Down Expand Up @@ -456,13 +459,11 @@ impl<'a> Iterator for CrlfFold<'a> {
}
}

fn detect_invalid_input(input: &str) -> Result<()> {
fn detect_invalid_input(input: &str) -> Result<(), Error> {
// Disallow specific codepoints.
let mut line = 1;
for ch in input.chars() {
for (pos, ch) in input.char_indices() {
match ch {
'\n' => line += 1,
'\r' | '\t' => {}
'\n' | '\r' | '\t' => {}

// Bidirectional override codepoints can be used to craft source code that
// appears to have a different meaning than its actual meaning. See
Expand All @@ -471,11 +472,7 @@ fn detect_invalid_input(input: &str) -> Result<()> {
// [CVE-2021-42574]: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2021-42574
'\u{202a}' | '\u{202b}' | '\u{202c}' | '\u{202d}' | '\u{202e}' | '\u{2066}'
| '\u{2067}' | '\u{2068}' | '\u{2069}' => {
bail!(
"Input contains bidirectional override codepoint {:?} at line {}",
ch.escape_unicode(),
line
);
return Err(Error::ForbiddenCodepoint(u32::try_from(pos).unwrap(), ch));
}

// Disallow several characters which are deprecated or discouraged in Unicode.
Expand All @@ -487,18 +484,14 @@ fn detect_invalid_input(input: &str) -> Result<()> {
// Unicode 13.0.0, sec. 16.4 Khmer, Characters Whose Use Is Discouraged.
'\u{149}' | '\u{673}' | '\u{f77}' | '\u{f79}' | '\u{17a3}' | '\u{17a4}'
| '\u{17b4}' | '\u{17b5}' => {
bail!(
"Codepoint {:?} at line {} is discouraged by Unicode",
ch.escape_unicode(),
line
);
return Err(Error::DeprecatedCodepoint(u32::try_from(pos).unwrap(), ch));
}

// Disallow control codes other than the ones explicitly recognized above,
// so that viewing a wit file on a terminal doesn't have surprising side
// effects or appear to have a different meaning than its actual meaning.
ch if ch.is_control() => {
bail!("Control code '{}' at line {}", ch.escape_unicode(), line);
return Err(Error::ControlCodepoint(u32::try_from(pos).unwrap(), ch));
}

_ => {}
Expand Down Expand Up @@ -635,6 +628,22 @@ impl Token {

impl core::error::Error for Error {}

impl Error {
pub fn position(&self) -> u32 {
match self {
Error::InvalidCharInId(at, _)
| Error::IdPartEmpty(at)
| Error::InvalidEscape(at, _)
| Error::Unexpected(at, _)
| Error::UnterminatedComment(at)
| Error::ForbiddenCodepoint(at, _)
| Error::DeprecatedCodepoint(at, _)
| Error::ControlCodepoint(at, _) => *at,
Error::Wanted { at, .. } => *at,
}
}
}

impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Expand All @@ -646,6 +655,21 @@ impl fmt::Display for Error {
Error::InvalidCharInId(_, ch) => write!(f, "invalid character in identifier {ch:?}"),
Error::IdPartEmpty(_) => write!(f, "identifiers must have characters between '-'s"),
Error::InvalidEscape(_, ch) => write!(f, "invalid escape in string {ch:?}"),
Error::ForbiddenCodepoint(_, ch) => {
write!(
f,
"Input contains bidirectional override codepoint {:?}",
ch.escape_unicode()
)
}
Error::DeprecatedCodepoint(_, ch) => {
write!(
f,
"Codepoint {:?} is discouraged by Unicode",
ch.escape_unicode()
)
}
Error::ControlCodepoint(_, ch) => write!(f, "Control code '{}'", ch.escape_unicode()),
}
}
}
Expand Down Expand Up @@ -712,6 +736,8 @@ fn test_validate_id() {

#[test]
fn test_tokenizer() {
use anyhow::Result;

fn collect(s: &str) -> Result<Vec<Token>> {
let mut t = Tokenizer::new(s, 0)?;
let mut tokens = Vec::new();
Expand Down
Loading
Loading