diff --git a/Cargo.lock b/Cargo.lock index 98567f858e9f1..2763d12fc9c33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4450,6 +4450,7 @@ dependencies = [ "thin-vec", "tracing", "unicode-normalization", + "unicode-properties", "unicode-width 0.2.2", ] diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index dc6e3b1f358dc..fea8980103cce 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -140,7 +140,7 @@ pub enum TokenKind { /// A lifetime, e.g. `'a`. Lifetime { - starts_with_number: bool, + invalid: bool, }, /// `;` @@ -584,7 +584,7 @@ impl<'a> Cursor<'a> { let kind = RawStr { n_hashes: res.ok() }; Literal { kind, suffix_start } } - _ => self.ident_or_unknown_prefix(), + _ => self.ident_or_unknown_prefix(false), }, // Byte literal, byte string literal, raw byte string literal or identifier. @@ -603,7 +603,7 @@ impl<'a> Cursor<'a> { // Identifier (this should be checked after other variant that can // start as identifier). - c if is_id_start(c) => self.ident_or_unknown_prefix(), + c if is_id_start(c) => self.ident_or_unknown_prefix(false), // Numeric literal. c @ '0'..='9' => { @@ -661,7 +661,7 @@ impl<'a> Cursor<'a> { Literal { kind, suffix_start } } // Identifier starting with an emoji. Only lexed for graceful error recovery. - c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(), + c if is_emoji(c) => self.invalid_ident(), _ => Unknown, }; if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes) @@ -832,25 +832,22 @@ impl<'a> Cursor<'a> { RawIdent } - fn ident_or_unknown_prefix(&mut self) -> TokenKind { - debug_assert!(is_id_start(self.prev())); + fn ident_or_unknown_prefix(&mut self, already_invalid: bool) -> TokenKind { + debug_assert!(is_id_start(self.prev()) || already_invalid); // Start is already eaten, eat the rest of identifier. self.eat_while(is_id_continue); // Known prefixes must have been handled earlier. So if // we see a prefix here, it is definitely an unknown prefix. 
match self.first() { '#' | '"' | '\'' => UnknownPrefix, - c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(), + c if is_emoji(c) => self.invalid_ident(), _ => Ident, } } fn invalid_ident(&mut self) -> TokenKind { // Start is already eaten, eat the rest of identifier. - self.eat_while(|c| { - const ZERO_WIDTH_JOINER: char = '\u{200d}'; - is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER - }); + self.eat_while(|c| is_id_continue(c) || is_emoji(c)); // An invalid identifier followed by '#' or '"' or '\'' could be // interpreted as an invalid literal prefix. We don't bother doing that // because the treatment of invalid identifiers and invalid prefixes @@ -895,7 +892,7 @@ impl<'a> Cursor<'a> { let kind = mk_kind_raw(res.ok()); Literal { kind, suffix_start } } - _ => self.ident_or_unknown_prefix(), + _ => self.ident_or_unknown_prefix(false), } } @@ -975,6 +972,7 @@ impl<'a> Cursor<'a> { fn lifetime_or_char(&mut self) -> TokenKind { debug_assert!(self.prev() == '\''); + let mut invalid = false; let can_be_a_lifetime = if self.second() == '\'' { // It's surely not a lifetime. false @@ -982,7 +980,10 @@ impl<'a> Cursor<'a> { // If the first symbol is valid for identifier, it can be a lifetime. // Also check if it's a number for a better error reporting (so '0 will // be reported as invalid lifetime and not as unterminated char literal). - is_id_start(self.first()) || self.first().is_ascii_digit() + let c = self.first(); + invalid |= c.is_ascii_digit(); + invalid |= is_emoji(c); + is_id_start(c) || invalid }; if !can_be_a_lifetime { @@ -1012,7 +1013,7 @@ impl<'a> Cursor<'a> { // First symbol can be a number (which isn't a valid identifier start), // so skip it without any checks. 
self.bump(); - self.eat_while(is_id_continue); + invalid |= matches!(self.ident_or_unknown_prefix(invalid), InvalidIdent); match self.first() { // Check if after skipping literal contents we've met a closing @@ -1024,7 +1025,7 @@ impl<'a> Cursor<'a> { Literal { kind, suffix_start: self.pos_within_token() } } '#' if !starts_with_number => UnknownPrefixLifetime, - _ => Lifetime { starts_with_number }, + _ => Lifetime { invalid }, } } @@ -1277,3 +1278,7 @@ impl<'a> Cursor<'a> { self.eat_while(is_id_continue); } } + +fn is_emoji(c: char) -> bool { + !c.is_ascii() && c.is_emoji_char() +} diff --git a/compiler/rustc_lexer/src/tests.rs b/compiler/rustc_lexer/src/tests.rs index a7357ba38c8e4..f74ba7f532a68 100644 --- a/compiler/rustc_lexer/src/tests.rs +++ b/compiler/rustc_lexer/src/tests.rs @@ -231,7 +231,7 @@ fn lifetime() { "'abc", FrontmatterAllowed::No, expect![[r#" - Token { kind: Lifetime { starts_with_number: false }, len: 4 } + Token { kind: Lifetime { invalid: false }, len: 4 } "#]], ); } diff --git a/compiler/rustc_parse/Cargo.toml b/compiler/rustc_parse/Cargo.toml index 28a67ae12126b..2c9c4f54d1034 100644 --- a/compiler/rustc_parse/Cargo.toml +++ b/compiler/rustc_parse/Cargo.toml @@ -20,6 +20,7 @@ rustc_span = { path = "../rustc_span" } thin-vec = "0.2.12" tracing = "0.1" unicode-normalization = "0.1.25" +unicode-properties = { version = "0.1.4", default-features = false, features = ["emoji"] } unicode-width = "0.2.2" # tidy-alphabetical-end diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index cd90655125b2b..7d26ea9454f4f 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -17,6 +17,7 @@ use rustc_session::lint::builtin::{ use rustc_session::parse::ParseSess; use rustc_span::{BytePos, Pos, Span, Symbol, sym}; use tracing::debug; +use unicode_properties::emoji::UnicodeEmoji; use crate::errors; use crate::lexer::diagnostics::TokenTreeDiagInfo; @@ -316,18 +317,42 @@ 
impl<'psess, 'src> Lexer<'psess, 'src> { self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal"); token::Literal(token::Lit { kind, symbol, suffix }) } - rustc_lexer::TokenKind::Lifetime { starts_with_number } => { + rustc_lexer::TokenKind::Lifetime { invalid } => { // Include the leading `'` in the real identifier, for macro // expansion purposes. See #12512 for the gory details of why // this is necessary. let lifetime_name = nfc_normalize(self.str_from(start)); self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1))); - if starts_with_number { - let span = self.mk_sp(start, self.pos); - self.dcx() - .struct_err("lifetimes cannot start with a number") - .with_span(span) - .stash(span, StashKey::LifetimeIsChar); + let span = self.mk_sp(start, self.pos); + if invalid { + let name = lifetime_name.as_str(); + // skip(1) to skip the `'` + let starts_with_number = matches!( + name.chars().skip(1).next(), + Some(c) if c.is_ascii_digit() + ); + if name.chars().any(|c| !c.is_ascii() && c.is_emoji_char()) { + self.psess + .bad_unicode_identifiers + .borrow_mut() + .entry(lifetime_name) + .or_default() + .push(span); + } + if starts_with_number { + let mut err = self.dcx() + .struct_err(format!( + "lifetimes cannot start with a number: `{name}`" + )) + .with_span(span); + if name.len() > 2 { + // Point at the first lifetime name character. 
+ let start_span = self.mk_sp(start + BytePos(1), start + BytePos(2)); + err.span(start_span); + err.span_label(span, ""); + } + err.stash(span, StashKey::LifetimeIsChar); + } } token::Lifetime(lifetime_name, IdentIsRaw::No) } diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs index d7eec6cde8c01..df7bef5843da8 100644 --- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs +++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs @@ -255,9 +255,9 @@ impl<'a> Converter<'a> { return; } - rustc_lexer::TokenKind::Lifetime { starts_with_number } => { - if *starts_with_number { - errors.push("Lifetime name cannot start with a number".into()); + rustc_lexer::TokenKind::Lifetime { invalid } => { + if *invalid { + errors.push("Lifetime name contains invalid characters".into()); } LIFETIME_IDENT } diff --git a/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/lifetime_starts_with_a_number.rast b/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/lifetime_starts_with_a_number.rast index e919bf2a4aef2..b2bf087e749f3 100644 --- a/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/lifetime_starts_with_a_number.rast +++ b/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/lifetime_starts_with_a_number.rast @@ -1,4 +1,4 @@ -LIFETIME_IDENT "'1" error: Lifetime name cannot start with a number +LIFETIME_IDENT "'1" error: Lifetime name contains invalid characters WHITESPACE "\n" -LIFETIME_IDENT "'1lifetime" error: Lifetime name cannot start with a number +LIFETIME_IDENT "'1lifetime" error: Lifetime name contains invalid characters WHITESPACE "\n" diff --git a/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/lifetime_starts_with_a_number.txt b/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/lifetime_starts_with_a_number.txt index e919bf2a4aef2..b2bf087e749f3 100644 --- 
a/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/lifetime_starts_with_a_number.txt +++ b/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/lifetime_starts_with_a_number.txt @@ -1,4 +1,4 @@ -LIFETIME_IDENT "'1" error: Lifetime name cannot start with a number +LIFETIME_IDENT "'1" error: Lifetime name contains invalid characters WHITESPACE "\n" -LIFETIME_IDENT "'1lifetime" error: Lifetime name cannot start with a number +LIFETIME_IDENT "'1lifetime" error: Lifetime name contains invalid characters WHITESPACE "\n" diff --git a/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/unclosed_char_with_ferris.rast b/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/unclosed_char_with_ferris.rast index 56f19cce0784e..6ace31f434ef2 100644 --- a/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/unclosed_char_with_ferris.rast +++ b/src/tools/rust-analyzer/crates/parser/test_data/lexer/err/unclosed_char_with_ferris.rast @@ -1 +1 @@ -CHAR "'🦀" error: Missing trailing `'` symbol to terminate the character literal +LIFETIME_IDENT "'🦀" error: Lifetime name contains invalid characters diff --git a/src/tools/rust-analyzer/crates/proc-macro-srv/src/token_stream.rs b/src/tools/rust-analyzer/crates/proc-macro-srv/src/token_stream.rs index 2358f6963c79e..d6773153ed988 100644 --- a/src/tools/rust-analyzer/crates/proc-macro-srv/src/token_stream.rs +++ b/src/tools/rust-analyzer/crates/proc-macro-srv/src/token_stream.rs @@ -302,9 +302,9 @@ impl TokenStream { span: span.derive_ranged(range), })) } - rustc_lexer::TokenKind::Lifetime { starts_with_number } => { - if starts_with_number { - return Err("Lifetime cannot start with a number".to_owned()); + rustc_lexer::TokenKind::Lifetime { invalid } => { + if invalid { + return Err(format!("Invalid lifetime identifier: `{}`", &s[range])); } let range = range.start + 1..range.end; tokenstream.push(TokenTree::Punct(Punct { diff --git a/tests/ui/lexer/emoji-in-lifetime.rs 
b/tests/ui/lexer/emoji-in-lifetime.rs new file mode 100644 index 0000000000000..e4a12f332ae70 --- /dev/null +++ b/tests/ui/lexer/emoji-in-lifetime.rs @@ -0,0 +1,16 @@ +// #141081 +fn bad_lifetime_name< + '🐛🐛🐛family👨‍👩‍👧‍👦,//~ ERROR: identifiers cannot contain emoji + '12, //~ ERROR: lifetimes cannot start with a number + 'a🐛, //~ ERROR: identifiers cannot contain emoji + '1🐛, //~ ERROR: identifiers cannot contain emoji + //~^ ERROR: lifetimes cannot start with a number + '1, //~ ERROR: lifetimes cannot start with a number + 'a‌b // bare zero-width-joiners are accepted as XID_Continue +>() {} + +fn main() { + 'a🐛: { // pointed at on the error from line 5 + todo!(); + }; +} diff --git a/tests/ui/lexer/emoji-in-lifetime.stderr b/tests/ui/lexer/emoji-in-lifetime.stderr new file mode 100644 index 0000000000000..7e205639a9a3c --- /dev/null +++ b/tests/ui/lexer/emoji-in-lifetime.stderr @@ -0,0 +1,41 @@ +error: identifiers cannot contain emoji: `'🐛🐛🐛family👨👩👧👦` + --> $DIR/emoji-in-lifetime.rs:3:5 + | +LL | '🐛🐛🐛family👨👩👧👦, + | ^^^^^^^^^^^^^^^^^^^^^ + +error: identifiers cannot contain emoji: `'a🐛` + --> $DIR/emoji-in-lifetime.rs:5:5 + | +LL | 'a🐛, + | ^^^^ +... 
+LL | 'a🐛: { // pointed at on the error from line 5 + | ^^^^ + +error: identifiers cannot contain emoji: `'1🐛` + --> $DIR/emoji-in-lifetime.rs:6:5 + | +LL | '1🐛, + | ^^^^ + +error: lifetimes cannot start with a number: `'12` + --> $DIR/emoji-in-lifetime.rs:4:6 + | +LL | '12, + | -^- + +error: lifetimes cannot start with a number: `'1🐛` + --> $DIR/emoji-in-lifetime.rs:6:6 + | +LL | '1🐛, + | -^-- + +error: lifetimes cannot start with a number: `'1` + --> $DIR/emoji-in-lifetime.rs:8:5 + | +LL | '1, + | ^^ + +error: aborting due to 6 previous errors + diff --git a/tests/ui/lexer/lex-bad-str-literal-as-char-1.stderr b/tests/ui/lexer/lex-bad-str-literal-as-char-1.stderr index 81ee697802b9c..7503774cd686f 100644 --- a/tests/ui/lexer/lex-bad-str-literal-as-char-1.stderr +++ b/tests/ui/lexer/lex-bad-str-literal-as-char-1.stderr @@ -10,7 +10,7 @@ LL - println!('1 + 1'); LL + println!("1 + 1"); | -error: lifetimes cannot start with a number +error: lifetimes cannot start with a number: `'1` --> $DIR/lex-bad-str-literal-as-char-1.rs:3:14 | LL | println!('1 + 1'); diff --git a/tests/ui/parser/numeric-lifetime.stderr b/tests/ui/parser/numeric-lifetime.stderr index 7c1bcb7263171..46b06ae7dc3cb 100644 --- a/tests/ui/parser/numeric-lifetime.stderr +++ b/tests/ui/parser/numeric-lifetime.stderr @@ -6,13 +6,13 @@ LL | let x: usize = ""; | | | expected due to this -error: lifetimes cannot start with a number +error: lifetimes cannot start with a number: `'1` --> $DIR/numeric-lifetime.rs:1:10 | LL | struct S<'1> { s: &'1 usize } | ^^ -error: lifetimes cannot start with a number +error: lifetimes cannot start with a number: `'1` --> $DIR/numeric-lifetime.rs:1:20 | LL | struct S<'1> { s: &'1 usize }