Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4450,6 +4450,7 @@ dependencies = [
"thin-vec",
"tracing",
"unicode-normalization",
"unicode-properties",
"unicode-width 0.2.2",
]

Expand Down
35 changes: 20 additions & 15 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ pub enum TokenKind {

/// A lifetime, e.g. `'a`.
Lifetime {
starts_with_number: bool,
invalid: bool,
},

/// `;`
Expand Down Expand Up @@ -584,7 +584,7 @@ impl<'a> Cursor<'a> {
let kind = RawStr { n_hashes: res.ok() };
Literal { kind, suffix_start }
}
_ => self.ident_or_unknown_prefix(),
_ => self.ident_or_unknown_prefix(false),
},

// Byte literal, byte string literal, raw byte string literal or identifier.
Expand All @@ -603,7 +603,7 @@ impl<'a> Cursor<'a> {

// Identifier (this should be checked after other variant that can
// start as identifier).
c if is_id_start(c) => self.ident_or_unknown_prefix(),
c if is_id_start(c) => self.ident_or_unknown_prefix(false),

// Numeric literal.
c @ '0'..='9' => {
Expand Down Expand Up @@ -661,7 +661,7 @@ impl<'a> Cursor<'a> {
Literal { kind, suffix_start }
}
// Identifier starting with an emoji. Only lexed for graceful error recovery.
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
c if is_emoji(c) => self.invalid_ident(),
_ => Unknown,
};
if matches!(self.frontmatter_allowed, FrontmatterAllowed::Yes)
Expand Down Expand Up @@ -832,25 +832,22 @@ impl<'a> Cursor<'a> {
RawIdent
}

fn ident_or_unknown_prefix(&mut self) -> TokenKind {
debug_assert!(is_id_start(self.prev()));
fn ident_or_unknown_prefix(&mut self, already_invalid: bool) -> TokenKind {
debug_assert!(is_id_start(self.prev()) || already_invalid);
// Start is already eaten, eat the rest of identifier.
self.eat_while(is_id_continue);
// Known prefixes must have been handled earlier. So if
// we see a prefix here, it is definitely an unknown prefix.
match self.first() {
'#' | '"' | '\'' => UnknownPrefix,
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
c if is_emoji(c) => self.invalid_ident(),
_ => Ident,
}
}

fn invalid_ident(&mut self) -> TokenKind {
// Start is already eaten, eat the rest of identifier.
self.eat_while(|c| {
const ZERO_WIDTH_JOINER: char = '\u{200d}';
is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
});
self.eat_while(|c| is_id_continue(c) || is_emoji(c));
// An invalid identifier followed by '#' or '"' or '\'' could be
// interpreted as an invalid literal prefix. We don't bother doing that
// because the treatment of invalid identifiers and invalid prefixes
Expand Down Expand Up @@ -895,7 +892,7 @@ impl<'a> Cursor<'a> {
let kind = mk_kind_raw(res.ok());
Literal { kind, suffix_start }
}
_ => self.ident_or_unknown_prefix(),
_ => self.ident_or_unknown_prefix(false),
}
}

Expand Down Expand Up @@ -975,14 +972,18 @@ impl<'a> Cursor<'a> {
fn lifetime_or_char(&mut self) -> TokenKind {
debug_assert!(self.prev() == '\'');

let mut invalid = false;
let can_be_a_lifetime = if self.second() == '\'' {
// It's surely not a lifetime.
false
} else {
// If the first symbol is valid for identifier, it can be a lifetime.
// Also check if it's a number or an emoji for a better error reporting (so
// '0 and '🦀 will be reported as invalid lifetimes and not as unterminated
// char literals).
is_id_start(self.first()) || self.first().is_ascii_digit()
let c = self.first();
invalid |= c.is_ascii_digit();
invalid |= is_emoji(c);
is_id_start(c) || invalid
};

if !can_be_a_lifetime {
Expand Down Expand Up @@ -1012,7 +1013,7 @@ impl<'a> Cursor<'a> {
// First symbol can be a number or an emoji (neither of which is a valid
// identifier start), so skip it without any checks.
self.bump();
self.eat_while(is_id_continue);
invalid |= matches!(self.ident_or_unknown_prefix(invalid), InvalidIdent);

match self.first() {
// Check if after skipping literal contents we've met a closing
Expand All @@ -1024,7 +1025,7 @@ impl<'a> Cursor<'a> {
Literal { kind, suffix_start: self.pos_within_token() }
}
'#' if !starts_with_number => UnknownPrefixLifetime,
_ => Lifetime { starts_with_number },
_ => Lifetime { invalid },
}
}

Expand Down Expand Up @@ -1277,3 +1278,7 @@ impl<'a> Cursor<'a> {
self.eat_while(is_id_continue);
}
}

/// Returns `true` if `c` is a non-ASCII character for which `is_emoji_char()`
/// holds.
///
/// The lexer uses this to recognise emoji at the start of, or inside,
/// identifiers and lifetimes so that they can be lexed as invalid identifiers
/// for error recovery instead of as unknown tokens.
///
/// The `!c.is_ascii()` test runs first, so ASCII characters are never treated
/// as emoji here.
// NOTE(review): presumably the ASCII guard exists because some ASCII
// characters (digits, `#`, `*`) carry the Unicode `Emoji` property — confirm
// against the `unicode-properties` documentation.
fn is_emoji(c: char) -> bool {
!c.is_ascii() && c.is_emoji_char()
}
2 changes: 1 addition & 1 deletion compiler/rustc_lexer/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ fn lifetime() {
"'abc",
FrontmatterAllowed::No,
expect![[r#"
Token { kind: Lifetime { starts_with_number: false }, len: 4 }
Token { kind: Lifetime { invalid: false }, len: 4 }
"#]],
);
}
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_parse/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ rustc_span = { path = "../rustc_span" }
thin-vec = "0.2.12"
tracing = "0.1"
unicode-normalization = "0.1.25"
unicode-properties = { version = "0.1.4", default-features = false, features = ["emoji"] }
unicode-width = "0.2.2"
# tidy-alphabetical-end

Expand Down
39 changes: 32 additions & 7 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use rustc_session::lint::builtin::{
use rustc_session::parse::ParseSess;
use rustc_span::{BytePos, Pos, Span, Symbol, sym};
use tracing::debug;
use unicode_properties::emoji::UnicodeEmoji;

use crate::errors;
use crate::lexer::diagnostics::TokenTreeDiagInfo;
Expand Down Expand Up @@ -316,18 +317,42 @@ impl<'psess, 'src> Lexer<'psess, 'src> {
self.lint_literal_unicode_text_flow(symbol, kind, self.mk_sp(start, self.pos), "literal");
token::Literal(token::Lit { kind, symbol, suffix })
}
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
rustc_lexer::TokenKind::Lifetime { invalid } => {
// Include the leading `'` in the real identifier, for macro
// expansion purposes. See #12512 for the gory details of why
// this is necessary.
let lifetime_name = nfc_normalize(self.str_from(start));
self.last_lifetime = Some(self.mk_sp(start, start + BytePos(1)));
if starts_with_number {
let span = self.mk_sp(start, self.pos);
self.dcx()
.struct_err("lifetimes cannot start with a number")
.with_span(span)
.stash(span, StashKey::LifetimeIsChar);
let span = self.mk_sp(start, self.pos);
if invalid {
let name = lifetime_name.as_str();
// skip(1) to skip the `'`
let starts_with_number = matches!(
name.chars().skip(1).next(),
Some(c) if c.is_ascii_digit()
);
if name.chars().any(|c| !c.is_ascii() && c.is_emoji_char()) {
self.psess
.bad_unicode_identifiers
.borrow_mut()
.entry(lifetime_name)
.or_default()
.push(span);
}
if starts_with_number {
let mut err = self.dcx()
.struct_err(format!(
"lifetimes cannot start with a number: `{name}`"
))
.with_span(span);
if name.len() > 2 {
// Point at the first lifetime name character.
let start_span = self.mk_sp(start + BytePos(1), start + BytePos(2));
err.span(start_span);
err.span_label(span, "");
}
err.stash(span, StashKey::LifetimeIsChar);
}
}
token::Lifetime(lifetime_name, IdentIsRaw::No)
}
Expand Down
6 changes: 3 additions & 3 deletions src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,9 @@ impl<'a> Converter<'a> {
return;
}

rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
if *starts_with_number {
errors.push("Lifetime name cannot start with a number".into());
rustc_lexer::TokenKind::Lifetime { invalid } => {
if *invalid {
errors.push("Lifetime name contains invalid characters".into());
}
LIFETIME_IDENT
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
LIFETIME_IDENT "'1" error: Lifetime name cannot start with a number
LIFETIME_IDENT "'1" error: Lifetime name contains invalid characters
WHITESPACE "\n"
LIFETIME_IDENT "'1lifetime" error: Lifetime name cannot start with a number
LIFETIME_IDENT "'1lifetime" error: Lifetime name contains invalid characters
WHITESPACE "\n"
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
LIFETIME_IDENT "'1" error: Lifetime name cannot start with a number
LIFETIME_IDENT "'1" error: Lifetime name contains invalid characters
WHITESPACE "\n"
LIFETIME_IDENT "'1lifetime" error: Lifetime name cannot start with a number
LIFETIME_IDENT "'1lifetime" error: Lifetime name contains invalid characters
WHITESPACE "\n"
Original file line number Diff line number Diff line change
@@ -1 +1 @@
CHAR "'🦀" error: Missing trailing `'` symbol to terminate the character literal
LIFETIME_IDENT "'🦀" error: Lifetime name contains invalid characters
Original file line number Diff line number Diff line change
Expand Up @@ -302,9 +302,9 @@ impl<S> TokenStream<S> {
span: span.derive_ranged(range),
}))
}
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
if starts_with_number {
return Err("Lifetime cannot start with a number".to_owned());
rustc_lexer::TokenKind::Lifetime { invalid } => {
if invalid {
return Err(format!("Invalid lifetime identifier: `{}`", &s[range]));
}
let range = range.start + 1..range.end;
tokenstream.push(TokenTree::Punct(Punct {
Expand Down
16 changes: 16 additions & 0 deletions tests/ui/lexer/emoji-in-lifetime.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// #141081
fn bad_lifetime_name<
'🐛🐛🐛family👨‍👩‍👧‍👦,//~ ERROR: identifiers cannot contain emoji
'12, //~ ERROR: lifetimes cannot start with a number
'a🐛, //~ ERROR: identifiers cannot contain emoji
'1🐛, //~ ERROR: identifiers cannot contain emoji
//~^ ERROR: lifetimes cannot start with a number
'1, //~ ERROR: lifetimes cannot start with a number
'a‌b // bare zero-width-joiners are accepted as XID_Continue
>() {}

fn main() {
'a🐛: { // pointed at on the error from line 5
todo!();
};
}
41 changes: 41 additions & 0 deletions tests/ui/lexer/emoji-in-lifetime.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
error: identifiers cannot contain emoji: `'🐛🐛🐛family👨👩👧👦`
--> $DIR/emoji-in-lifetime.rs:3:5
|
LL | '🐛🐛🐛family👨👩👧👦,
| ^^^^^^^^^^^^^^^^^^^^^

error: identifiers cannot contain emoji: `'a🐛`
--> $DIR/emoji-in-lifetime.rs:5:5
|
LL | 'a🐛,
| ^^^^
...
LL | 'a🐛: { // pointed at on the error from line 5
| ^^^^

error: identifiers cannot contain emoji: `'1🐛`
--> $DIR/emoji-in-lifetime.rs:6:5
|
LL | '1🐛,
| ^^^^

error: lifetimes cannot start with a number: `'12`
--> $DIR/emoji-in-lifetime.rs:4:6
|
LL | '12,
| -^-

error: lifetimes cannot start with a number: `'1🐛`
--> $DIR/emoji-in-lifetime.rs:6:6
|
LL | '1🐛,
| -^--

error: lifetimes cannot start with a number: `'1`
--> $DIR/emoji-in-lifetime.rs:8:5
|
LL | '1,
| ^^

error: aborting due to 6 previous errors

2 changes: 1 addition & 1 deletion tests/ui/lexer/lex-bad-str-literal-as-char-1.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ LL - println!('1 + 1');
LL + println!("1 + 1");
|

error: lifetimes cannot start with a number
error: lifetimes cannot start with a number: `'1`
--> $DIR/lex-bad-str-literal-as-char-1.rs:3:14
|
LL | println!('1 + 1');
Expand Down
4 changes: 2 additions & 2 deletions tests/ui/parser/numeric-lifetime.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ LL | let x: usize = "";
| |
| expected due to this

error: lifetimes cannot start with a number
error: lifetimes cannot start with a number: `'1`
--> $DIR/numeric-lifetime.rs:1:10
|
LL | struct S<'1> { s: &'1 usize }
| ^^

error: lifetimes cannot start with a number
error: lifetimes cannot start with a number: `'1`
--> $DIR/numeric-lifetime.rs:1:20
|
LL | struct S<'1> { s: &'1 usize }
Expand Down
Loading