Skip to content

Commit 21224e6

Browse files
committed
Account for confusable codepoints when recovering emoji identifiers
1 parent 4489aeb commit 21224e6

File tree

4 files changed

+30
-7
lines changed

4 files changed

+30
-7
lines changed

compiler/rustc_parse/src/lexer/mod.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use crate::lexer::unicode_chars::UNICODE_ARRAY;
12
use rustc_ast::ast::{self, AttrStyle};
23
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
34
use rustc_ast::tokenstream::{Spacing, TokenStream};
@@ -222,7 +223,17 @@ impl<'a> StringReader<'a> {
222223
}
223224
token::Ident(sym, is_raw_ident)
224225
}
225-
rustc_lexer::TokenKind::InvalidIdent => {
226+
rustc_lexer::TokenKind::InvalidIdent
227+
// Do not recover an identifier with emojis if the codepoint is a confusable
228+
// with a recoverable substitution token, like `➖`.
229+
if UNICODE_ARRAY
230+
.iter()
231+
.find(|&&(c, _, _)| {
232+
let sym = self.str_from(start);
233+
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
234+
})
235+
.is_none() =>
236+
{
226237
let sym = nfc_normalize(self.str_from(start));
227238
let span = self.mk_sp(start, self.pos);
228239
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
@@ -299,7 +310,7 @@ impl<'a> StringReader<'a> {
299310
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
300311
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
301312

302-
rustc_lexer::TokenKind::Unknown => {
313+
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
303314
let c = self.str_from(start).chars().next().unwrap();
304315
let mut err =
305316
self.struct_fatal_span_char(start, self.pos, "unknown start of token", c);

compiler/rustc_parse/src/lexer/unicode_chars.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder};
77
use rustc_span::{symbol::kw, BytePos, Pos, Span};
88

99
#[rustfmt::skip] // for line breaks
10-
const UNICODE_ARRAY: &[(char, &str, char)] = &[
10+
pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
1111
('\u{2028}', "Line Separator", ' '),
1212
('\u{2029}', "Paragraph Separator", ' '),
1313
(' ', "Ogham Space mark", ' '),

src/test/ui/parser/emoji-identifiers.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emojis
1010
//~^ ERROR identifiers cannot contain emojis
1111
}
1212
fn main() {
13-
let _ = i_like_to_😄_a_lot(); //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
13+
let _ = i_like_to_😄_a_lot() ➖ 4; //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
1414
//~^ ERROR identifiers cannot contain emojis
15+
//~| ERROR unknown start of token: \u{2796}
1516
}

src/test/ui/parser/emoji-identifiers.stderr

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,27 @@
1+
error: unknown start of token: \u{2796}
2+
--> $DIR/emoji-identifiers.rs:13:33
3+
|
4+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
5+
| ^^
6+
|
7+
help: Unicode character '➖' (Heavy Minus Sign) looks like '-' (Minus/Hyphen), but it is not
8+
|
9+
LL | let _ = i_like_to_😄_a_lot() - 4;
10+
| ~
11+
112
error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
213
--> $DIR/emoji-identifiers.rs:13:13
314
|
415
LL | fn i_like_to_😅_a_lot() -> 👀 {
516
| ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
617
...
7-
LL | let _ = i_like_to_😄_a_lot();
18+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
819
| ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`
920

1021
error: identifiers cannot contain emojis: `i_like_to_😄_a_lot`
1122
--> $DIR/emoji-identifiers.rs:13:13
1223
|
13-
LL | let _ = i_like_to_😄_a_lot();
24+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
1425
| ^^^^^^^^^^^^^^^^^^
1526

1627
error: identifiers cannot contain emojis: `full_of_✨`
@@ -66,7 +77,7 @@ LL | 👀::full_of✨()
6677
| function or associated item not found in `👀`
6778
| help: there is an associated function with a similar name: `full_of_✨`
6879

69-
error: aborting due to 8 previous errors
80+
error: aborting due to 9 previous errors
7081

7182
Some errors have detailed explanations: E0425, E0599.
7283
For more information about an error, try `rustc --explain E0425`.

0 commit comments

Comments
 (0)