Skip to content

Commit 1931d49

Browse files
tamaroning and philberty
authored and committed
gccrs: tokenize Unicode identifiers
gcc/rust/ChangeLog: * lex/rust-lex.cc (is_whitespace): add all missing codepoints valid as whitespace (is_identifier_start): new function to check XID_Start and underscore (is_identifier_continue): new function to check XID_Continue (Lexer::build_token): tokenize Unicode identifiers (Lexer::parse_partial_string_continue): add comments (Lexer::parse_partial_unicode_escape): add comments (Lexer::parse_raw_identifier): change to use `is_identifier_continue` (Lexer::parse_identifier_or_keyword): change to use `is_identifier_continue` (Lexer::parse_char_or_lifetime): change to use `is_identifier_start/continue` (Lexer::skip_codepoint_input): do not attempt to skip input when bumping EOF * lex/rust-lex.h: add `current_char32` field Signed-off-by: Raiki Tamura <[email protected]>
1 parent c05b7f2 commit 1931d49

File tree

2 files changed

+61
-32
lines changed

2 files changed

+61
-32
lines changed

gcc/rust/lex/rust-lex.cc

+57-30
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "rust-linemap.h"
2323
#include "rust-session-manager.h"
2424
#include "safe-ctype.h"
25+
#include "cpplib.h"
2526

2627
namespace Rust {
2728
// TODO: move to separate compilation unit?
@@ -103,11 +104,17 @@ check_valid_float_dot_end (char character)
103104
return character != '.' && character != '_' && !ISALPHA (character);
104105
}
105106

106-
// ISSPACE from safe-ctype but may change in future
107107
bool
108-
is_whitespace (char character)
108+
is_whitespace (int character)
109109
{
110-
return ISSPACE (character);
110+
// https://doc.rust-lang.org/reference/whitespace.html
111+
return character == '\t' || character == '\n' || character == '\v'
112+
|| character == '\f' || character == '\r' || character == ' '
113+
|| character == 0x0085 // next line
114+
|| character == 0x200e // left-to-right mark
115+
|| character == 0x200f // right-to-left mark
116+
|| character == 0x2028 // line separator
117+
|| character == 0x2029; // paragraph separator
111118
}
112119

113120
bool
@@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character)
116123
return character == 'x' || character == 'o' || character == 'b';
117124
}
118125

126+
bool
127+
is_identifier_start (int codepoint)
128+
{
129+
return (check_xid_property (codepoint) & XID_START) || codepoint == '_';
130+
}
131+
132+
bool
133+
is_identifier_continue (int codepoint)
134+
{
135+
return check_xid_property (codepoint) & XID_CONTINUE;
136+
}
137+
119138
Lexer::Lexer (const std::string &input)
120139
: input (RAIIFile::create_error ()), current_line (1), current_column (1),
121140
line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()),
@@ -284,22 +303,22 @@ Lexer::build_token ()
284303
while (true)
285304
{
286305
Location loc = get_current_location ();
287-
current_char = peek_input ();
288-
skip_input ();
289306

290307
// detect UTF8 bom
291308
//
292309
// Must be the first thing on the first line.
293310
// There might be an optional BOM (Byte Order Mark), which for UTF-8 is
294311
// the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
295-
if (current_line == 1 && current_column == 1 && current_char == 0xef
296-
&& peek_input () == 0xbb && peek_input (1) == 0xbf)
312+
if (current_line == 1 && current_column == 1 && peek_input () == 0xef
313+
&& peek_input (1) == 0xbb && peek_input (2) == 0xbf)
297314
{
298-
skip_input (1);
299-
current_char = peek_input ();
300-
skip_input ();
315+
skip_input (2);
301316
}
302317

318+
current_char = peek_input ();
319+
current_char32 = peek_codepoint_input ();
320+
skip_codepoint_input ();
321+
303322
// detect shebang
304323
// Must be the first thing on the first line, starting with #!
305324
// But since an attribute can also start with an #! we don't count it as a
@@ -312,6 +331,7 @@ Lexer::build_token ()
312331
int n = 1;
313332
while (true)
314333
{
334+
// TODO use utf-8 codepoint to skip whitespaces
315335
int next_char = peek_input (n);
316336
if (is_whitespace (next_char))
317337
n++;
@@ -1052,7 +1072,8 @@ Lexer::build_token ()
10521072
int peek = peek_input ();
10531073
int peek1 = peek_input (1);
10541074

1055-
if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
1075+
// TODO (tamaron) parse Unicode ident
1076+
if (peek == '#' && is_identifier_start (peek1))
10561077
{
10571078
TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
10581079
if (raw_ident_ptr != nullptr)
@@ -1069,8 +1090,8 @@ Lexer::build_token ()
10691090
}
10701091
}
10711092

1072-
// find identifiers and keywords
1073-
if (ISALPHA (current_char) || current_char == '_')
1093+
// find identifiers and keywords.
1094+
if (is_identifier_start (current_char32.value))
10741095
return parse_identifier_or_keyword (loc);
10751096

10761097
// int and float literals
@@ -1468,6 +1489,7 @@ Lexer::parse_partial_string_continue ()
14681489
int additional_length_offset = 1;
14691490

14701491
// string continue
1492+
// TODO use utf-8 codepoint to skip whitespaces
14711493
while (is_whitespace (current_char))
14721494
{
14731495
if (current_char == '\n')
@@ -1611,6 +1633,7 @@ Lexer::parse_partial_unicode_escape ()
16111633
// wrong bracket, whitespace or single/double quotes are wrong
16121634
// termination, otherwise it is a wrong character, then skip to the actual
16131635
// terminator.
1636+
// TODO use utf-8 codepoint to skip whitespaces
16141637
if (current_char == '{' || is_whitespace (current_char)
16151638
|| current_char == '\'' || current_char == '"')
16161639
{
@@ -1623,6 +1646,7 @@ Lexer::parse_partial_unicode_escape ()
16231646
rust_error_at (get_current_location (),
16241647
"invalid character %<%c%> in unicode escape",
16251648
current_char);
1649+
// TODO use utf-8 codepoint to skip whitespaces
16261650
while (current_char != '}' && current_char != '{'
16271651
&& !is_whitespace (current_char) && current_char != '\''
16281652
&& current_char != '"')
@@ -1905,8 +1929,7 @@ Lexer::parse_raw_identifier (Location loc)
19051929
int length = 0;
19061930
current_char = peek_input ();
19071931
// loop through entire name
1908-
while (ISALPHA (current_char) || ISDIGIT (current_char)
1909-
|| current_char == '_')
1932+
while (is_identifier_continue (current_char))
19101933
{
19111934
length++;
19121935

@@ -2042,21 +2065,22 @@ Lexer::parse_identifier_or_keyword (Location loc)
20422065
{
20432066
std::string str;
20442067
str.reserve (16); // default
2045-
str += current_char;
2068+
str += current_char32.as_string ();
20462069

20472070
bool first_is_underscore = current_char == '_';
20482071

20492072
int length = 1;
2050-
current_char = peek_input ();
2073+
current_char32 = peek_codepoint_input ();
2074+
20512075
// loop through entire name
2052-
while (ISALPHA (current_char) || ISDIGIT (current_char)
2053-
|| current_char == '_')
2076+
while (is_identifier_continue (current_char32.value))
20542077
{
2078+
auto s = current_char32.as_string ();
20552079
length++;
20562080

2057-
str += current_char;
2058-
skip_input ();
2059-
current_char = peek_input ();
2081+
str += current_char32.as_string ();
2082+
skip_codepoint_input ();
2083+
current_char32 = peek_codepoint_input ();
20602084
}
20612085

20622086
current_column += length;
@@ -2444,28 +2468,29 @@ Lexer::parse_char_or_lifetime (Location loc)
24442468

24452469
return Token::make_char (loc, current_char32);
24462470
}
2447-
else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
2448-
|| current_char32.value == '_')
2471+
else if (is_identifier_start (current_char32.value))
24492472
{
24502473
// parse lifetime name
24512474
std::string str;
24522475
str += current_char32;
24532476
length++;
24542477

2455-
current_char = peek_input ();
2456-
while (ISDIGIT (current_char) || ISALPHA (current_char)
2457-
|| current_char == '_')
2478+
current_char32 = peek_codepoint_input ();
2479+
while (is_identifier_continue (current_char32.value))
24582480
{
2459-
str += current_char;
2460-
skip_input ();
2461-
current_char = peek_input ();
2481+
str += current_char32;
2482+
skip_codepoint_input ();
2483+
current_char32 = peek_codepoint_input ();
24622484
length++;
24632485
}
24642486

24652487
current_column += length;
24662488

24672489
loc += length - 1;
24682490

2491+
// TODO some keywords cannot be used for a lifetime label #2306
2492+
// https://doc.rust-lang.org/reference/tokens.html
2493+
24692494
str.shrink_to_fit ();
24702495
return Token::make_lifetime (loc, std::move (str));
24712496
}
@@ -2637,6 +2662,8 @@ Lexer::peek_codepoint_input ()
26372662
void
26382663
Lexer::skip_codepoint_input ()
26392664
{
2665+
if (peek_input () == EOF)
2666+
return;
26402667
int toSkip = get_input_codepoint_length ();
26412668
gcc_assert (toSkip >= 1);
26422669

gcc/rust/lex/rust-lex.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,9 @@ class Lexer
118118
// Advances current input char to n + 1 chars ahead of current position.
119119
void skip_input (int n);
120120

121-
// Returns char n chars ahead of current position.
122-
int peek_input ();
123121
// Peeks the current char.
122+
int peek_input ();
123+
// Returns char n bytes ahead of current position.
124124
int peek_input (int n);
125125

126126
// Classifies keyword (i.e. gets id for keyword).
@@ -137,6 +137,7 @@ class Lexer
137137

138138
int get_input_codepoint_length ();
139139
int test_get_input_codepoint_n_length (int n_start_offset);
140+
// Peeks the current utf-8 char
140141
Codepoint peek_codepoint_input ();
141142
Codepoint test_peek_codepoint_input (int n);
142143
void skip_codepoint_input ();
@@ -220,6 +221,7 @@ class Lexer
220221
int current_column;
221222
// Current character.
222223
int current_char;
224+
Codepoint current_char32;
223225
// Line map.
224226
Linemap *line_map;
225227

0 commit comments

Comments
 (0)