22
22
#include "rust-linemap.h"
23
23
#include "rust-session-manager.h"
24
24
#include "safe-ctype.h"
25
+ #include "cpplib.h"
25
26
26
27
namespace Rust {
27
28
// TODO: move to separate compilation unit?
@@ -103,11 +104,17 @@ check_valid_float_dot_end (char character)
103
104
return character != '.' && character != '_' && !ISALPHA (character);
104
105
}
105
106
106
- // ISSPACE from safe-ctype but may change in future
107
107
bool
108
- is_whitespace (char character)
108
+ is_whitespace (int character)
109
109
{
110
- return ISSPACE (character);
110
+ // https://doc.rust-lang.org/reference/whitespace.html
111
+ return character == '\t' || character == '\n' || character == '\v'
112
+ || character == '\f' || character == '\r' || character == ' '
113
+ || character == 0x0085 // next line
114
+ || character == 0x200e // left-to-right mark
115
+ || character == 0x200f // right-to-left mark
116
+ || character == 0x2028 // line separator
117
+ || character == 0x2029; // paragraph separator
111
118
}
112
119
113
120
bool
@@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character)
116
123
return character == 'x' || character == 'o' || character == 'b';
117
124
}
118
125
126
+ bool
127
+ is_identifier_start (int codepoint)
128
+ {
129
+ return (check_xid_property (codepoint) & XID_START) || codepoint == '_';
130
+ }
131
+
132
+ bool
133
+ is_identifier_continue (int codepoint)
134
+ {
135
+ return check_xid_property (codepoint) & XID_CONTINUE;
136
+ }
137
+
119
138
Lexer::Lexer (const std::string &input)
120
139
: input (RAIIFile::create_error ()), current_line (1), current_column (1),
121
140
line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()),
@@ -284,22 +303,22 @@ Lexer::build_token ()
284
303
while (true)
285
304
{
286
305
Location loc = get_current_location ();
287
- current_char = peek_input ();
288
- skip_input ();
289
306
290
307
// detect UTF8 bom
291
308
//
292
309
// Must be the first thing on the first line.
293
310
// There might be an optional BOM (Byte Order Mark), which for UTF-8 is
294
311
// the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
295
- if (current_line == 1 && current_column == 1 && current_char == 0xef
296
- && peek_input () == 0xbb && peek_input (1 ) == 0xbf)
312
+ if (current_line == 1 && current_column == 1 && peek_input () == 0xef
313
+ && peek_input (1 ) == 0xbb && peek_input (2 ) == 0xbf)
297
314
{
298
- skip_input (1);
299
- current_char = peek_input ();
300
- skip_input ();
315
+ skip_input (2);
301
316
}
302
317
318
+ current_char = peek_input ();
319
+ current_char32 = peek_codepoint_input ();
320
+ skip_codepoint_input ();
321
+
303
322
// detect shebang
304
323
// Must be the first thing on the first line, starting with #!
305
324
// But since an attribute can also start with an #! we don't count it as a
@@ -312,6 +331,7 @@ Lexer::build_token ()
312
331
int n = 1;
313
332
while (true)
314
333
{
334
+ // TODO use utf-8 codepoint to skip whitespaces
315
335
int next_char = peek_input (n);
316
336
if (is_whitespace (next_char))
317
337
n++;
@@ -1052,7 +1072,8 @@ Lexer::build_token ()
1052
1072
int peek = peek_input ();
1053
1073
int peek1 = peek_input (1);
1054
1074
1055
- if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
1075
+ // TODO (tamaron) parse Unicode ident
1076
+ if (peek == '#' && is_identifier_start (peek1))
1056
1077
{
1057
1078
TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
1058
1079
if (raw_ident_ptr != nullptr)
@@ -1069,8 +1090,8 @@ Lexer::build_token ()
1069
1090
}
1070
1091
}
1071
1092
1072
- // find identifiers and keywords
1073
- if (ISALPHA (current_char) || current_char == '_' )
1093
+ // find identifiers and keywords.
1094
+ if (is_identifier_start (current_char32.value) )
1074
1095
return parse_identifier_or_keyword (loc);
1075
1096
1076
1097
// int and float literals
@@ -1468,6 +1489,7 @@ Lexer::parse_partial_string_continue ()
1468
1489
int additional_length_offset = 1;
1469
1490
1470
1491
// string continue
1492
+ // TODO use utf-8 codepoint to skip whitespaces
1471
1493
while (is_whitespace (current_char))
1472
1494
{
1473
1495
if (current_char == '\n')
@@ -1611,6 +1633,7 @@ Lexer::parse_partial_unicode_escape ()
1611
1633
// wrong bracket, whitespace or single/double quotes are wrong
1612
1634
// termination, otherwise it is a wrong character, then skip to the actual
1613
1635
// terminator.
1636
+ // TODO use utf-8 codepoint to skip whitespaces
1614
1637
if (current_char == '{' || is_whitespace (current_char)
1615
1638
|| current_char == '\'' || current_char == '"')
1616
1639
{
@@ -1623,6 +1646,7 @@ Lexer::parse_partial_unicode_escape ()
1623
1646
rust_error_at (get_current_location (),
1624
1647
"invalid character %<%c%> in unicode escape",
1625
1648
current_char);
1649
+ // TODO use utf-8 codepoint to skip whitespaces
1626
1650
while (current_char != '}' && current_char != '{'
1627
1651
&& !is_whitespace (current_char) && current_char != '\''
1628
1652
&& current_char != '"')
@@ -1905,8 +1929,7 @@ Lexer::parse_raw_identifier (Location loc)
1905
1929
int length = 0;
1906
1930
current_char = peek_input ();
1907
1931
// loop through entire name
1908
- while (ISALPHA (current_char) || ISDIGIT (current_char)
1909
- || current_char == '_')
1932
+ while (is_identifier_continue (current_char))
1910
1933
{
1911
1934
length++;
1912
1935
@@ -2042,21 +2065,22 @@ Lexer::parse_identifier_or_keyword (Location loc)
2042
2065
{
2043
2066
std::string str;
2044
2067
str.reserve (16); // default
2045
- str += current_char ;
2068
+ str += current_char32.as_string () ;
2046
2069
2047
2070
bool first_is_underscore = current_char == '_';
2048
2071
2049
2072
int length = 1;
2050
- current_char = peek_input ();
2073
+ current_char32 = peek_codepoint_input ();
2074
+
2051
2075
// loop through entire name
2052
- while (ISALPHA (current_char) || ISDIGIT (current_char)
2053
- || current_char == '_')
2076
+ while (is_identifier_continue (current_char32.value))
2054
2077
{
2078
+ auto s = current_char32.as_string ();
2055
2079
length++;
2056
2080
2057
- str += current_char ;
2058
- skip_input ();
2059
- current_char = peek_input ();
2081
+ str += current_char32.as_string () ;
2082
+ skip_codepoint_input ();
2083
+ current_char32 = peek_codepoint_input ();
2060
2084
}
2061
2085
2062
2086
current_column += length;
@@ -2444,28 +2468,29 @@ Lexer::parse_char_or_lifetime (Location loc)
2444
2468
2445
2469
return Token::make_char (loc, current_char32);
2446
2470
}
2447
- else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
2448
- || current_char32.value == '_')
2471
+ else if (is_identifier_start (current_char32.value))
2449
2472
{
2450
2473
// parse lifetime name
2451
2474
std::string str;
2452
2475
str += current_char32;
2453
2476
length++;
2454
2477
2455
- current_char = peek_input ();
2456
- while (ISDIGIT (current_char) || ISALPHA (current_char)
2457
- || current_char == '_')
2478
+ current_char32 = peek_codepoint_input ();
2479
+ while (is_identifier_continue (current_char32.value))
2458
2480
{
2459
- str += current_char ;
2460
- skip_input ();
2461
- current_char = peek_input ();
2481
+ str += current_char32 ;
2482
+ skip_codepoint_input ();
2483
+ current_char32 = peek_codepoint_input ();
2462
2484
length++;
2463
2485
}
2464
2486
2465
2487
current_column += length;
2466
2488
2467
2489
loc += length - 1;
2468
2490
2491
+ // TODO some keywords cannot be used for a lifetime label #2306
2492
+ // https://doc.rust-lang.org/reference/tokens.html
2493
+
2469
2494
str.shrink_to_fit ();
2470
2495
return Token::make_lifetime (loc, std::move (str));
2471
2496
}
@@ -2637,6 +2662,8 @@ Lexer::peek_codepoint_input ()
2637
2662
void
2638
2663
Lexer::skip_codepoint_input ()
2639
2664
{
2665
+ if (peek_input () == EOF)
2666
+ return;
2640
2667
int toSkip = get_input_codepoint_length ();
2641
2668
gcc_assert (toSkip >= 1);
2642
2669
0 commit comments