@@ -306,8 +306,7 @@ Lexer::build_token ()
306
306
Location loc = get_current_location ();
307
307
308
308
current_char = peek_input ();
309
- current_char32 = peek_codepoint_input ();
310
- skip_codepoint_input ();
309
+ skip_input ();
311
310
312
311
// detect shebang
313
312
// Must be the first thing on the first line, starting with #!
@@ -1080,7 +1079,7 @@ Lexer::build_token ()
1080
1079
}
1081
1080
1082
1081
// find identifiers and keywords.
1083
- if (is_identifier_start (current_char32 .value ))
1082
+ if (is_identifier_start (current_char .value ))
1084
1083
return parse_identifier_or_keyword (loc);
1085
1084
1086
1085
// int and float literals
@@ -1985,59 +1984,56 @@ Lexer::skip_broken_string_input (Codepoint current_char)
1985
1984
current_column);
1986
1985
}
1987
1986
1988
- // Parses a unicode string.
1987
+ // Parses a string.
1989
1988
TokenPtr
1990
1989
Lexer::parse_string (Location loc)
1991
1990
{
1992
- Codepoint current_char32;
1993
-
1994
1991
std::string str;
1995
1992
str.reserve (16 ); // some sensible default
1996
1993
1997
1994
int length = 1 ;
1998
- current_char32 = peek_codepoint_input ();
1995
+ current_char = peek_input ();
1999
1996
2000
1997
// FIXME: This fails if the input ends. How do we check for EOF?
2001
- while (current_char32 .value != ' "' && !current_char32 .is_eof ())
1998
+ while (current_char .value != ' "' && !current_char .is_eof ())
2002
1999
{
2003
- if (current_char32 .value == ' \\ ' )
2000
+ if (current_char .value == ' \\ ' )
2004
2001
{
2005
2002
// parse escape
2006
2003
auto utf8_escape_pair = parse_utf8_escape ();
2007
- current_char32 = std::get<0 > (utf8_escape_pair);
2004
+ current_char = std::get<0 > (utf8_escape_pair);
2008
2005
2009
- if (current_char32 == Codepoint (0 ) && std::get<2 > (utf8_escape_pair))
2006
+ if (current_char == Codepoint (0 ) && std::get<2 > (utf8_escape_pair))
2010
2007
length = std::get<1 > (utf8_escape_pair) - 1 ;
2011
2008
else
2012
2009
length += std::get<1 > (utf8_escape_pair);
2013
2010
2014
- if (current_char32 != Codepoint (0 )
2015
- || !std::get<2 > (utf8_escape_pair))
2016
- str += current_char32;
2017
-
2018
- // required as parsing utf8 escape only changes current_char
2019
- current_char32 = peek_codepoint_input ();
2011
+ if (current_char != Codepoint (0 ) || !std::get<2 > (utf8_escape_pair))
2012
+ str += current_char.as_string ();
2020
2013
2014
+ // FIXME: the re-peek below should be removed, but it cannot be yet:
2015
+ // `parse_utf8_escape` does not update `current_char` correctly.
2016
+ current_char = peek_input ();
2021
2017
continue ;
2022
2018
}
2023
2019
2024
- length += get_input_codepoint_length () ;
2020
+ length++ ;
2025
2021
2026
- str += current_char32 ;
2027
- skip_codepoint_input ();
2028
- current_char32 = peek_codepoint_input ();
2022
+ str += current_char ;
2023
+ skip_input ();
2024
+ current_char = peek_input ();
2029
2025
}
2030
2026
2031
2027
current_column += length;
2032
2028
2033
- if (current_char32 .value == ' "' )
2029
+ if (current_char .value == ' "' )
2034
2030
{
2035
2031
current_column++;
2036
2032
2037
2033
skip_input ();
2038
2034
current_char = peek_input ();
2039
2035
}
2040
- else if (current_char32 .is_eof ())
2036
+ else if (current_char .is_eof ())
2041
2037
{
2042
2038
rust_error_at (get_current_location (), " unended string literal" );
2043
2039
return Token::make (END_OF_FILE, get_current_location ());
@@ -2059,22 +2055,22 @@ Lexer::parse_identifier_or_keyword (Location loc)
2059
2055
{
2060
2056
std::string str;
2061
2057
str.reserve (16 ); // default
2062
- str += current_char32 .as_string ();
2058
+ str += current_char .as_string ();
2063
2059
2064
2060
bool first_is_underscore = current_char == ' _' ;
2065
2061
2066
2062
int length = 1 ;
2067
- current_char32 = peek_codepoint_input ();
2063
+ current_char = peek_input ();
2068
2064
2069
2065
// loop through entire name
2070
- while (is_identifier_continue (current_char32 .value ))
2066
+ while (is_identifier_continue (current_char .value ))
2071
2067
{
2072
- auto s = current_char32 .as_string ();
2068
+ auto s = current_char .as_string ();
2073
2069
length++;
2074
2070
2075
- str += current_char32 .as_string ();
2076
- skip_codepoint_input ();
2077
- current_char32 = peek_codepoint_input ();
2071
+ str += current_char .as_string ();
2072
+ skip_input ();
2073
+ current_char = peek_input ();
2078
2074
}
2079
2075
2080
2076
current_column += length;
@@ -2128,11 +2124,11 @@ Lexer::parse_raw_string (Location loc, int initial_hash_count)
2128
2124
2129
2125
length++;
2130
2126
skip_input ();
2131
- Codepoint current_char32 = peek_codepoint_input ();
2127
+ current_char = peek_input ();
2132
2128
2133
- while (!current_char32 .is_eof ())
2129
+ while (!current_char .is_eof ())
2134
2130
{
2135
- if (current_char32 .value == ' "' )
2131
+ if (current_char .value == ' "' )
2136
2132
{
2137
2133
bool enough_hashes = true ;
2138
2134
@@ -2157,9 +2153,9 @@ Lexer::parse_raw_string (Location loc, int initial_hash_count)
2157
2153
2158
2154
length++;
2159
2155
2160
- str += current_char32 ;
2161
- skip_codepoint_input ();
2162
- current_char32 = peek_codepoint_input ();
2156
+ str += current_char. as_string () ;
2157
+ skip_input ();
2158
+ current_char = peek_input ();
2163
2159
}
2164
2160
2165
2161
current_column += length;
@@ -2411,29 +2407,27 @@ Lexer::parse_decimal_int_or_float (Location loc)
2411
2407
TokenPtr
2412
2408
Lexer::parse_char_or_lifetime (Location loc)
2413
2409
{
2414
- Codepoint current_char32;
2415
-
2416
2410
int length = 1 ;
2417
2411
2418
- current_char32 = peek_codepoint_input ();
2419
- if (current_char32 .is_eof ())
2412
+ current_char = peek_input ();
2413
+ if (current_char .is_eof ())
2420
2414
return nullptr ;
2421
2415
2422
2416
// parse escaped char literal
2423
- if (current_char32 .value == ' \\ ' )
2417
+ if (current_char .value == ' \\ ' )
2424
2418
{
2425
2419
// parse escape
2426
2420
auto utf8_escape_pair = parse_utf8_escape ();
2427
- current_char32 = std::get<0 > (utf8_escape_pair);
2421
+ Codepoint escaped_char = std::get<0 > (utf8_escape_pair);
2428
2422
length += std::get<1 > (utf8_escape_pair);
2429
2423
2430
- if (peek_codepoint_input ().value != ' \' ' )
2424
+ if (peek_input ().value != ' \' ' )
2431
2425
{
2432
2426
rust_error_at (get_current_location (), " unended character literal" );
2433
2427
}
2434
2428
else
2435
2429
{
2436
- skip_codepoint_input ();
2430
+ skip_input ();
2437
2431
current_char = peek_input ();
2438
2432
length++;
2439
2433
}
@@ -2442,15 +2436,16 @@ Lexer::parse_char_or_lifetime (Location loc)
2442
2436
2443
2437
loc += length - 1 ;
2444
2438
2445
- return Token::make_char (loc, current_char32 );
2439
+ return Token::make_char (loc, escaped_char );
2446
2440
}
2447
2441
else
2448
2442
{
2449
- skip_codepoint_input ();
2443
+ skip_input ();
2450
2444
2451
- if (peek_codepoint_input ().value == ' \' ' )
2445
+ if (peek_input ().value == ' \' ' )
2452
2446
{
2453
2447
// parse non-escaped char literal
2448
+ Codepoint non_escaped_char = current_char;
2454
2449
2455
2450
// skip the ' character
2456
2451
skip_input ();
@@ -2461,21 +2456,21 @@ Lexer::parse_char_or_lifetime (Location loc)
2461
2456
2462
2457
loc += 2 ;
2463
2458
2464
- return Token::make_char (loc, current_char32 );
2459
+ return Token::make_char (loc, non_escaped_char );
2465
2460
}
2466
- else if (is_identifier_start (current_char32 .value ))
2461
+ else if (is_identifier_start (current_char .value ))
2467
2462
{
2468
2463
// parse lifetime name
2469
2464
std::string str;
2470
- str += current_char32 ;
2465
+ str += current_char. as_string () ;
2471
2466
length++;
2472
2467
2473
- current_char32 = peek_codepoint_input ();
2474
- while (is_identifier_continue (current_char32 .value ))
2468
+ current_char = peek_input ();
2469
+ while (is_identifier_continue (current_char .value ))
2475
2470
{
2476
- str += current_char32 ;
2477
- skip_codepoint_input ();
2478
- current_char32 = peek_codepoint_input ();
2471
+ str += current_char. as_string () ;
2472
+ skip_input ();
2473
+ current_char = peek_input ();
2479
2474
length++;
2480
2475
}
2481
2476
@@ -2499,29 +2494,6 @@ Lexer::parse_char_or_lifetime (Location loc)
2499
2494
}
2500
2495
}
2501
2496
2502
- // TODO remove this function
2503
- // Returns the length of the codepoint at the current position.
2504
- int
2505
- Lexer::get_input_codepoint_length ()
2506
- {
2507
- return 1 ;
2508
- }
2509
-
2510
- // TODO remove this function
2511
- // Returns the codepoint at the current position.
2512
- Codepoint
2513
- Lexer::peek_codepoint_input ()
2514
- {
2515
- return peek_input ();
2516
- }
2517
-
2518
- // TODO remove this function
2519
- void
2520
- Lexer::skip_codepoint_input ()
2521
- {
2522
- skip_input ();
2523
- }
2524
-
2525
2497
void
2526
2498
Lexer::split_current_token (TokenId new_left, TokenId new_right)
2527
2499
{
0 commit comments