@@ -306,8 +306,7 @@ Lexer::build_token ()
306
306
Location loc = get_current_location ();
307
307
308
308
current_char = peek_input ();
309
- current_char32 = peek_codepoint_input ();
310
- skip_codepoint_input ();
309
+ skip_input ();
311
310
312
311
// detect shebang
313
312
// Must be the first thing on the first line, starting with #!
@@ -1089,7 +1088,7 @@ Lexer::build_token ()
1089
1088
}
1090
1089
1091
1090
// find identifiers and keywords.
1092
- if (is_identifier_start (current_char32 .value ))
1091
+ if (is_identifier_start (current_char .value ))
1093
1092
return parse_identifier_or_keyword (loc);
1094
1093
1095
1094
// int and float literals
@@ -1998,59 +1997,56 @@ Lexer::skip_broken_string_input (Codepoint current_char)
1998
1997
current_column);
1999
1998
}
2000
1999
2001
- // Parses a unicode string.
2000
+ // Parses a string.
2002
2001
TokenPtr
2003
2002
Lexer::parse_string (Location loc)
2004
2003
{
2005
- Codepoint current_char32;
2006
-
2007
2004
std::string str;
2008
2005
str.reserve (16 ); // some sensible default
2009
2006
2010
2007
int length = 1 ;
2011
- current_char32 = peek_codepoint_input ();
2008
+ current_char = peek_input ();
2012
2009
2013
2010
// FIXME: This fails if the input ends. How do we check for EOF?
2014
- while (current_char32 .value != ' "' && !current_char32 .is_eof ())
2011
+ while (current_char .value != ' "' && !current_char .is_eof ())
2015
2012
{
2016
- if (current_char32 .value == ' \\ ' )
2013
+ if (current_char .value == ' \\ ' )
2017
2014
{
2018
2015
// parse escape
2019
2016
auto utf8_escape_pair = parse_utf8_escape ();
2020
- current_char32 = std::get<0 > (utf8_escape_pair);
2017
+ current_char = std::get<0 > (utf8_escape_pair);
2021
2018
2022
- if (current_char32 == Codepoint (0 ) && std::get<2 > (utf8_escape_pair))
2019
+ if (current_char == Codepoint (0 ) && std::get<2 > (utf8_escape_pair))
2023
2020
length = std::get<1 > (utf8_escape_pair) - 1 ;
2024
2021
else
2025
2022
length += std::get<1 > (utf8_escape_pair);
2026
2023
2027
- if (current_char32 != Codepoint (0 )
2028
- || !std::get<2 > (utf8_escape_pair))
2029
- str += current_char32;
2030
-
2031
- // required as parsing utf8 escape only changes current_char
2032
- current_char32 = peek_codepoint_input ();
2024
+ if (current_char != Codepoint (0 ) || !std::get<2 > (utf8_escape_pair))
2025
+ str += current_char.as_string ();
2033
2026
2027
+ // FIXME: should remove this but can't.
2028
+ // `parse_utf8_escape` does not update `current_char` correctly.
2029
+ current_char = peek_input ();
2034
2030
continue ;
2035
2031
}
2036
2032
2037
- length += get_input_codepoint_length () ;
2033
+ length++ ;
2038
2034
2039
- str += current_char32 ;
2040
- skip_codepoint_input ();
2041
- current_char32 = peek_codepoint_input ();
2035
+ str += current_char ;
2036
+ skip_input ();
2037
+ current_char = peek_input ();
2042
2038
}
2043
2039
2044
2040
current_column += length;
2045
2041
2046
- if (current_char32 .value == ' "' )
2042
+ if (current_char .value == ' "' )
2047
2043
{
2048
2044
current_column++;
2049
2045
2050
2046
skip_input ();
2051
2047
current_char = peek_input ();
2052
2048
}
2053
- else if (current_char32 .is_eof ())
2049
+ else if (current_char .is_eof ())
2054
2050
{
2055
2051
rust_error_at (get_current_location (), " unended string literal" );
2056
2052
return Token::make (END_OF_FILE, get_current_location ());
@@ -2072,22 +2068,22 @@ Lexer::parse_identifier_or_keyword (Location loc)
2072
2068
{
2073
2069
std::string str;
2074
2070
str.reserve (16 ); // default
2075
- str += current_char32 .as_string ();
2071
+ str += current_char .as_string ();
2076
2072
2077
2073
bool first_is_underscore = current_char == ' _' ;
2078
2074
2079
2075
int length = 1 ;
2080
- current_char32 = peek_codepoint_input ();
2076
+ current_char = peek_input ();
2081
2077
2082
2078
// loop through entire name
2083
- while (is_identifier_continue (current_char32 .value ))
2079
+ while (is_identifier_continue (current_char .value ))
2084
2080
{
2085
- auto s = current_char32 .as_string ();
2081
+ auto s = current_char .as_string ();
2086
2082
length++;
2087
2083
2088
- str += current_char32 .as_string ();
2089
- skip_codepoint_input ();
2090
- current_char32 = peek_codepoint_input ();
2084
+ str += current_char .as_string ();
2085
+ skip_input ();
2086
+ current_char = peek_input ();
2091
2087
}
2092
2088
2093
2089
current_column += length;
@@ -2141,11 +2137,11 @@ Lexer::parse_raw_string (Location loc, int initial_hash_count)
2141
2137
2142
2138
length++;
2143
2139
skip_input ();
2144
- Codepoint current_char32 = peek_codepoint_input ();
2140
+ current_char = peek_input ();
2145
2141
2146
- while (!current_char32 .is_eof ())
2142
+ while (!current_char .is_eof ())
2147
2143
{
2148
- if (current_char32 .value == ' "' )
2144
+ if (current_char .value == ' "' )
2149
2145
{
2150
2146
bool enough_hashes = true ;
2151
2147
@@ -2170,9 +2166,9 @@ Lexer::parse_raw_string (Location loc, int initial_hash_count)
2170
2166
2171
2167
length++;
2172
2168
2173
- str += current_char32 ;
2174
- skip_codepoint_input ();
2175
- current_char32 = peek_codepoint_input ();
2169
+ str += current_char. as_string () ;
2170
+ skip_input ();
2171
+ current_char = peek_input ();
2176
2172
}
2177
2173
2178
2174
current_column += length;
@@ -2424,29 +2420,27 @@ Lexer::parse_decimal_int_or_float (Location loc)
2424
2420
TokenPtr
2425
2421
Lexer::parse_char_or_lifetime (Location loc)
2426
2422
{
2427
- Codepoint current_char32;
2428
-
2429
2423
int length = 1 ;
2430
2424
2431
- current_char32 = peek_codepoint_input ();
2432
- if (current_char32 .is_eof ())
2425
+ current_char = peek_input ();
2426
+ if (current_char .is_eof ())
2433
2427
return nullptr ;
2434
2428
2435
2429
// parse escaped char literal
2436
- if (current_char32 .value == ' \\ ' )
2430
+ if (current_char .value == ' \\ ' )
2437
2431
{
2438
2432
// parse escape
2439
2433
auto utf8_escape_pair = parse_utf8_escape ();
2440
- current_char32 = std::get<0 > (utf8_escape_pair);
2434
+ Codepoint escaped_char = std::get<0 > (utf8_escape_pair);
2441
2435
length += std::get<1 > (utf8_escape_pair);
2442
2436
2443
- if (peek_codepoint_input ().value != ' \' ' )
2437
+ if (peek_input ().value != ' \' ' )
2444
2438
{
2445
2439
rust_error_at (get_current_location (), " unended character literal" );
2446
2440
}
2447
2441
else
2448
2442
{
2449
- skip_codepoint_input ();
2443
+ skip_input ();
2450
2444
current_char = peek_input ();
2451
2445
length++;
2452
2446
}
@@ -2455,15 +2449,16 @@ Lexer::parse_char_or_lifetime (Location loc)
2455
2449
2456
2450
loc += length - 1 ;
2457
2451
2458
- return Token::make_char (loc, current_char32 );
2452
+ return Token::make_char (loc, escaped_char );
2459
2453
}
2460
2454
else
2461
2455
{
2462
- skip_codepoint_input ();
2456
+ skip_input ();
2463
2457
2464
- if (peek_codepoint_input ().value == ' \' ' )
2458
+ if (peek_input ().value == ' \' ' )
2465
2459
{
2466
2460
// parse non-escaped char literal
2461
+ Codepoint non_escaped_char = current_char;
2467
2462
2468
2463
// skip the ' character
2469
2464
skip_input ();
@@ -2474,21 +2469,21 @@ Lexer::parse_char_or_lifetime (Location loc)
2474
2469
2475
2470
loc += 2 ;
2476
2471
2477
- return Token::make_char (loc, current_char32 );
2472
+ return Token::make_char (loc, non_escaped_char );
2478
2473
}
2479
- else if (is_identifier_start (current_char32 .value ))
2474
+ else if (is_identifier_start (current_char .value ))
2480
2475
{
2481
2476
// parse lifetime name
2482
2477
std::string str;
2483
- str += current_char32 ;
2478
+ str += current_char. as_string () ;
2484
2479
length++;
2485
2480
2486
- current_char32 = peek_codepoint_input ();
2487
- while (is_identifier_continue (current_char32 .value ))
2481
+ current_char = peek_input ();
2482
+ while (is_identifier_continue (current_char .value ))
2488
2483
{
2489
- str += current_char32 ;
2490
- skip_codepoint_input ();
2491
- current_char32 = peek_codepoint_input ();
2484
+ str += current_char. as_string () ;
2485
+ skip_input ();
2486
+ current_char = peek_input ();
2492
2487
length++;
2493
2488
}
2494
2489
@@ -2512,29 +2507,6 @@ Lexer::parse_char_or_lifetime (Location loc)
2512
2507
}
2513
2508
}
2514
2509
2515
- // TODO remove this function
2516
- // Returns the length of the codepoint at the current position.
2517
- int
2518
- Lexer::get_input_codepoint_length ()
2519
- {
2520
- return 1 ;
2521
- }
2522
-
2523
- // TODO remove this function
2524
- // Returns the codepoint at the current position.
2525
- Codepoint
2526
- Lexer::peek_codepoint_input ()
2527
- {
2528
- return peek_input ();
2529
- }
2530
-
2531
- // TODO remove this function
2532
- void
2533
- Lexer::skip_codepoint_input ()
2534
- {
2535
- skip_input ();
2536
- }
2537
-
2538
2510
void
2539
2511
Lexer::split_current_token (TokenId new_left, TokenId new_right)
2540
2512
{
0 commit comments