1
+ // Copyright (C) 2020-2023 Free Software Foundation, Inc.
2
+
3
+ // This file is part of GCC.
4
+
5
+ // GCC is free software; you can redistribute it and/or modify it under
6
+ // the terms of the GNU General Public License as published by the Free
7
+ // Software Foundation; either version 3, or (at your option) any later
8
+ // version.
9
+
10
+ // GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11
+ // WARRANTY; without even the implied warranty of MERCHANTABILITY or
12
+ // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13
+ // for more details.
14
+
15
+ // You should have received a copy of the GNU General Public License
16
+ // along with GCC; see the file COPYING3. If not see
17
+ // <http://www.gnu.org/licenses/>.
18
+
1
19
#ifndef RUST_INPUT_SOURCE_H
2
20
#define RUST_INPUT_SOURCE_H
3
21
4
22
#include " rust-codepoint.h"
5
23
#include " optional.h"
6
24
7
25
namespace Rust {
26
+
27
// The three bytes of the UTF-8 byte order mark (0xEF 0xBB 0xBF), in order.
+ constexpr uint8_t UTF8_BOM1 = 0xEF ;
28
+ constexpr uint8_t UTF8_BOM2 = 0xBB ;
29
+ constexpr uint8_t UTF8_BOM3 = 0xBF ;
30
+
31
// Largest codepoint that is encoded as a single byte (ASCII).
+ constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F ;
32
// Sentinel codepoint returned when the input is not valid UTF-8.
+ constexpr uint32_t CODEPOINT_INVALID = 0xFFFE ;
33
+
8
34
// Base class for input sources; decodes the input bytes as UTF-8 codepoints.
9
35
class InputSource
10
36
{
@@ -23,22 +49,22 @@ class InputSource
23
49
24
50
if ((int32_t ) input == EOF)
25
51
return Codepoint::eof ();
26
- else if (input < 128 )
52
+ else if (input <= MAX_ASCII_CODEPOINT )
27
53
{
28
54
// ascii -- 1 byte
29
55
return {input};
30
56
}
31
57
else if ((input & 0xC0 ) == 0x80 )
32
58
{
33
59
// invalid (continuation; can't be first char)
34
- return {0xFFFE };
60
+ return {CODEPOINT_INVALID };
35
61
}
36
62
else if ((input & 0xE0 ) == 0xC0 )
37
63
{
38
64
// 2 bytes
39
65
uint8_t input2 = next_byte ();
40
66
if ((input2 & 0xC0 ) != 0x80 )
41
- return {0xFFFE };
67
+ return {CODEPOINT_INVALID };
42
68
43
69
uint32_t output = ((input & 0x1F ) << 6 ) | ((input2 & 0x3F ) << 0 );
44
70
return output;
@@ -50,23 +76,23 @@ class InputSource
50
76
// When the first two bytes are 0xEF 0xBB, the sequence is accepted only
51
77
// if the third byte is 0xBF, completing a UTF-8 BOM, which is skipped.
52
78
// Any other third byte is reported as an invalid codepoint.
53
- if (input == 0xEF && input2 == 0xBB )
79
+ if (input == UTF8_BOM1 && input2 == UTF8_BOM2 )
54
80
{
55
81
uint8_t input3 = next_byte ();
56
- if (input3 == 0xBF )
82
+ if (input3 == UTF8_BOM3 )
57
83
// found BOM
58
84
return next_codepoint ();
59
85
else
60
- return {0xFFFE };
86
+ return {CODEPOINT_INVALID };
61
87
}
62
88
63
89
if ((input2 & 0xC0 ) != 0x80 )
64
- return {0xFFFE };
90
+ return {CODEPOINT_INVALID };
65
91
66
92
uint8_t input3 = next_byte ();
67
93
68
94
if ((input3 & 0xC0 ) != 0x80 )
69
- return {0xFFFE };
95
+ return {CODEPOINT_INVALID };
70
96
71
97
uint32_t output = ((input & 0x0F ) << 12 ) | ((input2 & 0x3F ) << 6 )
72
98
| ((input3 & 0x3F ) << 0 );
@@ -77,39 +103,42 @@ class InputSource
77
103
// 4 bytes
78
104
uint8_t input2 = next_byte ();
79
105
if ((input2 & 0xC0 ) != 0x80 )
80
- return {0xFFFE };
106
+ return {CODEPOINT_INVALID };
81
107
82
108
uint8_t input3 = next_byte ();
83
109
if ((input3 & 0xC0 ) != 0x80 )
84
- return {0xFFFE };
110
+ return {CODEPOINT_INVALID };
85
111
86
112
uint8_t input4 = next_byte ();
87
113
if ((input4 & 0xC0 ) != 0x80 )
88
- return {0xFFFE };
114
+ return {CODEPOINT_INVALID };
89
115
90
116
uint32_t output = ((input & 0x07 ) << 18 ) | ((input2 & 0x3F ) << 12 )
91
117
| ((input3 & 0x3F ) << 6 ) | ((input4 & 0x3F ) << 0 );
92
118
return {output};
93
119
}
94
120
else
95
121
{
96
- return {0xFFFE };
122
+ return {CODEPOINT_INVALID };
97
123
}
98
124
}
99
125
100
126
protected:
101
- // Check if the input source is valid as utf-8 and copy all characters to
102
- // `chars`.
127
+ // This method must be called by the constructor of each derived class to
128
+ // initialize the input source. It cannot be moved into the InputSource
129
+ // constructor itself because it calls a virtual method.
103
130
void init ()
104
131
{
132
+ // Check if the input source is valid as utf-8 and copy all characters to
133
+ // `chars`.
105
134
Codepoint char32 = next_codepoint ();
106
- while (!char32.is_eof () && char32 != 0xFFFE )
135
+ while (!char32.is_eof () && char32 != CODEPOINT_INVALID )
107
136
{
108
137
chars.push_back (char32);
109
138
char32 = next_codepoint ();
110
139
}
111
140
112
- if (char32 == 0xFFFE )
141
+ if (char32 == CODEPOINT_INVALID )
113
142
{
114
143
// Input source is not valid as utf-8.
115
144
is_valid_utf8 = false ;
@@ -158,11 +187,7 @@ class FileInputSource : public InputSource
158
187
159
188
public:
160
189
// Create new input source from file.
161
- FileInputSource (FILE *input) : InputSource (), input (input)
162
- {
163
- // TODO make this better?
164
- init ();
165
- }
190
+ FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
166
191
};
167
192
168
193
class BufferInputSource : public InputSource
@@ -175,15 +200,14 @@ class BufferInputSource : public InputSource
175
200
{
176
201
if (offs >= buffer.size ())
177
202
return EOF;
178
- return ( uint8_t ) buffer.at (offs++);
203
+ return static_cast < uint8_t > ( buffer.at (offs++) );
179
204
}
180
205
181
206
public:
182
207
// Create new input source from a string buffer, starting at byte `offset`.
183
208
BufferInputSource (const std::string &b, size_t offset)
184
209
: InputSource (), buffer (b), offs (offset)
185
210
{
186
- // TODO make this better?
187
211
init ();
188
212
}
189
213
};
0 commit comments