Skip to content

Commit 2b1d37c

Browse files
tamaroning authored and P-E-P committed
gccrs: clean up Codepoint and InputSource
gcc/rust/ChangeLog:

* lex/rust-codepoint.h: Moved to...
* util/rust-codepoint.h: ...here.
* lex/rust-input-source.h: Add missing license
* util/rust-unicode.cc: Add missing license
* util/rust-punycode.cc (extract_basic_string): Remove constant

Signed-off-by: Raiki Tamura <[email protected]>
1 parent 969439f commit 2b1d37c

File tree

4 files changed

+66
-26
lines changed

4 files changed

+66
-26
lines changed

gcc/rust/lex/rust-input-source.h

+47-23
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,36 @@
1+
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
2+
3+
// This file is part of GCC.
4+
5+
// GCC is free software; you can redistribute it and/or modify it under
6+
// the terms of the GNU General Public License as published by the Free
7+
// Software Foundation; either version 3, or (at your option) any later
8+
// version.
9+
10+
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11+
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
12+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13+
// for more details.
14+
15+
// You should have received a copy of the GNU General Public License
16+
// along with GCC; see the file COPYING3. If not see
17+
// <http://www.gnu.org/licenses/>.
18+
119
#ifndef RUST_INPUT_SOURCE_H
220
#define RUST_INPUT_SOURCE_H
321

422
#include "rust-codepoint.h"
523
#include "optional.h"
624

725
namespace Rust {
26+
27+
constexpr uint8_t UTF8_BOM1 = 0xEF;
28+
constexpr uint8_t UTF8_BOM2 = 0xBB;
29+
constexpr uint8_t UTF8_BOM3 = 0xBF;
30+
31+
constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
32+
constexpr uint32_t CODEPOINT_INVALID = 0xFFFE;
33+
834
// Input source wrapper thing.
935
class InputSource
1036
{
@@ -23,22 +49,22 @@ class InputSource
2349

2450
if ((int32_t) input == EOF)
2551
return Codepoint::eof ();
26-
else if (input < 128)
52+
else if (input <= MAX_ASCII_CODEPOINT)
2753
{
2854
// ascii -- 1 byte
2955
return {input};
3056
}
3157
else if ((input & 0xC0) == 0x80)
3258
{
3359
// invalid (continuation; can't be first char)
34-
return {0xFFFE};
60+
return {CODEPOINT_INVALID};
3561
}
3662
else if ((input & 0xE0) == 0xC0)
3763
{
3864
// 2 bytes
3965
uint8_t input2 = next_byte ();
4066
if ((input2 & 0xC0) != 0x80)
41-
return {0xFFFE};
67+
return {CODEPOINT_INVALID};
4268

4369
uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
4470
return output;
@@ -50,23 +76,23 @@ class InputSource
5076
// If the second byte is equal to 0xBB then the input is no longer a
5177
// valid UTF-8 char. Then, we check if the third byte makes up a UTF
5278
// BOM.
53-
if (input == 0xEF && input2 == 0xBB)
79+
if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
5480
{
5581
uint8_t input3 = next_byte ();
56-
if (input3 == 0xBF)
82+
if (input3 == UTF8_BOM3)
5783
// found BOM
5884
return next_codepoint ();
5985
else
60-
return {0xFFFE};
86+
return {CODEPOINT_INVALID};
6187
}
6288

6389
if ((input2 & 0xC0) != 0x80)
64-
return {0xFFFE};
90+
return {CODEPOINT_INVALID};
6591

6692
uint8_t input3 = next_byte ();
6793

6894
if ((input3 & 0xC0) != 0x80)
69-
return {0xFFFE};
95+
return {CODEPOINT_INVALID};
7096

7197
uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
7298
| ((input3 & 0x3F) << 0);
@@ -77,39 +103,42 @@ class InputSource
77103
// 4 bytes
78104
uint8_t input2 = next_byte ();
79105
if ((input2 & 0xC0) != 0x80)
80-
return {0xFFFE};
106+
return {CODEPOINT_INVALID};
81107

82108
uint8_t input3 = next_byte ();
83109
if ((input3 & 0xC0) != 0x80)
84-
return {0xFFFE};
110+
return {CODEPOINT_INVALID};
85111

86112
uint8_t input4 = next_byte ();
87113
if ((input4 & 0xC0) != 0x80)
88-
return {0xFFFE};
114+
return {CODEPOINT_INVALID};
89115

90116
uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
91117
| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
92118
return {output};
93119
}
94120
else
95121
{
96-
return {0xFFFE};
122+
return {CODEPOINT_INVALID};
97123
}
98124
}
99125

100126
protected:
101-
// Check if the input source is valid as utf-8 and copy all characters to
102-
// `chars`.
127+
// This method must be called by the constructor to initialize the input
128+
// source. We cannot move this to the constructor because it calls a
129+
// virtual method.
103130
void init ()
104131
{
132+
// Check if the input source is valid as utf-8 and copy all characters to
133+
// `chars`.
105134
Codepoint char32 = next_codepoint ();
106-
while (!char32.is_eof () && char32 != 0xFFFE)
135+
while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
107136
{
108137
chars.push_back (char32);
109138
char32 = next_codepoint ();
110139
}
111140

112-
if (char32 == 0xFFFE)
141+
if (char32 == CODEPOINT_INVALID)
113142
{
114143
// Input source is not valid as utf-8.
115144
is_valid_utf8 = false;
@@ -158,11 +187,7 @@ class FileInputSource : public InputSource
158187

159188
public:
160189
// Create new input source from file.
161-
FileInputSource (FILE *input) : InputSource (), input (input)
162-
{
163-
// TODO make this better?
164-
init ();
165-
}
190+
FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
166191
};
167192

168193
class BufferInputSource : public InputSource
@@ -175,15 +200,14 @@ class BufferInputSource : public InputSource
175200
{
176201
if (offs >= buffer.size ())
177202
return EOF;
178-
return (uint8_t) buffer.at (offs++);
203+
return static_cast<uint8_t> (buffer.at (offs++));
179204
}
180205

181206
public:
182207
// Create new input source from a string buffer, starting at `offset`.
183208
BufferInputSource (const std::string &b, size_t offset)
184209
: InputSource (), buffer (b), offs (offset)
185210
{
186-
// TODO make this better?
187211
init ();
188212
}
189213
};
File renamed without changes.

gcc/rust/util/rust-punycode.cc

+1-3
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,13 @@ constexpr uint32_t INITIAL_BIAS = 72;
3636
constexpr uint32_t INITIAL_N = 128;
3737
constexpr char DELIMITER = '-';
3838

39-
constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
40-
4139
std::string
4240
extract_basic_string (const std::vector<Codepoint> &src)
4341
{
4442
std::string basic_string;
4543
for (auto c : src)
4644
{
47-
if (c.value <= MAX_ASCII_CODEPOINT)
45+
if (c.value <= 0x7F)
4846
basic_string += c.as_string ();
4947
}
5048
return basic_string;

gcc/rust/util/rust-unicode.cc

+18
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,21 @@
1+
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
2+
3+
// This file is part of GCC.
4+
5+
// GCC is free software; you can redistribute it and/or modify it under
6+
// the terms of the GNU General Public License as published by the Free
7+
// Software Foundation; either version 3, or (at your option) any later
8+
// version.
9+
10+
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11+
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
12+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13+
// for more details.
14+
15+
// You should have received a copy of the GNU General Public License
16+
// along with GCC; see the file COPYING3. If not see
17+
// <http://www.gnu.org/licenses/>.
18+
119
#include "rust-system.h"
220
#include "optional.h"
321
#include "selftest.h"

0 commit comments

Comments
 (0)