gccrs: clean up Codepoint and InputSource

tamaroning · P-E-P · commit 2b1d37ced313 · 2023-08-11T13:38:25.000Z
gcc/rust/ChangeLog:

	* lex/rust-codepoint.h: Moved to...
	* util/rust-codepoint.h: ...here.
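	* lex/rust-input-source.h: Add missing license header and named
	constants (UTF8_BOM1, UTF8_BOM2, UTF8_BOM3, MAX_ASCII_CODEPOINT,
	CODEPOINT_INVALID).
	* util/rust-unicode.cc: Add missing license header.
	* util/rust-punycode.cc (extract_basic_string): Remove the local
	MAX_ASCII_CODEPOINT constant.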

Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h
@@ -1,10 +1,36 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
 #ifndef RUST_INPUT_SOURCE_H
 #define RUST_INPUT_SOURCE_H
 
 #include "rust-codepoint.h"
 #include "optional.h"
 
 namespace Rust {
+
+constexpr uint8_t UTF8_BOM1 = 0xEF;
+constexpr uint8_t UTF8_BOM2 = 0xBB;
+constexpr uint8_t UTF8_BOM3 = 0xBF;
+
+constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
+constexpr uint32_t CODEPOINT_INVALID = 0xFFFE;
+
 // Input source wrapper thing.
 class InputSource
 {
@@ -23,22 +49,22 @@ class InputSource
 
     if ((int32_t) input == EOF)
       return Codepoint::eof ();
-    else if (input < 128)
+    else if (input <= MAX_ASCII_CODEPOINT)
       {
 	// ascii -- 1 byte
 	return {input};
       }
     else if ((input & 0xC0) == 0x80)
       {
 	// invalid (continuation; can't be first char)
-	return {0xFFFE};
+	return {CODEPOINT_INVALID};
       }
     else if ((input & 0xE0) == 0xC0)
       {
 	// 2 bytes
 	uint8_t input2 = next_byte ();
 	if ((input2 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
 	return output;
@@ -50,23 +76,23 @@ class InputSource
 	// If the second byte is equal to 0xBB then the input is no longer a
 	// valid UTF-8 char. Then, we check if the third byte makes up a UTF
 	// BOM.
-	if (input == 0xEF && input2 == 0xBB)
+	if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
 	  {
 	    uint8_t input3 = next_byte ();
-	    if (input3 == 0xBF)
+	    if (input3 == UTF8_BOM3)
 	      // found BOM
 	      return next_codepoint ();
 	    else
-	      return {0xFFFE};
+	      return {CODEPOINT_INVALID};
 	  }
 
 	if ((input2 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint8_t input3 = next_byte ();
 
 	if ((input3 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
 			  | ((input3 & 0x3F) << 0);
@@ -77,39 +103,42 @@ class InputSource
 	// 4 bytes
 	uint8_t input2 = next_byte ();
 	if ((input2 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint8_t input3 = next_byte ();
 	if ((input3 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint8_t input4 = next_byte ();
 	if ((input4 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
 			  | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
 	return {output};
       }
     else
       {
-	return {0xFFFE};
+	return {CODEPOINT_INVALID};
       }
   }
 
 protected:
-  // Check if the input source is valid as utf-8 and copy all characters to
-  // `chars`.
+  // This method must be called by the constructor of each derived class to
+  // initialize the input source. It cannot be moved into the InputSource
+  // constructor because it calls a virtual method.
   void init ()
   {
+    // Check if the input source is valid as utf-8 and copy all characters to
+    // `chars`.
     Codepoint char32 = next_codepoint ();
-    while (!char32.is_eof () && char32 != 0xFFFE)
+    while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
       {
 	chars.push_back (char32);
 	char32 = next_codepoint ();
       }
 
-    if (char32 == 0xFFFE)
+    if (char32 == CODEPOINT_INVALID)
       {
 	// Input source is not valid as utf-8.
 	is_valid_utf8 = false;
@@ -158,11 +187,7 @@ class FileInputSource : public InputSource
 
 public:
   // Create new input source from file.
-  FileInputSource (FILE *input) : InputSource (), input (input)
-  {
-    // TODO make this better?
-    init ();
-  }
+  FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
 };
 
 class BufferInputSource : public InputSource
@@ -175,15 +200,14 @@ class BufferInputSource : public InputSource
   {
     if (offs >= buffer.size ())
       return EOF;
-    return (uint8_t) buffer.at (offs++);
+    return static_cast<uint8_t> (buffer.at (offs++));
   }
 
 public:
   // Create new input source from file.
   BufferInputSource (const std::string &b, size_t offset)
     : InputSource (), buffer (b), offs (offset)
   {
-    // TODO make this better?
     init ();
   }
 };
diff --git a/gcc/rust/util/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h
diff --git a/gcc/rust/util/rust-punycode.cc b/gcc/rust/util/rust-punycode.cc
@@ -36,15 +36,13 @@ constexpr uint32_t INITIAL_BIAS = 72;
 constexpr uint32_t INITIAL_N = 128;
 constexpr char DELIMITER = '-';
 
-constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
-
 std::string
 extract_basic_string (const std::vector<Codepoint> &src)
 {
   std::string basic_string;
   for (auto c : src)
     {
-      if (c.value <= MAX_ASCII_CODEPOINT)
+      if (c.value <= 0x7F)
 	basic_string += c.as_string ();
     }
   return basic_string;
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
@@ -1,3 +1,21 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
 #include "rust-system.h"
 #include "optional.h"
 #include "selftest.h"

Original file line number	Diff line number	Diff line change
`@@ -36,15 +36,13 @@ constexpr uint32_t INITIAL_BIAS = 72;`
`36`	`36`	`constexpr uint32_t INITIAL_N = 128;`
`37`	`37`	`constexpr char DELIMITER = '-';`
`38`	`38`
`39`		`-constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;`
`40`		`-`
`41`	`39`	`std::string`
`42`	`40`	`extract_basic_string (const std::vector<Codepoint> &src)`
`43`	`41`	`{`
`44`	`42`	`std::string basic_string;`
`45`	`43`	`for (auto c : src)`
`46`	`44`	`{`
`47`		`- if (c.value <= MAX_ASCII_CODEPOINT)`
	`45`	`+ if (c.value <= 0x7F)`
`48`	`46`	`basic_string += c.as_string ();`
`49`	`47`	`}`
`50`	`48`	`return basic_string;`