diff --git a/gcc/rust/Make-lang.in b/gcc/rust/Make-lang.in index aa68640f4f70..a4b5e1dc4a21 100644 --- a/gcc/rust/Make-lang.in +++ b/gcc/rust/Make-lang.in @@ -186,6 +186,7 @@ GRS_OBJS = \ rust/rust-feature-gate.o \ rust/rust-dir-owner.o \ rust/rust-unicode.o \ + rust/rust-punycode.o \ $(END) # removed object files from here diff --git a/gcc/rust/rust-lang.cc b/gcc/rust/rust-lang.cc index 44dc3fc48eed..e544d03e3588 100644 --- a/gcc/rust/rust-lang.cc +++ b/gcc/rust/rust-lang.cc @@ -40,6 +40,7 @@ #include "rust-lex.h" #include "optional.h" #include "rust-unicode.h" +#include "rust-punycode.h" #include // note: header files must be in this order or else forward declarations don't @@ -456,6 +457,7 @@ run_rust_tests () // Call tests for the rust frontend here rust_input_source_test (); rust_utf8_normalize_test (); + rust_punycode_encode_test (); rust_cfg_parser_test (); rust_privacy_ctx_test (); rust_crate_name_validation_test ();
diff --git a/gcc/rust/util/rust-punycode.cc b/gcc/rust/util/rust-punycode.cc
new file mode 100644
index 000000000000..a35d54aa6f5a
--- /dev/null
+++ b/gcc/rust/util/rust-punycode.cc
@@ -0,0 +1,206 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// This file provides functions for punycode conversion
+// See https://datatracker.ietf.org/doc/html/rfc3492
+
+#include "rust-system.h"
+#include "rust-unicode.h"
+#include "rust-punycode.h"
+#include "optional.h"
+#include "selftest.h"
+
+namespace Rust {
+
+// Bootstring parameter values for Punycode, see
+// https://tools.ietf.org/html/rfc3492#section-5.
+constexpr uint32_t BASE = 36;
+constexpr uint32_t TMIN = 1;
+constexpr uint32_t TMAX = 26;
+constexpr uint32_t SKEW = 38;
+constexpr uint32_t DAMP = 700;
+constexpr uint32_t INITIAL_BIAS = 72;
+constexpr uint32_t INITIAL_N = 128;
+constexpr char DELIMITER = '-';
+
+// Largest code point that is copied to the output verbatim.
+constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
+
+// Return the code points of SRC that are ASCII (<= MAX_ASCII_CODEPOINT),
+// concatenated in their original order.
+std::string
+extract_basic_string (const std::vector<Codepoint> &src)
+{
+  std::string basic_string;
+  for (auto c : src)
+    {
+      if (c.value <= MAX_ASCII_CODEPOINT)
+        basic_string += c.as_string ();
+    }
+  return basic_string;
+}
+
+// Bias adaptation function from RFC 3492 section 6.1. DELTA is the delta
+// that was just encoded, N_POINTS the number of code points handled so far
+// (must be non-zero) and IS_FIRST tells whether this is the first delta.
+uint32_t
+adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first)
+{
+  delta /= is_first ? DAMP : 2;
+  delta += delta / n_points;
+  uint32_t k = 0;
+
+  while (delta > (BASE - TMIN) * TMAX / 2)
+    {
+      delta /= BASE - TMIN;
+      k += BASE;
+    }
+  return k + (BASE - TMIN + 1) * delta / (delta + SKEW);
+}
+
+// Return LHS - RHS clamped to the range [MIN, MAX], computed without
+// letting the unsigned subtraction wrap around.
+uint32_t
+clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
+             const uint32_t max)
+{
+  if (min + rhs >= lhs)
+    return min;
+  else if (max + rhs <= lhs)
+    return max;
+  else
+    return lhs - rhs;
+}
+
+// Return the smallest code point in L that is >= THRESHOLD, or UINT32_MAX
+// if there is none.
+uint32_t
+min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold)
+{
+  uint32_t min = UINT32_MAX;
+  for (auto c : l)
+    if (c.value >= threshold && c.value < min)
+      min = c.value;
+  return min;
+}
+
+// Map the digit value D (0..35) to its basic code point: 0..25 become
+// 'a'..'z' and 26..35 become '0'..'9'.
+char
+encode_digit (const uint32_t d)
+{
+  return d + 22 + (d < 26 ? 75 : 0);
+}
+
+// Encode INPUT as punycode following RFC 3492 section 6.3. Return nullopt
+// if the delta computation would overflow.
+tl::optional<std::string>
+encode_punycode (const Utf8String &input)
+{
+  std::vector<Codepoint> input_chars = input.get_chars ();
+
+  uint32_t n = INITIAL_N;
+  uint32_t delta = 0;
+  uint32_t bias = INITIAL_BIAS;
+
+  // Basic (ASCII) code points are emitted first, followed by the delimiter.
+  std::string output = extract_basic_string (input_chars);
+  uint32_t h = output.size ();
+  const uint32_t b = h;
+  if (b > 0)
+    output += DELIMITER;
+
+  while (h < input_chars.size ())
+    {
+      const uint32_t m = min_gt_or_eq (input_chars, n);
+
+      if (m - n > ((UINT32_MAX - delta) / (h + 1)))
+        return tl::nullopt;
+
+      delta += (m - n) * (h + 1);
+      n = m;
+
+      for (const auto c : input_chars)
+        {
+          if (c.value < n)
+            {
+              // RFC 3492: "increment delta, fail on overflow".
+              if (delta == UINT32_MAX)
+                return tl::nullopt;
+              delta++;
+            }
+          else if (c.value == n)
+            {
+              uint32_t q = delta;
+              // encode as a variable length integer
+              for (uint32_t k = 1;; k++)
+                {
+                  const uint32_t kb = k * BASE;
+                  const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX);
+                  if (q < t)
+                    break;
+
+                  output += encode_digit (t + (q - t) % (BASE - t));
+                  q = (q - t) / (BASE - t);
+                }
+              output += encode_digit (q);
+
+              bias = adapt_bias (delta, h + 1, h == b);
+              delta = 0;
+              h++;
+            }
+        }
+      delta++;
+      n++;
+    }
+
+  return {output};
+}
+
+} // namespace Rust
+
+#if CHECKING_P
+
+namespace selftest {
+
+void
+encode_assert (const std::string &input, const std::string &expected)
+{
+  Rust::Utf8String input_utf8
+    = Rust::Utf8String::make_utf8_string (input).value ();
+  std::string actual = Rust::encode_punycode (input_utf8).value ();
+  ASSERT_EQ (actual, expected);
+}
+
+void
+rust_punycode_encode_test ()
+{
+  encode_assert ("abc", "abc-");
+  encode_assert ("12345", "12345-");
+  encode_assert ("香港", "j6w193g");
+
+  // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1
+  encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn");
+  encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye");
+  encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb");
+  encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a");
+}
+
+} // namespace selftest
+
+#endif // CHECKING_P
diff --git a/gcc/rust/util/rust-punycode.h
b/gcc/rust/util/rust-punycode.h
new file mode 100644
index 000000000000..ffb139a5ff3f
--- /dev/null
+++ b/gcc/rust/util/rust-punycode.h
@@ -0,0 +1,46 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_PUNYCODE_H
+#define RUST_PUNYCODE_H
+
+#include "rust-unicode.h"
+#include "optional.h"
+
+namespace Rust {
+
+/* Encode SRC as punycode (RFC 3492). Returns the encoded string if encoding
+ * is successful, nullopt otherwise. Note that a returned string contains
+ * only ASCII characters and does not start with `xn--`. */
+tl::optional<std::string>
+encode_punycode (const Utf8String &src);
+
+} // namespace Rust
+
+#if CHECKING_P
+
+namespace selftest {
+
+void
+rust_punycode_encode_test ();
+
+} // namespace selftest
+
+#endif // CHECKING_P
+
+#endif // RUST_PUNYCODE_H