From cb57eb96c45c2731de4faca07d8156c6b063e84f Mon Sep 17 00:00:00 2001 From: Guillaume Gomez Date: Thu, 3 Apr 2025 19:44:29 +0200 Subject: [PATCH 1/5] Copy `compiler/rustc_lexer/src/unescape` source code --- Cargo.toml | 4 + src/lib.rs | 438 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/tests.rs | 286 +++++++++++++++++++++++++++++++++ 3 files changed, 728 insertions(+) create mode 100644 Cargo.toml create mode 100644 src/lib.rs create mode 100644 src/tests.rs diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..4f7e710 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,4 @@ +[package] +name = "literal-escaper" +version = "0.0.1" +edition = "2021" diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..d6ea424 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,438 @@ +//! Utilities for validating string and char literals and turning them into +//! values they represent. + +use std::ops::Range; +use std::str::Chars; + +use Mode::*; + +#[cfg(test)] +mod tests; + +/// Errors and warnings that can occur during string unescaping. They mostly +/// relate to malformed escape sequences, but there are a few that are about +/// other problems. +#[derive(Debug, PartialEq, Eq)] +pub enum EscapeError { + /// Expected 1 char, but 0 were found. + ZeroChars, + /// Expected 1 char, but more than 1 were found. + MoreThanOneChar, + + /// Escaped '\' character without continuation. + LoneSlash, + /// Invalid escape character (e.g. '\z'). + InvalidEscape, + /// Raw '\r' encountered. + BareCarriageReturn, + /// Raw '\r' encountered in raw string. + BareCarriageReturnInRawString, + /// Unescaped character that was expected to be escaped (e.g. raw '\t'). + EscapeOnlyChar, + + /// Numeric character escape is too short (e.g. '\x1'). + TooShortHexEscape, + /// Invalid character in numeric escape (e.g. '\xz') + InvalidCharInHexEscape, + /// Character code in numeric escape is non-ascii (e.g. '\xFF'). + OutOfRangeHexEscape, + + /// '\u' not followed by '{'. + NoBraceInUnicodeEscape, + /// Non-hexadecimal value in '\u{..}'. + InvalidCharInUnicodeEscape, + /// '\u{}' + EmptyUnicodeEscape, + /// No closing brace in '\u{..}', e.g. '\u{12'. + UnclosedUnicodeEscape, + /// '\u{_12}' + LeadingUnderscoreUnicodeEscape, + /// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}' + OverlongUnicodeEscape, + /// Invalid in-bound unicode character code, e.g. '\u{DFFF}'. + LoneSurrogateUnicodeEscape, + /// Out of bounds unicode character code, e.g. '\u{FFFFFF}'. + OutOfRangeUnicodeEscape, + + /// Unicode escape code in byte literal. + UnicodeEscapeInByte, + /// Non-ascii character in byte literal, byte string literal, or raw byte string literal. + NonAsciiCharInByte, + + // `\0` in a C string literal. + NulInCStr, + + /// After a line ending with '\', the next line contains whitespace + /// characters that are not skipped. + UnskippedWhitespaceWarning, + + /// After a line ending with '\', multiple lines are skipped. + MultipleSkippedLinesWarning, +} + +impl EscapeError { + /// Returns true for actual errors, as opposed to warnings. + pub fn is_fatal(&self) -> bool { + !matches!( + self, + EscapeError::UnskippedWhitespaceWarning | EscapeError::MultipleSkippedLinesWarning + ) + } +} + +/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without +/// quotes) and produces a sequence of escaped characters or errors. +/// +/// Values are returned by invoking `callback`. For `Char` and `Byte` modes, +/// the callback will be called exactly once. +pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range, Result), +{ + match mode { + Char | Byte => { + let mut chars = src.chars(); + let res = unescape_char_or_byte(&mut chars, mode); + callback(0..(src.len() - chars.as_str().len()), res); + } + Str | ByteStr => unescape_non_raw_common(src, mode, callback), + RawStr | RawByteStr => check_raw_common(src, mode, callback), + RawCStr => check_raw_common(src, mode, &mut |r, mut result| { + if let Ok('\0') = result { + result = Err(EscapeError::NulInCStr); + } + callback(r, result) + }), + CStr => unreachable!(), + } +} + +/// Used for mixed utf8 string literals, i.e. those that allow both unicode +/// chars and high bytes. +pub enum MixedUnit { + /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes) + /// and Unicode chars (written directly or via `\u` escapes). + /// + /// For example, if '¥' appears in a string it is represented here as + /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte + /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]` + Char(char), + + /// Used for high bytes (`\x80`..`\xff`). + /// + /// For example, if `\xa5` appears in a string it is represented here as + /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant + /// byte string as the single byte `0xa5`. + HighByte(u8), +} + +impl From for MixedUnit { + fn from(c: char) -> Self { + MixedUnit::Char(c) + } +} + +impl From for MixedUnit { + fn from(n: u8) -> Self { + if n.is_ascii() { MixedUnit::Char(n as char) } else { MixedUnit::HighByte(n) } + } +} + +/// Takes the contents of a mixed-utf8 literal (without quotes) and produces +/// a sequence of escaped characters or errors. +/// +/// Values are returned by invoking `callback`. +pub fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range, Result), +{ + match mode { + CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| { + if let Ok(MixedUnit::Char('\0')) = result { + result = Err(EscapeError::NulInCStr); + } + callback(r, result) + }), + Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(), + } +} + +/// Takes a contents of a char literal (without quotes), and returns an +/// unescaped char or an error. +pub fn unescape_char(src: &str) -> Result { + unescape_char_or_byte(&mut src.chars(), Char) +} + +/// Takes a contents of a byte literal (without quotes), and returns an +/// unescaped byte or an error. +pub fn unescape_byte(src: &str) -> Result { + unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char) +} + +/// What kind of literal do we parse. +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum Mode { + Char, + + Byte, + + Str, + RawStr, + + ByteStr, + RawByteStr, + + CStr, + RawCStr, +} + +impl Mode { + pub fn in_double_quotes(self) -> bool { + match self { + Str | RawStr | ByteStr | RawByteStr | CStr | RawCStr => true, + Char | Byte => false, + } + } + + /// Are `\x80`..`\xff` allowed? + fn allow_high_bytes(self) -> bool { + match self { + Char | Str => false, + Byte | ByteStr | CStr => true, + RawStr | RawByteStr | RawCStr => unreachable!(), + } + } + + /// Are unicode (non-ASCII) chars allowed? + #[inline] + fn allow_unicode_chars(self) -> bool { + match self { + Byte | ByteStr | RawByteStr => false, + Char | Str | RawStr | CStr | RawCStr => true, + } + } + + /// Are unicode escapes (`\u`) allowed? + fn allow_unicode_escapes(self) -> bool { + match self { + Byte | ByteStr => false, + Char | Str | CStr => true, + RawByteStr | RawStr | RawCStr => unreachable!(), + } + } + + pub fn prefix_noraw(self) -> &'static str { + match self { + Char | Str | RawStr => "", + Byte | ByteStr | RawByteStr => "b", + CStr | RawCStr => "c", + } + } +} + +fn scan_escape + From>( + chars: &mut Chars<'_>, + mode: Mode, +) -> Result { + // Previous character was '\\', unescape what follows. + let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? { + '"' => '"', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '0' => '\0', + 'x' => { + // Parse hexadecimal character code. + + let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let value = (hi * 16 + lo) as u8; + + return if !mode.allow_high_bytes() && !value.is_ascii() { + Err(EscapeError::OutOfRangeHexEscape) + } else { + // This may be a high byte, but that will only happen if `T` is + // `MixedUnit`, because of the `allow_high_bytes` check above. + Ok(T::from(value)) + }; + } + 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from), + _ => return Err(EscapeError::InvalidEscape), + }; + Ok(T::from(res)) +} + +fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result { + // We've parsed '\u', now we have to parse '{..}'. + + if chars.next() != Some('{') { + return Err(EscapeError::NoBraceInUnicodeEscape); + } + + // First character must be a hexadecimal digit. + let mut n_digits = 1; + let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { + '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), + '}' => return Err(EscapeError::EmptyUnicodeEscape), + c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, + }; + + // First character is valid, now parse the rest of the number + // and closing brace. + loop { + match chars.next() { + None => return Err(EscapeError::UnclosedUnicodeEscape), + Some('_') => continue, + Some('}') => { + if n_digits > 6 { + return Err(EscapeError::OverlongUnicodeEscape); + } + + // Incorrect syntax has higher priority for error reporting + // than unallowed value for a literal. + if !allow_unicode_escapes { + return Err(EscapeError::UnicodeEscapeInByte); + } + + break std::char::from_u32(value).ok_or({ + if value > 0x10FFFF { + EscapeError::OutOfRangeUnicodeEscape + } else { + EscapeError::LoneSurrogateUnicodeEscape + } + }); + } + Some(c) => { + let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; + n_digits += 1; + if n_digits > 6 { + // Stop updating value since we're sure that it's incorrect already. + continue; + } + value = value * 16 + digit; + } + }; + } +} + +#[inline] +fn ascii_check(c: char, allow_unicode_chars: bool) -> Result { + if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) } +} + +fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { + let c = chars.next().ok_or(EscapeError::ZeroChars)?; + let res = match c { + '\\' => scan_escape(chars, mode), + '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + _ => ascii_check(c, mode.allow_unicode_chars()), + }?; + if chars.next().is_some() { + return Err(EscapeError::MoreThanOneChar); + } + Ok(res) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of escaped characters or errors. +fn unescape_non_raw_common + From>(src: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range, Result), +{ + let mut chars = src.chars(); + let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop + + // The `start` and `end` computation here is complicated because + // `skip_ascii_whitespace` makes us to skip over chars without counting + // them in the range computation. + while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); + let res = match c { + '\\' => { + match chars.clone().next() { + Some('\n') => { + // Rust language specification requires us to skip whitespaces + // if unescaped '\' character is followed by '\n'. + // For details see [Rust language reference] + // (https://doc.rust-lang.org/reference/tokens.html#string-literals). + skip_ascii_whitespace(&mut chars, start, &mut |range, err| { + callback(range, Err(err)) + }); + continue; + } + _ => scan_escape::(&mut chars, mode), + } + } + '"' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + _ => ascii_check(c, allow_unicode_chars).map(T::from), + }; + let end = src.len() - chars.as_str().len(); + callback(start..end, res); + } +} + +fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) +where + F: FnMut(Range, EscapeError), +{ + let tail = chars.as_str(); + let first_non_space = tail + .bytes() + .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') + .unwrap_or(tail.len()); + if tail[1..first_non_space].contains('\n') { + // The +1 accounts for the escaping slash. + let end = start + first_non_space + 1; + callback(start..end, EscapeError::MultipleSkippedLinesWarning); + } + let tail = &tail[first_non_space..]; + if let Some(c) = tail.chars().next() { + if c.is_whitespace() { + // For error reporting, we would like the span to contain the character that was not + // skipped. The +1 is necessary to account for the leading \ that started the escape. + let end = start + first_non_space + c.len_utf8() + 1; + callback(start..end, EscapeError::UnskippedWhitespaceWarning); + } + } + *chars = tail.chars(); +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only produce errors on bare CR. +fn check_raw_common(src: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range, Result), +{ + let mut chars = src.chars(); + let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop + + // The `start` and `end` computation here matches the one in + // `unescape_non_raw_common` for consistency, even though this function + // doesn't have to worry about skipping any chars. + while let Some(c) = chars.next() { + let start = src.len() - chars.as_str().len() - c.len_utf8(); + let res = match c { + '\r' => Err(EscapeError::BareCarriageReturnInRawString), + _ => ascii_check(c, allow_unicode_chars), + }; + let end = src.len() - chars.as_str().len(); + callback(start..end, res); + } +} + +#[inline] +pub fn byte_from_char(c: char) -> u8 { + let res = c as u32; + debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); + res as u8 +} diff --git a/src/tests.rs b/src/tests.rs new file mode 100644 index 0000000..5b99495 --- /dev/null +++ b/src/tests.rs @@ -0,0 +1,286 @@ +use super::*; + +#[test] +fn test_unescape_char_bad() { + fn check(literal_text: &str, expected_error: EscapeError) { + assert_eq!(unescape_char(literal_text), Err(expected_error)); + } + + check("", EscapeError::ZeroChars); + check(r"\", EscapeError::LoneSlash); + + check("\n", EscapeError::EscapeOnlyChar); + check("\t", EscapeError::EscapeOnlyChar); + check("'", EscapeError::EscapeOnlyChar); + check("\r", EscapeError::BareCarriageReturn); + + check("spam", EscapeError::MoreThanOneChar); + check(r"\x0ff", EscapeError::MoreThanOneChar); + check(r#"\"a"#, EscapeError::MoreThanOneChar); + check(r"\na", EscapeError::MoreThanOneChar); + check(r"\ra", EscapeError::MoreThanOneChar); + check(r"\ta", EscapeError::MoreThanOneChar); + check(r"\\a", EscapeError::MoreThanOneChar); + check(r"\'a", EscapeError::MoreThanOneChar); + check(r"\0a", EscapeError::MoreThanOneChar); + check(r"\u{0}x", EscapeError::MoreThanOneChar); + check(r"\u{1F63b}}", EscapeError::MoreThanOneChar); + + check(r"\v", EscapeError::InvalidEscape); + check(r"\💩", EscapeError::InvalidEscape); + check(r"\●", EscapeError::InvalidEscape); + check("\\\r", EscapeError::InvalidEscape); + + check(r"\x", EscapeError::TooShortHexEscape); + check(r"\x0", EscapeError::TooShortHexEscape); + check(r"\xf", EscapeError::TooShortHexEscape); + check(r"\xa", EscapeError::TooShortHexEscape); + check(r"\xx", EscapeError::InvalidCharInHexEscape); + check(r"\xы", EscapeError::InvalidCharInHexEscape); + check(r"\x🦀", EscapeError::InvalidCharInHexEscape); + check(r"\xtt", EscapeError::InvalidCharInHexEscape); + check(r"\xff", EscapeError::OutOfRangeHexEscape); + check(r"\xFF", EscapeError::OutOfRangeHexEscape); + check(r"\x80", EscapeError::OutOfRangeHexEscape); + + check(r"\u", EscapeError::NoBraceInUnicodeEscape); + check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); + check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); + check(r"\u{", EscapeError::UnclosedUnicodeEscape); + check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); + check(r"\u{}", EscapeError::EmptyUnicodeEscape); + check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); + check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); + check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape); + check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); + check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); + + check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape); + + check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape); +} + +#[test] +fn test_unescape_char_good() { + fn check(literal_text: &str, expected_char: char) { + assert_eq!(unescape_char(literal_text), Ok(expected_char)); + } + + check("a", 'a'); + check("ы", 'ы'); + check("🦀", '🦀'); + + check(r#"\""#, '"'); + check(r"\n", '\n'); + check(r"\r", '\r'); + check(r"\t", '\t'); + check(r"\\", '\\'); + check(r"\'", '\''); + check(r"\0", '\0'); + + check(r"\x00", '\0'); + check(r"\x5a", 'Z'); + check(r"\x5A", 'Z'); + check(r"\x7f", 127 as char); + + check(r"\u{0}", '\0'); + check(r"\u{000000}", '\0'); + check(r"\u{41}", 'A'); + check(r"\u{0041}", 'A'); + check(r"\u{00_41}", 'A'); + check(r"\u{4__1__}", 'A'); + check(r"\u{1F63b}", '😻'); +} + +#[test] +fn test_unescape_str_warn() { + fn check(literal: &str, expected: &[(Range, Result)]) { + let mut unescaped = Vec::with_capacity(literal.len()); + unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res))); + assert_eq!(unescaped, expected); + } + + // Check we can handle escaped newlines at the end of a file. + check("\\\n", &[]); + check("\\\n ", &[]); + + check( + "\\\n \u{a0} x", + &[ + (0..5, Err(EscapeError::UnskippedWhitespaceWarning)), + (3..5, Ok('\u{a0}')), + (5..6, Ok(' ')), + (6..7, Ok('x')), + ], + ); + check("\\\n \n x", &[(0..7, Err(EscapeError::MultipleSkippedLinesWarning)), (7..8, Ok('x'))]); +} + +#[test] +fn test_unescape_str_good() { + fn check(literal_text: &str, expected: &str) { + let mut buf = Ok(String::with_capacity(literal_text.len())); + unescape_unicode(literal_text, Mode::Str, &mut |range, c| { + if let Ok(b) = &mut buf { + match c { + Ok(c) => b.push(c), + Err(e) => buf = Err((range, e)), + } + } + }); + assert_eq!(buf.as_deref(), Ok(expected)) + } + + check("foo", "foo"); + check("", ""); + check(" \t\n", " \t\n"); + + check("hello \\\n world", "hello world"); + check("thread's", "thread's") +} + +#[test] +fn test_unescape_byte_bad() { + fn check(literal_text: &str, expected_error: EscapeError) { + assert_eq!(unescape_byte(literal_text), Err(expected_error)); + } + + check("", EscapeError::ZeroChars); + check(r"\", EscapeError::LoneSlash); + + check("\n", EscapeError::EscapeOnlyChar); + check("\t", EscapeError::EscapeOnlyChar); + check("'", EscapeError::EscapeOnlyChar); + check("\r", EscapeError::BareCarriageReturn); + + check("spam", EscapeError::MoreThanOneChar); + check(r"\x0ff", EscapeError::MoreThanOneChar); + check(r#"\"a"#, EscapeError::MoreThanOneChar); + check(r"\na", EscapeError::MoreThanOneChar); + check(r"\ra", EscapeError::MoreThanOneChar); + check(r"\ta", EscapeError::MoreThanOneChar); + check(r"\\a", EscapeError::MoreThanOneChar); + check(r"\'a", EscapeError::MoreThanOneChar); + check(r"\0a", EscapeError::MoreThanOneChar); + + check(r"\v", EscapeError::InvalidEscape); + check(r"\💩", EscapeError::InvalidEscape); + check(r"\●", EscapeError::InvalidEscape); + + check(r"\x", EscapeError::TooShortHexEscape); + check(r"\x0", EscapeError::TooShortHexEscape); + check(r"\xa", EscapeError::TooShortHexEscape); + check(r"\xf", EscapeError::TooShortHexEscape); + check(r"\xx", EscapeError::InvalidCharInHexEscape); + check(r"\xы", EscapeError::InvalidCharInHexEscape); + check(r"\x🦀", EscapeError::InvalidCharInHexEscape); + check(r"\xtt", EscapeError::InvalidCharInHexEscape); + + check(r"\u", EscapeError::NoBraceInUnicodeEscape); + check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); + check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); + check(r"\u{", EscapeError::UnclosedUnicodeEscape); + check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); + check(r"\u{}", EscapeError::EmptyUnicodeEscape); + check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); + check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); + + check("ы", EscapeError::NonAsciiCharInByte); + check("🦀", EscapeError::NonAsciiCharInByte); + + check(r"\u{0}", EscapeError::UnicodeEscapeInByte); + check(r"\u{000000}", EscapeError::UnicodeEscapeInByte); + check(r"\u{41}", EscapeError::UnicodeEscapeInByte); + check(r"\u{0041}", EscapeError::UnicodeEscapeInByte); + check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte); + check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte); + check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte); + check(r"\u{0}x", EscapeError::UnicodeEscapeInByte); + check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte); + check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte); + check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); + check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte); + check(r"\u{D800}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte); +} + +#[test] +fn test_unescape_byte_good() { + fn check(literal_text: &str, expected_byte: u8) { + assert_eq!(unescape_byte(literal_text), Ok(expected_byte)); + } + + check("a", b'a'); + + check(r#"\""#, b'"'); + check(r"\n", b'\n'); + check(r"\r", b'\r'); + check(r"\t", b'\t'); + check(r"\\", b'\\'); + check(r"\'", b'\''); + check(r"\0", b'\0'); + + check(r"\x00", b'\0'); + check(r"\x5a", b'Z'); + check(r"\x5A", b'Z'); + check(r"\x7f", 127); + check(r"\x80", 128); + check(r"\xff", 255); + check(r"\xFF", 255); +} + +#[test] +fn test_unescape_byte_str_good() { + fn check(literal_text: &str, expected: &[u8]) { + let mut buf = Ok(Vec::with_capacity(literal_text.len())); + unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| { + if let Ok(b) = &mut buf { + match c { + Ok(c) => b.push(byte_from_char(c)), + Err(e) => buf = Err((range, e)), + } + } + }); + assert_eq!(buf.as_deref(), Ok(expected)) + } + + check("foo", b"foo"); + check("", b""); + check(" \t\n", b" \t\n"); + + check("hello \\\n world", b"hello world"); + check("thread's", b"thread's") +} + +#[test] +fn test_unescape_raw_str() { + fn check(literal: &str, expected: &[(Range, Result)]) { + let mut unescaped = Vec::with_capacity(literal.len()); + unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); + assert_eq!(unescaped, expected); + } + + check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); + check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]); +} + +#[test] +fn test_unescape_raw_byte_str() { + fn check(literal: &str, expected: &[(Range, Result)]) { + let mut unescaped = Vec::with_capacity(literal.len()); + unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res))); + assert_eq!(unescaped, expected); + } + + check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); + check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]); + check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok('a'))]); +} From ea70862157477379eb7af86754f4c51ae15773f9 Mon Sep 17 00:00:00 2001 From: Guillaume Gomez Date: Thu, 3 Apr 2025 19:44:48 +0200 Subject: [PATCH 2/5] Add .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2c96eb1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target/ +Cargo.lock From b22cd3065b49e11afc1e36b141ed9b97c0590335 Mon Sep 17 00:00:00 2001 From: Guillaume Gomez Date: Thu, 3 Apr 2025 19:49:55 +0200 Subject: [PATCH 3/5] Add CI check --- .github/workflows/CI.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 .github/workflows/CI.yml diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..8f11abd --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,24 @@ +on: + pull_request: + push: + +permissions: + contents: read + +name: CI +env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 + +jobs: + checks: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + components: clippy, rustfmt + - run: cargo fmt -- --check + - run: cargo test + - run: cargo clippy From cc1915199f87f05f6b65c7a4c41112f2f643eccd Mon Sep 17 00:00:00 2001 From: Guillaume Gomez Date: Thu, 3 Apr 2025 21:03:52 +0200 Subject: [PATCH 4/5] Unignore `Cargo.lock` --- .gitignore | 1 - Cargo.lock | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 Cargo.lock diff --git a/.gitignore b/.gitignore index 2c96eb1..2f7896d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1 @@ target/ -Cargo.lock diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..d52908f --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "literal-escaper" +version = "0.0.1" From d4e5cff947c4fd2e327d14c6dea4ec962fb7293f Mon Sep 17 00:00:00 2001 From: Guillaume Gomez Date: Thu, 3 Apr 2025 21:13:04 +0200 Subject: [PATCH 5/5] Fix fmt --- src/lib.rs | 20 ++++++++++++++++---- src/tests.rs | 46 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index d6ea424..cd6faff 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -134,7 +134,11 @@ impl From for MixedUnit { impl From for MixedUnit { fn from(n: u8) -> Self { - if n.is_ascii() { MixedUnit::Char(n as char) } else { MixedUnit::HighByte(n) } + if n.is_ascii() { + MixedUnit::Char(n as char) + } else { + MixedUnit::HighByte(n) + } } } @@ -280,7 +284,9 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result return Err(EscapeError::LeadingUnderscoreUnicodeEscape), '}' => return Err(EscapeError::EmptyUnicodeEscape), - c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, + c => c + .to_digit(16) + .ok_or(EscapeError::InvalidCharInUnicodeEscape)?, }; // First character is valid, now parse the rest of the number @@ -309,7 +315,9 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result { - let digit: u32 = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; + let digit: u32 = c + .to_digit(16) + .ok_or(EscapeError::InvalidCharInUnicodeEscape)?; n_digits += 1; if n_digits > 6 { // Stop updating value since we're sure that it's incorrect already. @@ -323,7 +331,11 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result Result { - if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) } + if allow_unicode_chars || c.is_ascii() { + Ok(c) + } else { + Err(EscapeError::NonAsciiCharInByte) + } } fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { diff --git a/src/tests.rs b/src/tests.rs index 5b99495..a4bbdc0 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -100,7 +100,9 @@ fn test_unescape_char_good() { fn test_unescape_str_warn() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res))); + unescape_unicode(literal, Mode::Str, &mut |range, res| { + unescaped.push((range, res)) + }); assert_eq!(unescaped, expected); } @@ -117,7 +119,13 @@ fn test_unescape_str_warn() { (6..7, Ok('x')), ], ); - check("\\\n \n x", &[(0..7, Err(EscapeError::MultipleSkippedLinesWarning)), (7..8, Ok('x'))]); + check( + "\\\n \n x", + &[ + (0..7, Err(EscapeError::MultipleSkippedLinesWarning)), + (7..8, Ok('x')), + ], + ); } #[test] @@ -264,23 +272,45 @@ fn test_unescape_byte_str_good() { fn test_unescape_raw_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); + unescape_unicode(literal, Mode::RawStr, &mut |range, res| { + unescaped.push((range, res)) + }); assert_eq!(unescaped, expected); } - check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); - check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]); + check( + "\r", + &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))], + ); + check( + "\rx", + &[ + (0..1, Err(EscapeError::BareCarriageReturnInRawString)), + (1..2, Ok('x')), + ], + ); } #[test] fn test_unescape_raw_byte_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res))); + unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| { + unescaped.push((range, res)) + }); assert_eq!(unescaped, expected); } - check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); + check( + "\r", + &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))], + ); check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByte))]); - check("🦀a", &[(0..4, Err(EscapeError::NonAsciiCharInByte)), (4..5, Ok('a'))]); + check( + "🦀a", + &[ + (0..4, Err(EscapeError::NonAsciiCharInByte)), + (4..5, Ok('a')), + ], + ); }