From 426e31dee54d0573a0e0e518e443dbb38e3cd14b Mon Sep 17 00:00:00 2001
From: "Adam H. Leventhal" <ahl@oxide.computer>
Date: Wed, 6 Dec 2023 15:48:11 -0800
Subject: [PATCH] fix #4471 and #5138

---
 src/string.rs              | 136 +++++++++++++++++++++++++++++++++++--
 tests/source/issue-4471.rs |  13 ++++
 tests/target/issue-4471.rs |  22 ++++++
 3 files changed, 165 insertions(+), 6 deletions(-)
 create mode 100644 tests/source/issue-4471.rs
 create mode 100644 tests/target/issue-4471.rs
diff --git a/src/string.rs b/src/string.rs
index cb666fff695..fb3974c527d 100644
--- a/src/string.rs
+++ b/src/string.rs
@@ -61,6 +61,74 @@ impl<'a> StringFormat<'a> {
     }
 }
 
+/// Look for a braced unicode escape payload (`{…}`, the part after `\u`);
+/// return its length, braces included, on success.
+///
+/// Follows the Rust reference grammar `{ (HEX_DIGIT _*){1..6} }`. Only ASCII
+/// is accepted, so the result is both a character count and a byte length.
+fn is_valid_braced_unicode(mut chars: std::str::Chars<'_>) -> Option<usize> {
+    if chars.next() != Some('{') {
+        return None;
+    }
+    let mut digits = 0;
+    // `len` starts at 2: one for the `{` consumed above, one for `c` itself.
+    for (len, c) in (2..).zip(chars) {
+        match c {
+            '}' if digits > 0 => return Some(len),
+            // `_` separators may follow a digit but never lead.
+            '_' if digits > 0 => (),
+            c if c.is_ascii_hexdigit() && digits < 6 => digits += 1,
+            _ => return None,
+        }
+    }
+    // Ran out of input before the closing brace.
+    None
+}
+
+/// Look for a valid escape sequence at the start of `input`; return its
+/// length on success, or `None` if `input` does not begin with one.
+fn is_valid_escape(input: &str) -> Option<usize> {
+    let mut chars = input.chars();
+    if chars.next() != Some('\\') {
+        return None;
+    }
+    match chars.next()? {
+        // Simple two-character escapes.
+        'n' | 'r' | 't' | '\\' | '0' | '\'' | '"' => Some(2),
+        // `\xNN` needs exactly two ASCII hex digits. Merely counting the
+        // characters that follow would accept multi-byte ones, and the
+        // length 4 would then not be a char boundary for callers that use
+        // it as a byte offset into `input`.
+        'x' => {
+            let hex = |c: Option<char>| c.map_or(false, |c| c.is_ascii_hexdigit());
+            (hex(chars.next()) && hex(chars.next())).then_some(4)
+        }
+        // `\u{…}`: add 2 for the `\u` prefix consumed above.
+        'u' => is_valid_braced_unicode(chars).map(|n| n + 2),
+        _ => None,
+    }
+}
+
+/// Split `input` into graphemes, except that each recognised escape sequence
+/// is kept together as one element.
+fn segment(input: &str) -> Vec<&str> {
+    let mut pieces = Vec::new();
+    let mut rest = input;
+    loop {
+        // Byte offset and length of the first escape sequence left in `rest`.
+        let hit = rest
+            .char_indices()
+            .find_map(|(at, _)| is_valid_escape(&rest[at..]).map(|len| (at, len)));
+        let Some((at, len)) = hit else {
+            pieces.extend(rest.graphemes(false));
+            return pieces;
+        };
+        pieces.extend(rest[..at].graphemes(false));
+        pieces.push(&rest[at..at + len]);
+        rest = &rest[at + len..];
+    }
+}
+
 pub(crate) fn rewrite_string<'a>(
     orig: &str,
     fmt: &StringFormat<'a>,
@@ -73,10 +141,11 @@ pub(crate) fn rewrite_string<'a>(
 
     // Strip line breaks.
     // With this regex applied, all remaining whitespaces are significant
-    let strip_line_breaks_re = Regex::new(r"([^\\](\\\\)*)\\[\n\r][[:space:]]*").unwrap();
+    let strip_line_breaks_re = Regex::new(r"(([^\\]|\\\\)(\\\\)*)\\[\n\r][[:space:]]*").unwrap();
     let stripped_str = strip_line_breaks_re.replace_all(orig, "$1");
 
-    let graphemes = UnicodeSegmentation::graphemes(&*stripped_str, false).collect::<Vec<&str>>();
+    // Split into graphemes, keeping every escape sequence together as a single unit.
+    let graphemes = segment(&stripped_str);
 
     // `cur_start` is the position in `orig` of the start of the current line.
     let mut cur_start = 0;
@@ -317,16 +386,14 @@ fn break_string(max_width: usize, trim_end: bool, line_end: &str, input: &[&str]
         // No whitespace found, try looking for a punctuation instead
         _ => match (0..max_width_index_in_input)
             .rev()
-            .skip_while(|pos| !is_valid_linebreak(input, *pos))
-            .next()
+            .find(|pos| is_valid_linebreak(input, *pos))
         {
             // Found a punctuation and what is on its left side is big enough.
             Some(index) if index >= MIN_STRING => break_at(index),
             // Either no boundary character was found to the left of `input[max_chars]`, or the line
             // got too small. We try searching for a boundary character to the right.
             _ => match (max_width_index_in_input..input.len())
-                .skip_while(|pos| !is_valid_linebreak(input, *pos))
-                .next()
+                .find(|pos| is_valid_linebreak(input, *pos))
             {
                 // A boundary was found after the line limit
                 Some(index) => break_at(index),
@@ -342,6 +409,10 @@ fn is_valid_linebreak(input: &[&str], pos: usize) -> bool {
     if is_whitespace {
         return true;
     }
+    let is_escape = input[pos].starts_with('\\');
+    if is_escape {
+        return true;
+    }
     let is_punctuation = is_punctuation(input[pos]);
     if is_punctuation && !is_part_of_type(input, pos) {
         return true;
@@ -378,6 +449,7 @@ mod test {
     use super::{break_string, detect_url, rewrite_string, SnippetState, StringFormat};
     use crate::config::Config;
     use crate::shape::{Indent, Shape};
+    use crate::string::{is_valid_braced_unicode, is_valid_escape, segment};
     use unicode_segmentation::UnicodeSegmentation;
 
     #[test]
@@ -722,4 +794,56 @@ mod test {
         let graphemes = UnicodeSegmentation::graphemes(&*string, false).collect::<Vec<&str>>();
         assert_eq!(detect_url(&graphemes, 8), Some(21));
     }
+
+    #[test]
+    fn test_unicode_escapes() {
+        // Well-formed payloads: the result counts the braces as well.
+        let accepted = ["{1}", "{12}", "{123}", "{1234}", "{12345}", "{123456}"];
+        for payload in accepted {
+            let expected = Some(payload.len());
+            assert_eq!(is_valid_braced_unicode(payload.chars()), expected);
+        }
+
+        // Missing or misplaced braces, an empty payload, an over-long payload
+        // and an unterminated payload must all be rejected.
+        let rejected = ["}", "}abc", "abc", "{}", "{1234567}", "{12345679"];
+        for payload in rejected {
+            assert_eq!(is_valid_braced_unicode(payload.chars()), None);
+        }
+    }
+
+    #[test]
+    fn test_valid_escapes() {
+        // `\u{…}` escapes: 2 for the `\u` prefix plus the braced payload.
+        assert_eq!(is_valid_escape("\\u{1}"), Some(5));
+        assert_eq!(is_valid_escape("\\u{12}"), Some(6));
+        assert_eq!(is_valid_escape("\\u{123}"), Some(7));
+        assert_eq!(is_valid_escape("\\u{1234}"), Some(8));
+        assert_eq!(is_valid_escape("\\u{12345}"), Some(9));
+        assert_eq!(is_valid_escape("\\u{123456}"), Some(10));
+
+        // Malformed `\u` escapes are rejected.
+        let malformed = ["\\u}", "\\u}abc", "\\uabc", "\\u{}", "\\u{1234567}", "\\u{12345679"];
+        for input in malformed {
+            assert_eq!(is_valid_escape(input), None, "{input:?}");
+        }
+
+        // Every simple two-character escape, including `\0`.
+        for input in ["\\n", "\\r", "\\t", "\\0", "\\'", "\\\"", "\\\\"] {
+            assert_eq!(is_valid_escape(input), Some(2), "{input:?}");
+        }
+
+        // Byte escape.
+        assert_eq!(is_valid_escape("\\xab"), Some(4));
+
+        // An unknown escape character is not an escape at all.
+        assert_eq!(is_valid_escape("\\f"), None);
+    }
+
+    #[test]
+    fn test_segment() {
+        assert_eq!(segment("input"), ["i", "n", "p", "u", "t"]); // no escapes
+        assert_eq!(segment("i\\nput"), ["i", r"\n", "p", "u", "t"]); // one escape
+        assert_eq!(segment("\\x61\\u{61}"), ["\\x61", "\\u{61}"]); // escapes only
+    }
 }
diff --git a/tests/source/issue-4471.rs b/tests/source/issue-4471.rs
new file mode 100644
index 00000000000..4ac72104012
--- /dev/null
+++ b/tests/source/issue-4471.rs
@@ -0,0 +1,13 @@
+// rustfmt-format_strings: true
+
+fn x() {
+    const ASCII_ESCAPE: &str = "id\u{1f}1\u{1f}/Users/nixon/dev/rs/gitstatusd\u{1f}1c9be4fe5460a30e70de9cbf99c3ec7064296b28\u{1f}master\u{1f}\u{1f}\u{1f}\u{1f}\u{1f}7\u{1f}0\u{1f}1\u{1f}0\u{1f}1\u{1f}0\u{1f}0\u{1f}0\u{1f}\u{1f}0\u{1f}0\u{1f}0\u{1f}\u{1f}\u{1f}0\u{1f}0\u{1f}0\u{1f}0";
+    let _ = "\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"";
+    let _ = "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\";
+    let _ = "a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61";
+    let _ = "a\x61aaaa\x61a\x61a\x61aaaa\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61aaaaaa\x61a\x61a\x61a";
+
+    "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\a";
+    "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\";
+    "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\";
+}
\ No newline at end of file
diff --git a/tests/target/issue-4471.rs b/tests/target/issue-4471.rs
new file mode 100644
index 00000000000..a8609eea58d
--- /dev/null
+++ b/tests/target/issue-4471.rs
@@ -0,0 +1,22 @@
+// rustfmt-format_strings: true
+
+fn x() {
+    const ASCII_ESCAPE: &str = "id\u{1f}1\u{1f}/Users/nixon/dev/rs/gitstatusd\u{1f}\
+                                1c9be4fe5460a30e70de9cbf99c3ec7064296b28\u{1f}master\u{1f}\u{1f}\
+                                \u{1f}\u{1f}\u{1f}7\u{1f}0\u{1f}1\u{1f}0\u{1f}1\u{1f}0\u{1f}\
+                                0\u{1f}0\u{1f}\u{1f}0\u{1f}0\u{1f}0\u{1f}\u{1f}\u{1f}0\u{1f}\
+                                0\u{1f}0\u{1f}0";
+    let _ = "\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\
+             \"\"\"";
+    let _ = "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
+             \\\\\\";
+    let _ = "a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61";
+    let _ = "a\x61aaaa\x61a\x61a\x61aaaa\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61a\x61\
+             aaaaaa\x61a\x61a\x61a";
+
+    "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\a";
+    "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
+     \\";
+    "\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
+     \\\\";
+}