
Commit fbf83cf

Upgrade chumsky to 0.10.1

Chumsky 0.10 now tracks positions using byte offsets instead of char offsets. This implicitly fixes a bug in `split_token`, where we were accidentally indexing a string at an invalid offset, pointing between Unicode characters. However, there are still more Unicode-related issues: e.g., `lsp_pos_to_offset` and `offset_to_lsp_pos` still treat positions inconsistently.

1 parent 430db80 · commit fbf83cf
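
To illustrate the byte-vs-char offset distinction behind the `split_token` fix, here is a minimal standalone sketch (not part of this commit; the string is borrowed from the new `test_unicode` test below):

    fn main() {
        let s = "build:🔥 --❄️=🔥";

        // Byte and char offsets diverge as soon as multi-byte characters appear.
        assert_eq!(s.len(), 24);           // UTF-8 bytes
        assert_eq!(s.chars().count(), 14); // Unicode scalar values

        // The space after "🔥" is the 8th char (char offset 7) but sits at
        // byte offset 10. Treating the char offset as a byte index, as the
        // old code effectively did, lands inside the 4-byte encoding of 🔥:
        let byte_offset = s.char_indices().nth(7).unwrap().0;
        assert_eq!(byte_offset, 10);
        assert_eq!(&s[byte_offset..byte_offset + 1], " "); // fine
        // &s[7..8] would panic: byte 7 is not a char boundary.
    }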

File tree

6 files changed, +183 -109 lines

Cargo.lock

Lines changed: 55 additions & 47 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ regex = "1.11.1"
 tokio = { version = "1.43.0", features = ["full"] }
 tower-lsp = { version = "0.20.0", features = ["proposed"] }
 serde = { version = "1.0", features = ["derive"] }
-chumsky = "0.9.3"
+chumsky = "0.10.1"
 dashmap = "6.1.0"
 ropey = "1.6.1"
 prost = "0.13.4"

src/diagnostic.rs

Lines changed: 37 additions & 30 deletions
@@ -1,10 +1,12 @@
-use std::path::Path;
+use std::fmt::Write as _;
+use std::{ops::Deref, path::Path};
 
-use chumsky::error::Simple;
+use chumsky::error::Rich;
 use regex::Regex;
 use ropey::Rope;
 use tower_lsp::lsp_types::{Diagnostic, DiagnosticSeverity, DiagnosticTag};
 
+use crate::tokenizer::Span;
 use crate::{
     bazel_flags::{combine_key_value_flags, BazelFlags, FlagLookupType},
     file_utils::resolve_bazelrc_path,
@@ -14,40 +16,45 @@ use crate::{
 
 pub fn diagnostics_from_parser<'a>(
     rope: &'a Rope,
-    errors: &'a [Simple<char>],
+    errors: &'a [Rich<'a, char>],
 ) -> impl Iterator<Item = Diagnostic> + 'a {
     errors.iter().filter_map(move |item| {
-        let (message, span) = match item.reason() {
-            chumsky::error::SimpleReason::Unclosed { span, delimiter } => {
-                (format!("Unclosed delimiter {}", delimiter), span.clone())
-            }
-            chumsky::error::SimpleReason::Unexpected => (
-                format!(
-                    "{}, expected {}",
-                    if item.found().is_some() {
-                        "Unexpected token in input"
-                    } else {
-                        "Unexpected end of input"
-                    },
-                    if item.expected().len() == 0 {
-                        "something else".to_string()
-                    } else {
-                        item.expected()
-                            .map(|expected| match expected {
-                                Some(expected) => expected.to_string(),
-                                None => "end of input".to_string(),
-                            })
-                            .collect::<Vec<_>>()
-                            .join(", ")
+        let (message, err_span) = match item.reason() {
+            chumsky::error::RichReason::ExpectedFound { expected, found } => {
+                let mut s = String::new();
+                if let Some(found) = found {
+                    write!(s, "Found {}", found.deref()).unwrap();
+                } else {
+                    write!(&mut s, "Unexpected end of input").unwrap();
+                }
+                write!(&mut s, ", expected ").unwrap();
+                match &expected[..] {
+                    [] => {
+                        write!(s, "something else").unwrap();
                     }
-                ),
-                item.span(),
-            ),
-            chumsky::error::SimpleReason::Custom(msg) => (msg.to_string(), item.span()),
+                    [expected] => {
+                        write!(s, "{}", expected).unwrap();
+                    }
+                    _ => {
+                        for expected in &expected[..expected.len() - 1] {
+                            write!(s, "{}", expected).unwrap();
+                            write!(s, ", ").unwrap();
+                        }
+                        write!(s, "or ").unwrap();
+                        write!(s, "{}", expected.last().unwrap()).unwrap();
+                    }
+                }
+                (s, item.span())
+            }
+            chumsky::error::RichReason::Custom(msg) => (msg.to_string(), item.span()),
         };
 
+        let span = &Span {
+            start: err_span.start,
+            end: err_span.end,
+        };
         || -> Option<Diagnostic> {
-            Some(Diagnostic::new_simple(range_to_lsp(rope, &span)?, message))
+            Some(Diagnostic::new_simple(range_to_lsp(rope, span)?, message))
        }()
     })
 }
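
For context, here is a minimal sketch of how a chumsky 0.10 parser produces the `Rich` errors consumed above. The parser and input are made up for illustration; `parse`, `into_errors`, `reason`, and `span` are the same calls this commit uses:

    use chumsky::{error::Rich, prelude::*};

    fn main() {
        // Request Rich errors via the `extra` type parameter.
        let parser = just::<_, _, extra::Err<Rich<char>>>("build");

        // chumsky 0.10 returns a ParseResult; errors are extracted
        // explicitly, mirroring `into_errors()` in src/parser.rs below.
        for err in parser.parse("bazel").into_errors() {
            println!("{} at {:?}", err, err.span());
        }
    }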

src/language_server.rs

Lines changed: 4 additions & 4 deletions
@@ -28,7 +28,7 @@ pub struct AnalyzedDocument {
     rope: Rope,
     semantic_tokens: Vec<RCSemanticToken>,
     indexed_lines: IndexedLines,
-    parser_errors: Vec<chumsky::prelude::Simple<char>>,
+    has_parser_errors: bool,
 }
 
 #[derive(Deserialize, Serialize, Debug)]
@@ -78,9 +78,9 @@ impl Backend {
             params.uri.to_string(),
             AnalyzedDocument {
                 rope,
-                parser_errors: errors,
                 semantic_tokens,
                 indexed_lines,
+                has_parser_errors: !errors.is_empty(),
             },
         );
 
@@ -319,7 +319,7 @@ impl LanguageServer for Backend {
             .ok_or(Error::invalid_params("Unknown document!"))?;
         let rope = &doc.rope;
 
-        if !doc.parser_errors.is_empty() {
+        if doc.has_parser_errors {
             return Err(Error::invalid_params(
                 "Formatting can only be applied if there are no parsing errors",
             ));
@@ -346,7 +346,7 @@
             .ok_or(Error::invalid_params("Unknown document!"))?;
         let rope = &doc.rope;
 
-        if !doc.parser_errors.is_empty() {
+        if doc.has_parser_errors {
             return Err(Error::invalid_params(
                 "Formatting can only be applied if there are no parsing errors",
             ));
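
The switch from storing the errors to a `has_parser_errors` flag is presumably forced by lifetimes: `Rich<'a, char>` borrows from the parsed text, so the errors cannot outlive it in the long-lived document map. A small sketch of the constraint (names hypothetical, not from the commit):

    use chumsky::error::Rich;

    // A long-lived document can't hold `Rich<'a, char>` values: they borrow
    // from the parsed text. Reducing to an owned summary sidesteps this.
    struct Document {
        has_parser_errors: bool,
    }

    fn analyze(_text: &str) -> Document {
        // Hypothetical stand-in for the real tokenizer's error list.
        let errors: Vec<Rich<char>> = Vec::new();
        Document { has_parser_errors: !errors.is_empty() }
    }

    fn main() {
        assert!(!analyze("build:opt").has_parser_errors);
    }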

src/parser.rs

Lines changed: 25 additions & 6 deletions
@@ -1,4 +1,4 @@
-use chumsky::{error::Simple, Parser};
+use chumsky::{error::Rich, Parser};
 
 use crate::tokenizer::{tokenizer, Span, Spanned, Token};
 
@@ -18,10 +18,10 @@ pub struct Line {
     pub span: Span,
 }
 
-pub struct ParserResult {
+pub struct ParserResult<'a> {
     pub tokens: Vec<Spanned<Token>>,
     pub lines: Vec<Line>,
-    pub errors: Vec<Simple<char>>,
+    pub errors: Vec<Rich<'a, char>>,
 }
 
 // Splits a token at a given separator, keeping the position tracking
@@ -110,7 +110,7 @@ fn parse(tokens: &[(Token, Span)], orig: &str) -> Vec<Line> {
         };
     }
     if let Some(mut l) = current_line.take() {
-        let implicit_final_newline = orig.chars().count();
+        let implicit_final_newline = orig.len();
         l.span = current_line_start..implicit_final_newline;
         result_lines.push(l);
     }
@@ -121,8 +121,9 @@
 // Parser for bazelrc files.
 pub fn parse_from_str(str: &str) -> ParserResult {
     // Tokenize
-    let (tokens_opt, errors) = tokenizer().parse_recovery(str);
-    let tokens = tokens_opt.unwrap_or(Vec::new());
+    let tokenizer_result = tokenizer().parse(str);
+    let tokens = tokenizer_result.output().unwrap_or(&Vec::new()).clone();
+    let errors = tokenizer_result.into_errors();
 
     // Parse
     let lines = parse(&tokens, str);
@@ -359,3 +360,21 @@
         )
     );
 }
+
+#[test]
+fn test_unicode() {
+    // Check that spans are tracked as UTF-8 byte offsets, also across
+    // multi-byte Unicode characters.
+    assert_eq!(
+        parse_from_str("build:🔥 --❄️=🔥").lines,
+        vec!(Line {
+            command: Some(("build".to_string(), 0..5)),
+            config: Some(("🔥".to_string(), 5..10)),
+            flags: vec!(Flag {
+                name: Some(("--❄️".to_string(), 11..19)),
+                value: Some(("🔥".to_string(), 19..24))
+            }),
+            comment: None,
+            span: 0..24
+        })
+    );
+}
