pomsky-lang
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
Lines changed: 9 additions & 9 deletions
diff --git a/Cargo.lock b/Cargo.lock
Lines changed: 4 additions & 4 deletions
diff --git a/pomsky-bin/Cargo.toml b/pomsky-bin/Cargo.toml
Lines changed: 1 addition & 1 deletion
diff --git a/pomsky-bin/src/lib.rs b/pomsky-bin/src/lib.rs
Lines changed: 18 additions & 0 deletions
diff --git a/pomsky-lib/src/exprs/char_class/char_set_item.rs b/pomsky-lib/src/exprs/char_class/char_set_item.rs
Lines changed: 1 addition & 1 deletion
diff --git a/pomsky-lib/src/regex/optimize.rs b/pomsky-lib/src/regex/optimize.rs
Lines changed: 152 additions & 21 deletions
@@ -55,22 +55,22 @@ jobs:
         os: [ubuntu-latest, macos-latest, windows-latest]
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@stable
 
       - name: Setup Java
-        uses: actions/setup-java@v4
+        uses: actions/setup-java@v5
         with:
-          java-version: '17'
+          java-version: '24'
           distribution: temurin
 
       - name: Setup .NET
-        uses: actions/setup-dotnet@v4
+        uses: actions/setup-dotnet@v5
         if: ${{ matrix.os == 'ubuntu-latest' }}
         with:
-          dotnet-version: '7.x.x'
+          dotnet-version: '8.x.x'
 
       - name: Setup Deno
         uses: denoland/setup-deno@v1
@@ -87,15 +87,15 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
 
       - name: Setup Rust
         uses: dtolnay/rust-toolchain@nightly
 
       - name: Setup Java
-        uses: actions/setup-java@v3
+        uses: actions/setup-java@v5
         with:
-          java-version: '17'
+          java-version: '24'
           distribution: temurin
 
       - name: Setup .NET
@@ -106,7 +106,7 @@ jobs:
       - name: Setup Deno
         uses: denoland/setup-deno@v1
         with:
-          deno-version: v1.x
+          deno-version: v2.x
 
       - name: Setup grcov
         env:
 
@@ -37,7 +37,7 @@ supports-color = "3.0.2"
 helptext = { version = "0.1.0", path = "../helptext" }
 serde_json = "1.0.91"
 serde = { version = "1.0.152", features = ["derive"] }
-pcre2 = { version = "0.2.5", optional = true }
+pcre2 = { version = "0.2.10", optional = true }
 regex = { version = "1.11.1", optional = true }
 ignore = { version = "0.4.23", optional = true }
 
 
@@ -148,6 +148,24 @@ fn compile(
 
             if !test_errors.is_empty() {
                 diagnostics.extend(test_errors);
+                if let Some(last) = diagnostics.last_mut() {
+                    let mut prev_help = last.help.take().unwrap_or_default();
+                    if !prev_help.is_empty() {
+                        prev_help.push('\n');
+                    }
+                    prev_help += "executed with ";
+                    match options.flavor {
+                        RegexFlavor::Pcre => {
+                            let (major, minor) = pcre2::version();
+                            prev_help += &format!("PCRE2 version {major}.{minor}");
+                        }
+                        flavor => {
+                            prev_help += &format!("{flavor:?}");
+                        }
+                    }
+                    last.help = Some(prev_help);
+                }
+
                 return CompilationResult::error(
                     path,
                     start.elapsed().as_micros(),
 
@@ -66,7 +66,7 @@ impl RegexCompoundCharSet {
 }
 
 #[cfg_attr(feature = "dbg", derive(Debug))]
-#[derive(Default)]
+#[derive(Default, PartialEq, Eq, Clone)]
 pub(crate) struct RegexCharSet {
     pub(crate) negative: bool,
     pub(crate) set: UnicodeSet,
 
@@ -2,7 +2,10 @@ use std::{mem, ops::Add};
 
 use pomsky_syntax::exprs::RepetitionKind;
 
-use crate::{exprs::group::RegexGroupKind, unicode_set::UnicodeSet};
+use crate::exprs::alternation::RegexAlternation;
+use crate::exprs::group::{RegexGroup, RegexGroupKind};
+use crate::exprs::repetition::{RegexQuantifier, RegexRepetition};
+use crate::unicode_set::UnicodeSet;
 
 use super::{Regex, RegexCharSet};
 
@@ -66,33 +69,58 @@ impl Regex {
                 }
             }
             Regex::Alternation(a) => {
+                if let Some(Regex::Literal(l)) = a.parts.first()
+                    && l.is_empty()
+                {
+                    a.parts.remove(0);
+                    let parts = mem::take(&mut a.parts);
+                    *self = Regex::Repetition(Box::new(RegexRepetition::new(
+                        Regex::Alternation(RegexAlternation { parts }),
+                        RepetitionKind { lower_bound: 0, upper_bound: Some(1) },
+                        RegexQuantifier::Lazy,
+                    )));
+                    return self.optimize();
+                }
+                if let Some(Regex::Literal(l)) = a.parts.last()
+                    && l.is_empty()
+                {
+                    a.parts.pop();
+                    let parts = mem::take(&mut a.parts);
+                    *self = Regex::Repetition(Box::new(RegexRepetition::new(
+                        Regex::Alternation(RegexAlternation { parts }),
+                        RepetitionKind { lower_bound: 0, upper_bound: Some(1) },
+                        RegexQuantifier::Greedy,
+                    )));
+                    return self.optimize();
+                }
+
                 for part in &mut a.parts {
                     part.optimize();
                 }
 
-                let mut i = 0;
-                while i < a.parts.len() - 1 {
-                    let (p1, p2) = a.parts.split_at_mut(i + 1);
-                    let lhs = &mut p1[i];
-                    let rhs = &mut p2[0];
+                let mut merged = false;
 
+                reduce_many_mut(&mut a.parts, |lhs, rhs| {
                     if lhs.is_single_char() && rhs.is_single_char() {
-                        match (lhs, rhs) {
+                        match (&mut *lhs, rhs) {
                             (Regex::Literal(lit1), Regex::Literal(lit2)) => {
+                                if lit1 == lit2 {
+                                    return true;
+                                }
                                 let mut set = UnicodeSet::new();
                                 set.add_char(lit1.chars().next().unwrap());
                                 set.add_char(lit2.chars().next().unwrap());
-                                a.parts[i] = Regex::CharSet(RegexCharSet::new(set));
-                                a.parts.remove(i + 1);
+                                *lhs = Regex::CharSet(RegexCharSet::new(set));
+                                true
                             }
-                            (Regex::Literal(lit), Regex::CharSet(set))
-                            | (Regex::CharSet(set), Regex::Literal(lit))
-                                if !set.negative =>
+                            (Regex::Literal(lit), Regex::CharSet(char_set))
+                            | (Regex::CharSet(char_set), Regex::Literal(lit))
+                                if !char_set.negative =>
                             {
-                                let mut set = std::mem::take(set);
-                                set.set.add_char(lit.chars().next().unwrap());
-                                a.parts[i] = Regex::CharSet(set);
-                                a.parts.remove(i + 1);
+                                let mut char_set = std::mem::take(char_set);
+                                char_set.set.add_char(lit.chars().next().unwrap());
+                                *lhs = Regex::CharSet(char_set);
+                                true
                             }
                             (Regex::CharSet(set1), Regex::CharSet(set2))
                                 if !set1.negative && !set2.negative =>
@@ -103,14 +131,21 @@ impl Regex {
                                 for prop in set2.set.props() {
                                     set1.set.add_prop(prop);
                                 }
-                                a.parts.remove(i + 1);
-                            }
-                            _ => {
-                                i += 1;
+                                true
                             }
+                            _ => false,
                         }
+                    } else if merge_common_prefix(lhs, rhs) {
+                        merged = true;
+                        true
                     } else {
-                        i += 1;
+                        false
+                    }
+                });
+
+                if merged {
+                    for part in &mut a.parts {
+                        part.optimize();
                     }
                 }
 
@@ -219,3 +254,99 @@ fn mul_repetitions(a: u32, b: u32) -> Option<u32> {
         Some(res)
     }
 }
+
+/// Merge adjacent elements in the Vec using the `reducer`, which processes two elements at a time.
+///
+/// When the reducer returns `true`, this indicates that they were merged into the first element
+/// in-place, so the second one is removed; the merged element is then offered to the reducer
+/// again together with its new right-hand neighbor. When the reducer returns `false`, both
+/// elements are kept and the scan moves one position to the right.
+///
+/// A Vec with fewer than two elements is left untouched and the reducer is never called.
+fn reduce_many_mut<T>(slice: &mut Vec<T>, mut reducer: impl FnMut(&mut T, &mut T) -> bool) {
+    let mut i = 0;
+    // Written as `i + 1 < len` rather than `i < len - 1`: the subtraction underflows `usize`
+    // when the Vec is empty.
+    while i + 1 < slice.len() {
+        // Split so that two disjoint mutable references can be handed out at once.
+        let (p1, p2) = slice.split_at_mut(i + 1);
+        let lhs = &mut p1[i];
+        let rhs = &mut p2[0];
+
+        if reducer(lhs, rhs) {
+            // `rhs` was folded into `lhs`; drop it and retry `lhs` against the next element.
+            slice.remove(i + 1);
+        } else {
+            i += 1;
+        }
+    }
+}
+
+/// Factors a shared leading atom out of two neighboring alternatives.
+///
+/// If [`prefix`] yields an equal prefix for both `lhs` and `rhs`, that prefix is stripped from
+/// each of them and `lhs` is replaced by a normal group consisting of the prefix followed by an
+/// alternation of the two remainders. `rhs` is left holding `Regex::default()`.
+///
+/// Returns `true` when the merge happened, `false` when either side has no prefix or the
+/// prefixes differ (in which case neither argument is modified).
+fn merge_common_prefix(lhs: &mut Regex, rhs: &mut Regex) -> bool {
+    // Turn the shared prefix into an owned `Regex` right away, so the borrows of `lhs` and
+    // `rhs` end before they are mutated below.
+    let shared = match (prefix(lhs), prefix(rhs)) {
+        (Some(left), Some(right)) if left == right => match left {
+            Prefix::Dot => Regex::Dot,
+            Prefix::Char(c) => Regex::Literal(c.to_string()),
+            Prefix::CharSet(char_set) => Regex::CharSet(char_set.clone()),
+        },
+        _ => return false,
+    };
+
+    remove_prefix(lhs);
+    remove_prefix(rhs);
+
+    let right_rest = mem::take(rhs);
+    let tail = match mem::take(lhs) {
+        // The left remainder is already an alternation: append to it instead of nesting.
+        Regex::Alternation(mut alt) => {
+            alt.parts.push(right_rest);
+            Regex::Alternation(alt)
+        }
+        left_rest => Regex::Alternation(RegexAlternation::new(vec![left_rest, right_rest])),
+    };
+
+    *lhs = Regex::Group(RegexGroup::new(vec![shared, tail], RegexGroupKind::Normal));
+    true
+}
+
+/// The leading atom of a regex, as reported by [`prefix`].
+///
+/// Two alternatives can be merged by [`merge_common_prefix`] when their prefixes compare equal.
+#[derive(Eq, PartialEq)]
+enum Prefix<'a> {
+    /// The regex starts with [`Regex::Dot`].
+    Dot,
+    /// The regex starts with this character (the first `char` of a literal).
+    Char(char),
+    /// The regex starts with this character set, borrowed from the regex itself.
+    CharSet(&'a RegexCharSet),
+}
+
+/// Returns the leading atom of `regex`, if it has one that can be factored out.
+///
+/// - A literal yields its first character; an empty literal yields `None`.
+/// - A dot or a character set yields itself.
+/// - A group of kind `RegexGroupKind::Normal` yields the prefix of its first part, if any.
+/// - Everything else yields `None`.
+fn prefix(regex: &Regex) -> Option<Prefix<'_>> {
+    match regex {
+        Regex::Dot => Some(Prefix::Dot),
+        Regex::CharSet(char_set) => Some(Prefix::CharSet(char_set)),
+        Regex::Literal(lit) => {
+            let first = lit.chars().next()?;
+            Some(Prefix::Char(first))
+        }
+        Regex::Group(group) if group.kind == RegexGroupKind::Normal => {
+            let head = group.parts.first()?;
+            prefix(head)
+        }
+        _ => None,
+    }
+}
+
+/// Strips the leading atom from `regex` in place.
+///
+/// - A literal loses its first character. The literal must be non-empty, otherwise this panics.
+/// - A dot or a character set is replaced by an empty literal.
+/// - For a group, the prefix is removed from its first part; if that leaves an empty literal at
+///   the front, the literal is dropped, and a group left with exactly one part is replaced by
+///   that part.
+/// - Any other regex is left unchanged.
+fn remove_prefix(regex: &mut Regex) {
+    match regex {
+        Regex::Literal(lit) => {
+            let first = lit.chars().next().unwrap();
+            lit.drain(..first.len_utf8());
+        }
+        Regex::CharSet(_) | Regex::Dot => {
+            *regex = Regex::Literal(String::new());
+        }
+        Regex::Group(group) => {
+            if let Some(head) = group.parts.first_mut() {
+                remove_prefix(head);
+            }
+
+            let head_is_empty_literal =
+                matches!(group.parts.first(), Some(Regex::Literal(s)) if s.is_empty());
+            if !head_is_empty_literal {
+                return;
+            }
+
+            group.parts.remove(0);
+            // A group with a single remaining part is redundant; unwrap it.
+            if group.parts.len() == 1 {
+                *regex = group.parts.pop().unwrap();
+            }
+        }
+        _ => {}
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -66,7 +66,7 @@ impl RegexCompoundCharSet {`
`66`	`66`	`}`
`67`	`67`
`68`	`68`	`#[cfg_attr(feature = "dbg", derive(Debug))]`
`69`		`-#[derive(Default)]`
	`69`	`+#[derive(Default, PartialEq, Eq, Clone)]`
`70`	`70`	`pub(crate) struct RegexCharSet {`
`71`	`71`	`pub(crate) negative: bool,`
`72`	`72`	`pub(crate) set: UnicodeSet,`