Skip to content

Commit dde445a

Browse files
committed
feat: alternation optimizations, show regex engine used when test fails
1 parent 99af6ef commit dde445a

File tree

9 files changed

+219
-46
lines changed

9 files changed

+219
-46
lines changed

.github/workflows/test.yml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,22 +55,22 @@ jobs:
5555
os: [ubuntu-latest, macos-latest, windows-latest]
5656
steps:
5757
- name: Checkout
58-
uses: actions/checkout@v4
58+
uses: actions/checkout@v5
5959

6060
- name: Setup Rust
6161
uses: dtolnay/rust-toolchain@stable
6262

6363
- name: Setup Java
64-
uses: actions/setup-java@v4
64+
uses: actions/setup-java@v5
6565
with:
66-
java-version: '17'
66+
java-version: '24'
6767
distribution: temurin
6868

6969
- name: Setup .NET
70-
uses: actions/setup-dotnet@v4
70+
uses: actions/setup-dotnet@v5
7171
if: ${{ matrix.os == 'ubuntu-latest' }}
7272
with:
73-
dotnet-version: '7.x.x'
73+
dotnet-version: '8.x.x'
7474

7575
- name: Setup Deno
7676
uses: denoland/setup-deno@v1
@@ -87,15 +87,15 @@ jobs:
8787
runs-on: ubuntu-latest
8888
steps:
8989
- name: Checkout
90-
uses: actions/checkout@v4
90+
uses: actions/checkout@v5
9191

9292
- name: Setup Rust
9393
uses: dtolnay/rust-toolchain@nightly
9494

9595
- name: Setup Java
96-
uses: actions/setup-java@v3
96+
uses: actions/setup-java@v5
9797
with:
98-
java-version: '17'
98+
java-version: '24'
9999
distribution: temurin
100100

101101
- name: Setup .NET
@@ -106,7 +106,7 @@ jobs:
106106
- name: Setup Deno
107107
uses: denoland/setup-deno@v1
108108
with:
109-
deno-version: v1.x
109+
deno-version: v2.x
110110

111111
- name: Setup grcov
112112
env:

Cargo.lock

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pomsky-bin/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ supports-color = "3.0.2"
3737
helptext = { version = "0.1.0", path = "../helptext" }
3838
serde_json = "1.0.91"
3939
serde = { version = "1.0.152", features = ["derive"] }
40-
pcre2 = { version = "0.2.5", optional = true }
40+
pcre2 = { version = "0.2.10", optional = true }
4141
regex = { version = "1.11.1", optional = true }
4242
ignore = { version = "0.4.23", optional = true }
4343

pomsky-bin/src/lib.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,24 @@ fn compile(
148148

149149
if !test_errors.is_empty() {
150150
diagnostics.extend(test_errors);
151+
if let Some(last) = diagnostics.last_mut() {
152+
let mut prev_help = last.help.take().unwrap_or_default();
153+
if !prev_help.is_empty() {
154+
prev_help.push('\n');
155+
}
156+
prev_help += "executed with ";
157+
match options.flavor {
158+
RegexFlavor::Pcre => {
159+
let (major, minor) = pcre2::version();
160+
prev_help += &format!("PCRE2 version {major}.{minor}");
161+
}
162+
flavor => {
163+
prev_help += &format!("{flavor:?}");
164+
}
165+
}
166+
last.help = Some(prev_help);
167+
}
168+
151169
return CompilationResult::error(
152170
path,
153171
start.elapsed().as_micros(),

pomsky-lib/src/exprs/char_class/char_set_item.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ impl RegexCompoundCharSet {
6666
}
6767

6868
#[cfg_attr(feature = "dbg", derive(Debug))]
69-
#[derive(Default)]
69+
#[derive(Default, PartialEq, Eq, Clone)]
7070
pub(crate) struct RegexCharSet {
7171
pub(crate) negative: bool,
7272
pub(crate) set: UnicodeSet,

pomsky-lib/src/regex/optimize.rs

Lines changed: 152 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@ use std::{mem, ops::Add};
22

33
use pomsky_syntax::exprs::RepetitionKind;
44

5-
use crate::{exprs::group::RegexGroupKind, unicode_set::UnicodeSet};
5+
use crate::exprs::alternation::RegexAlternation;
6+
use crate::exprs::group::{RegexGroup, RegexGroupKind};
7+
use crate::exprs::repetition::{RegexQuantifier, RegexRepetition};
8+
use crate::unicode_set::UnicodeSet;
69

710
use super::{Regex, RegexCharSet};
811

@@ -66,33 +69,58 @@ impl Regex {
6669
}
6770
}
6871
Regex::Alternation(a) => {
72+
if let Some(Regex::Literal(l)) = a.parts.first()
73+
&& l.is_empty()
74+
{
75+
a.parts.remove(0);
76+
let parts = mem::take(&mut a.parts);
77+
*self = Regex::Repetition(Box::new(RegexRepetition::new(
78+
Regex::Alternation(RegexAlternation { parts }),
79+
RepetitionKind { lower_bound: 0, upper_bound: Some(1) },
80+
RegexQuantifier::Lazy,
81+
)));
82+
return self.optimize();
83+
}
84+
if let Some(Regex::Literal(l)) = a.parts.last()
85+
&& l.is_empty()
86+
{
87+
a.parts.pop();
88+
let parts = mem::take(&mut a.parts);
89+
*self = Regex::Repetition(Box::new(RegexRepetition::new(
90+
Regex::Alternation(RegexAlternation { parts }),
91+
RepetitionKind { lower_bound: 0, upper_bound: Some(1) },
92+
RegexQuantifier::Greedy,
93+
)));
94+
return self.optimize();
95+
}
96+
6997
for part in &mut a.parts {
7098
part.optimize();
7199
}
72100

73-
let mut i = 0;
74-
while i < a.parts.len() - 1 {
75-
let (p1, p2) = a.parts.split_at_mut(i + 1);
76-
let lhs = &mut p1[i];
77-
let rhs = &mut p2[0];
101+
let mut merged = false;
78102

103+
reduce_many_mut(&mut a.parts, |lhs, rhs| {
79104
if lhs.is_single_char() && rhs.is_single_char() {
80-
match (lhs, rhs) {
105+
match (&mut *lhs, rhs) {
81106
(Regex::Literal(lit1), Regex::Literal(lit2)) => {
107+
if lit1 == lit2 {
108+
return true;
109+
}
82110
let mut set = UnicodeSet::new();
83111
set.add_char(lit1.chars().next().unwrap());
84112
set.add_char(lit2.chars().next().unwrap());
85-
a.parts[i] = Regex::CharSet(RegexCharSet::new(set));
86-
a.parts.remove(i + 1);
113+
*lhs = Regex::CharSet(RegexCharSet::new(set));
114+
true
87115
}
88-
(Regex::Literal(lit), Regex::CharSet(set))
89-
| (Regex::CharSet(set), Regex::Literal(lit))
90-
if !set.negative =>
116+
(Regex::Literal(lit), Regex::CharSet(char_set))
117+
| (Regex::CharSet(char_set), Regex::Literal(lit))
118+
if !char_set.negative =>
91119
{
92-
let mut set = std::mem::take(set);
93-
set.set.add_char(lit.chars().next().unwrap());
94-
a.parts[i] = Regex::CharSet(set);
95-
a.parts.remove(i + 1);
120+
let mut char_set = std::mem::take(char_set);
121+
char_set.set.add_char(lit.chars().next().unwrap());
122+
*lhs = Regex::CharSet(char_set);
123+
true
96124
}
97125
(Regex::CharSet(set1), Regex::CharSet(set2))
98126
if !set1.negative && !set2.negative =>
@@ -103,14 +131,21 @@ impl Regex {
103131
for prop in set2.set.props() {
104132
set1.set.add_prop(prop);
105133
}
106-
a.parts.remove(i + 1);
107-
}
108-
_ => {
109-
i += 1;
134+
true
110135
}
136+
_ => false,
111137
}
138+
} else if merge_common_prefix(lhs, rhs) {
139+
merged = true;
140+
true
112141
} else {
113-
i += 1;
142+
false
143+
}
144+
});
145+
146+
if merged {
147+
for part in &mut a.parts {
148+
part.optimize();
114149
}
115150
}
116151

@@ -219,3 +254,99 @@ fn mul_repetitions(a: u32, b: u32) -> Option<u32> {
219254
Some(res)
220255
}
221256
}
257+
258+
/// Merge adjacent elements in the Vec using the `reducer`, which processes two
/// neighbouring elements at a time.
///
/// When the reducer returns `true`, this indicates that the pair was merged into the
/// first element in-place, so the second one is removed and the (updated) first element
/// is offered to the reducer again together with its new neighbour. When the reducer
/// returns `false`, the scan advances by one element.
///
/// A Vec with fewer than two elements is left untouched and the reducer is never called.
fn reduce_many_mut<T>(slice: &mut Vec<T>, mut reducer: impl FnMut(&mut T, &mut T) -> bool) {
    let mut i = 0;
    // Compare `i + 1 < len` instead of `i < len - 1`: the subtraction underflows
    // (and panics in debug builds) when the Vec is empty.
    while i + 1 < slice.len() {
        // Split so that both neighbours can be borrowed mutably at the same time.
        let (head, tail) = slice.split_at_mut(i + 1);

        if reducer(&mut head[i], &mut tail[0]) {
            // The right element was folded into the left one; drop it and retry at `i`.
            slice.remove(i + 1);
        } else {
            i += 1;
        }
    }
}
277+
278+
fn merge_common_prefix(lhs: &mut Regex, rhs: &mut Regex) -> bool {
279+
let prefix1 = prefix(lhs);
280+
let prefix2 = prefix(rhs);
281+
282+
if let (Some(prefix1), Some(prefix2)) = (prefix1, prefix2)
283+
&& prefix1 == prefix2
284+
{
285+
let prefix = match prefix1 {
286+
Prefix::Dot => Regex::Dot,
287+
Prefix::Char(c) => Regex::Literal(c.to_string()),
288+
Prefix::CharSet(char_set) => Regex::CharSet(char_set.clone()),
289+
};
290+
291+
remove_prefix(lhs);
292+
remove_prefix(rhs);
293+
294+
let group = if let Regex::Alternation(alt) = lhs {
295+
alt.parts.push(mem::take(rhs));
296+
vec![prefix, mem::take(lhs)]
297+
} else {
298+
let alts = vec![mem::take(lhs), mem::take(rhs)];
299+
vec![prefix, Regex::Alternation(RegexAlternation::new(alts))]
300+
};
301+
*lhs = Regex::Group(RegexGroup::new(group, RegexGroupKind::Normal));
302+
303+
true
304+
} else {
305+
false
306+
}
307+
}
308+
309+
#[derive(PartialEq, Eq)]
310+
enum Prefix<'a> {
311+
Dot,
312+
Char(char),
313+
CharSet(&'a RegexCharSet),
314+
}
315+
316+
fn prefix(regex: &Regex) -> Option<Prefix<'_>> {
317+
match regex {
318+
Regex::Literal(lit) => lit.chars().next().map(Prefix::Char),
319+
Regex::CharSet(char_set) => Some(Prefix::CharSet(char_set)),
320+
Regex::Dot => Some(Prefix::Dot),
321+
Regex::Group(group) if group.kind == RegexGroupKind::Normal => {
322+
group.parts.first().and_then(prefix)
323+
}
324+
_ => None,
325+
}
326+
}
327+
328+
fn remove_prefix(regex: &mut Regex) {
329+
match regex {
330+
Regex::Literal(lit) => {
331+
let len = lit.chars().next().unwrap().len_utf8();
332+
lit.drain(0..len);
333+
}
334+
Regex::CharSet(_) | Regex::Dot => {
335+
*regex = Regex::Literal(String::new());
336+
}
337+
Regex::Group(group) => {
338+
if let Some(part) = group.parts.first_mut() {
339+
remove_prefix(part);
340+
}
341+
if let Some(Regex::Literal(s)) = group.parts.first()
342+
&& s.is_empty()
343+
{
344+
group.parts.remove(0);
345+
if group.parts.len() == 1 {
346+
*regex = group.parts.pop().unwrap();
347+
}
348+
}
349+
}
350+
_ => {}
351+
}
352+
}

0 commit comments

Comments
 (0)