@@ -2,7 +2,10 @@ use std::{mem, ops::Add};
22
33use pomsky_syntax:: exprs:: RepetitionKind ;
44
5- use crate :: { exprs:: group:: RegexGroupKind , unicode_set:: UnicodeSet } ;
5+ use crate :: exprs:: alternation:: RegexAlternation ;
6+ use crate :: exprs:: group:: { RegexGroup , RegexGroupKind } ;
7+ use crate :: exprs:: repetition:: { RegexQuantifier , RegexRepetition } ;
8+ use crate :: unicode_set:: UnicodeSet ;
69
710use super :: { Regex , RegexCharSet } ;
811
@@ -66,33 +69,58 @@ impl Regex {
6669 }
6770 }
6871 Regex :: Alternation ( a) => {
72+ if let Some ( Regex :: Literal ( l) ) = a. parts . first ( )
73+ && l. is_empty ( )
74+ {
75+ a. parts . remove ( 0 ) ;
76+ let parts = mem:: take ( & mut a. parts ) ;
77+ * self = Regex :: Repetition ( Box :: new ( RegexRepetition :: new (
78+ Regex :: Alternation ( RegexAlternation { parts } ) ,
79+ RepetitionKind { lower_bound : 0 , upper_bound : Some ( 1 ) } ,
80+ RegexQuantifier :: Lazy ,
81+ ) ) ) ;
82+ return self . optimize ( ) ;
83+ }
84+ if let Some ( Regex :: Literal ( l) ) = a. parts . last ( )
85+ && l. is_empty ( )
86+ {
87+ a. parts . pop ( ) ;
88+ let parts = mem:: take ( & mut a. parts ) ;
89+ * self = Regex :: Repetition ( Box :: new ( RegexRepetition :: new (
90+ Regex :: Alternation ( RegexAlternation { parts } ) ,
91+ RepetitionKind { lower_bound : 0 , upper_bound : Some ( 1 ) } ,
92+ RegexQuantifier :: Greedy ,
93+ ) ) ) ;
94+ return self . optimize ( ) ;
95+ }
96+
6997 for part in & mut a. parts {
7098 part. optimize ( ) ;
7199 }
72100
73- let mut i = 0 ;
74- while i < a. parts . len ( ) - 1 {
75- let ( p1, p2) = a. parts . split_at_mut ( i + 1 ) ;
76- let lhs = & mut p1[ i] ;
77- let rhs = & mut p2[ 0 ] ;
101+ let mut merged = false ;
78102
103+ reduce_many_mut ( & mut a. parts , |lhs, rhs| {
79104 if lhs. is_single_char ( ) && rhs. is_single_char ( ) {
80- match ( lhs, rhs) {
105+ match ( & mut * lhs, rhs) {
81106 ( Regex :: Literal ( lit1) , Regex :: Literal ( lit2) ) => {
107+ if lit1 == lit2 {
108+ return true ;
109+ }
82110 let mut set = UnicodeSet :: new ( ) ;
83111 set. add_char ( lit1. chars ( ) . next ( ) . unwrap ( ) ) ;
84112 set. add_char ( lit2. chars ( ) . next ( ) . unwrap ( ) ) ;
85- a . parts [ i ] = Regex :: CharSet ( RegexCharSet :: new ( set) ) ;
86- a . parts . remove ( i + 1 ) ;
113+ * lhs = Regex :: CharSet ( RegexCharSet :: new ( set) ) ;
114+ true
87115 }
88- ( Regex :: Literal ( lit) , Regex :: CharSet ( set ) )
89- | ( Regex :: CharSet ( set ) , Regex :: Literal ( lit) )
90- if !set . negative =>
116+ ( Regex :: Literal ( lit) , Regex :: CharSet ( char_set ) )
117+ | ( Regex :: CharSet ( char_set ) , Regex :: Literal ( lit) )
118+ if !char_set . negative =>
91119 {
92- let mut set = std:: mem:: take ( set ) ;
93- set . set . add_char ( lit. chars ( ) . next ( ) . unwrap ( ) ) ;
94- a . parts [ i ] = Regex :: CharSet ( set ) ;
95- a . parts . remove ( i + 1 ) ;
120+ let mut char_set = std:: mem:: take ( char_set ) ;
121+ char_set . set . add_char ( lit. chars ( ) . next ( ) . unwrap ( ) ) ;
122+ * lhs = Regex :: CharSet ( char_set ) ;
123+ true
96124 }
97125 ( Regex :: CharSet ( set1) , Regex :: CharSet ( set2) )
98126 if !set1. negative && !set2. negative =>
@@ -103,14 +131,21 @@ impl Regex {
103131 for prop in set2. set . props ( ) {
104132 set1. set . add_prop ( prop) ;
105133 }
106- a. parts . remove ( i + 1 ) ;
107- }
108- _ => {
109- i += 1 ;
134+ true
110135 }
136+ _ => false ,
111137 }
138+ } else if merge_common_prefix ( lhs, rhs) {
139+ merged = true ;
140+ true
112141 } else {
113- i += 1 ;
142+ false
143+ }
144+ } ) ;
145+
146+ if merged {
147+ for part in & mut a. parts {
148+ part. optimize ( ) ;
114149 }
115150 }
116151
@@ -219,3 +254,99 @@ fn mul_repetitions(a: u32, b: u32) -> Option<u32> {
219254 Some ( res)
220255 }
221256}
257+
/// Merges adjacent elements of the Vec using `reducer`, which looks at two neighbouring
/// elements at a time.
///
/// The reducer receives the element at index `i` and the element at index `i + 1`.
/// When it returns `true`, the pair is considered merged into the first element in-place:
/// the second element is removed and the (now updated) first element is compared against
/// its new right-hand neighbour. When it returns `false`, the scan advances by one element.
///
/// Vectors with fewer than two elements are left untouched and the reducer is never called.
fn reduce_many_mut<T>(slice: &mut Vec<T>, mut reducer: impl FnMut(&mut T, &mut T) -> bool) {
    let mut i = 0;
    // `i + 1 < len` rather than `i < len - 1`: the subtraction underflows for an empty Vec.
    while i + 1 < slice.len() {
        // Split so that both neighbours can be borrowed mutably at the same time.
        let (head, tail) = slice.split_at_mut(i + 1);

        if reducer(&mut head[i], &mut tail[0]) {
            // The right element was folded into the left one; drop it and retry at `i`.
            slice.remove(i + 1);
        } else {
            i += 1;
        }
    }
}
277+
278+ fn merge_common_prefix ( lhs : & mut Regex , rhs : & mut Regex ) -> bool {
279+ let prefix1 = prefix ( lhs) ;
280+ let prefix2 = prefix ( rhs) ;
281+
282+ if let ( Some ( prefix1) , Some ( prefix2) ) = ( prefix1, prefix2)
283+ && prefix1 == prefix2
284+ {
285+ let prefix = match prefix1 {
286+ Prefix :: Dot => Regex :: Dot ,
287+ Prefix :: Char ( c) => Regex :: Literal ( c. to_string ( ) ) ,
288+ Prefix :: CharSet ( char_set) => Regex :: CharSet ( char_set. clone ( ) ) ,
289+ } ;
290+
291+ remove_prefix ( lhs) ;
292+ remove_prefix ( rhs) ;
293+
294+ let group = if let Regex :: Alternation ( alt) = lhs {
295+ alt. parts . push ( mem:: take ( rhs) ) ;
296+ vec ! [ prefix, mem:: take( lhs) ]
297+ } else {
298+ let alts = vec ! [ mem:: take( lhs) , mem:: take( rhs) ] ;
299+ vec ! [ prefix, Regex :: Alternation ( RegexAlternation :: new( alts) ) ]
300+ } ;
301+ * lhs = Regex :: Group ( RegexGroup :: new ( group, RegexGroupKind :: Normal ) ) ;
302+
303+ true
304+ } else {
305+ false
306+ }
307+ }
308+
309+ #[ derive( PartialEq , Eq ) ]
310+ enum Prefix < ' a > {
311+ Dot ,
312+ Char ( char ) ,
313+ CharSet ( & ' a RegexCharSet ) ,
314+ }
315+
316+ fn prefix ( regex : & Regex ) -> Option < Prefix < ' _ > > {
317+ match regex {
318+ Regex :: Literal ( lit) => lit. chars ( ) . next ( ) . map ( Prefix :: Char ) ,
319+ Regex :: CharSet ( char_set) => Some ( Prefix :: CharSet ( char_set) ) ,
320+ Regex :: Dot => Some ( Prefix :: Dot ) ,
321+ Regex :: Group ( group) if group. kind == RegexGroupKind :: Normal => {
322+ group. parts . first ( ) . and_then ( prefix)
323+ }
324+ _ => None ,
325+ }
326+ }
327+
328+ fn remove_prefix ( regex : & mut Regex ) {
329+ match regex {
330+ Regex :: Literal ( lit) => {
331+ let len = lit. chars ( ) . next ( ) . unwrap ( ) . len_utf8 ( ) ;
332+ lit. drain ( 0 ..len) ;
333+ }
334+ Regex :: CharSet ( _) | Regex :: Dot => {
335+ * regex = Regex :: Literal ( String :: new ( ) ) ;
336+ }
337+ Regex :: Group ( group) => {
338+ if let Some ( part) = group. parts . first_mut ( ) {
339+ remove_prefix ( part) ;
340+ }
341+ if let Some ( Regex :: Literal ( s) ) = group. parts . first ( )
342+ && s. is_empty ( )
343+ {
344+ group. parts . remove ( 0 ) ;
345+ if group. parts . len ( ) == 1 {
346+ * regex = group. parts . pop ( ) . unwrap ( ) ;
347+ }
348+ }
349+ }
350+ _ => { }
351+ }
352+ }
0 commit comments