@@ -447,6 +447,14 @@ impl Hir {
447
447
. map ( |b| ClassBytesRange { start : b, end : b } ) ;
448
448
return Hir :: class ( Class :: Bytes ( ClassBytes :: new ( it) ) ) ;
449
449
}
450
+ // Similar to singleton chars, we can also look for alternations of
451
+ // classes. Those can be smushed into a single class.
452
+ if let Some ( cls) = class_chars ( & new) {
453
+ return Hir :: class ( cls) ;
454
+ }
455
+ if let Some ( cls) = class_bytes ( & new) {
456
+ return Hir :: class ( cls) ;
457
+ }
450
458
let props = Properties :: alternation ( & new) ;
451
459
Hir { kind : HirKind :: Alternation ( new) , props }
452
460
}
@@ -854,6 +862,23 @@ impl ClassUnicode {
854
862
None
855
863
}
856
864
}
865
+
866
+ /// If this class consists of only ASCII ranges, then return its
867
+ /// corresponding and equivalent byte class.
868
+ pub fn to_byte_class ( & self ) -> Option < ClassBytes > {
869
+ if !self . is_all_ascii ( ) {
870
+ return None ;
871
+ }
872
+ Some ( ClassBytes :: new ( self . ranges ( ) . iter ( ) . map ( |r| {
873
+ // Since we are guaranteed that our codepoint range is ASCII, the
874
+ // 'u8::try_from' calls below are guaranteed to be correct.
875
+ ClassBytesRange {
876
+ // MSRV(1.59): Use 'u8::try_from(c)' instead.
877
+ start : u8:: try_from ( u32:: from ( r. start ) ) . unwrap ( ) ,
878
+ end : u8:: try_from ( u32:: from ( r. end ) ) . unwrap ( ) ,
879
+ }
880
+ } ) ) )
881
+ }
857
882
}
858
883
859
884
/// An iterator over all ranges in a Unicode character class.
@@ -1120,6 +1145,23 @@ impl ClassBytes {
1120
1145
None
1121
1146
}
1122
1147
}
1148
+
1149
+ /// If this class consists of only ASCII ranges, then return its
1150
+ /// corresponding and equivalent Unicode class.
1151
+ pub fn to_unicode_class ( & self ) -> Option < ClassUnicode > {
1152
+ if !self . is_all_ascii ( ) {
1153
+ return None ;
1154
+ }
1155
+ Some ( ClassUnicode :: new ( self . ranges ( ) . iter ( ) . map ( |r| {
1156
+ // Since we are guaranteed that our byte range is ASCII, the
1157
+ // 'char::from' calls below are correct and will not erroneously
1158
+ // convert a raw byte value into its corresponding codepoint.
1159
+ ClassUnicodeRange {
1160
+ start : char:: from ( r. start ) ,
1161
+ end : char:: from ( r. end ) ,
1162
+ }
1163
+ } ) ) )
1164
+ }
1123
1165
}
1124
1166
1125
1167
/// An iterator over all ranges in a byte character class.
@@ -1936,6 +1978,44 @@ impl Iterator for LookSetIter {
1936
1978
}
1937
1979
}
1938
1980
1981
+ /// Given a sequence of HIR values where each value corresponds to a Unicode
1982
+ /// class (or an all-ASCII byte class), return a single Unicode class
1983
+ /// corresponding to the union of the classes found.
1984
+ fn class_chars ( hirs : & [ Hir ] ) -> Option < Class > {
1985
+ let mut cls = ClassUnicode :: new ( vec ! [ ] ) ;
1986
+ for hir in hirs. iter ( ) {
1987
+ match * hir. kind ( ) {
1988
+ HirKind :: Class ( Class :: Unicode ( ref cls2) ) => {
1989
+ cls. union ( cls2) ;
1990
+ }
1991
+ HirKind :: Class ( Class :: Bytes ( ref cls2) ) => {
1992
+ cls. union ( & cls2. to_unicode_class ( ) ?) ;
1993
+ }
1994
+ _ => return None ,
1995
+ } ;
1996
+ }
1997
+ Some ( Class :: Unicode ( cls) )
1998
+ }
1999
+
2000
+ /// Given a sequence of HIR values where each value corresponds to a byte class
2001
+ /// (or an all-ASCII Unicode class), return a single byte class corresponding
2002
+ /// to the union of the classes found.
2003
+ fn class_bytes ( hirs : & [ Hir ] ) -> Option < Class > {
2004
+ let mut cls = ClassBytes :: new ( vec ! [ ] ) ;
2005
+ for hir in hirs. iter ( ) {
2006
+ match * hir. kind ( ) {
2007
+ HirKind :: Class ( Class :: Unicode ( ref cls2) ) => {
2008
+ cls. union ( & cls2. to_byte_class ( ) ?) ;
2009
+ }
2010
+ HirKind :: Class ( Class :: Bytes ( ref cls2) ) => {
2011
+ cls. union ( cls2) ;
2012
+ }
2013
+ _ => return None ,
2014
+ } ;
2015
+ }
2016
+ Some ( Class :: Bytes ( cls) )
2017
+ }
2018
+
1939
2019
/// Given a sequence of HIR values where each value corresponds to a literal
1940
2020
/// that is a single `char`, return that sequence of `char`s. Otherwise return
1941
2021
/// None. No deduplication is done.
0 commit comments