Skip to content

Commit e06a946

Browse files
committed
syntax: rewrite 'cls1|..|clsN' as '[cls1..clsN]'
Whenever we have an alternation where each of its branches is just a class, we can always combine them into a single class. Single classes are generally going to be cheaper to process further down the pipeline. Namely, instead of needing to branch between them at a higher level in an NFA graph, they can be handled as one single unit.
1 parent 2904c80 commit e06a946

File tree

2 files changed

+119
-0
lines changed

2 files changed

+119
-0
lines changed

regex-syntax/src/hir/mod.rs

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,14 @@ impl Hir {
447447
.map(|b| ClassBytesRange { start: b, end: b });
448448
return Hir::class(Class::Bytes(ClassBytes::new(it)));
449449
}
450+
// Similar to singleton chars, we can also look for alternations of
451+
// classes. Those can be smushed into a single class.
452+
if let Some(cls) = class_chars(&new) {
453+
return Hir::class(cls);
454+
}
455+
if let Some(cls) = class_bytes(&new) {
456+
return Hir::class(cls);
457+
}
450458
let props = Properties::alternation(&new);
451459
Hir { kind: HirKind::Alternation(new), props }
452460
}
@@ -854,6 +862,23 @@ impl ClassUnicode {
854862
None
855863
}
856864
}
865+
866+
/// If this class consists of only ASCII ranges, then return its
867+
/// corresponding and equivalent byte class.
868+
pub fn to_byte_class(&self) -> Option<ClassBytes> {
869+
if !self.is_all_ascii() {
870+
return None;
871+
}
872+
Some(ClassBytes::new(self.ranges().iter().map(|r| {
873+
// Since we are guaranteed that our codepoint range is ASCII, the
874+
// 'u8::try_from' calls below are guaranteed to be correct.
875+
ClassBytesRange {
876+
// MSRV(1.59): Use 'u8::try_from(c)' instead.
877+
start: u8::try_from(u32::from(r.start)).unwrap(),
878+
end: u8::try_from(u32::from(r.end)).unwrap(),
879+
}
880+
})))
881+
}
857882
}
858883

859884
/// An iterator over all ranges in a Unicode character class.
@@ -1120,6 +1145,23 @@ impl ClassBytes {
11201145
None
11211146
}
11221147
}
1148+
1149+
/// If this class consists of only ASCII ranges, then return its
1150+
/// corresponding and equivalent Unicode class.
1151+
pub fn to_unicode_class(&self) -> Option<ClassUnicode> {
1152+
if !self.is_all_ascii() {
1153+
return None;
1154+
}
1155+
Some(ClassUnicode::new(self.ranges().iter().map(|r| {
1156+
// Since we are guaranteed that our byte range is ASCII, the
1157+
// 'char::from' calls below are correct and will not erroneously
1158+
// convert a raw byte value into its corresponding codepoint.
1159+
ClassUnicodeRange {
1160+
start: char::from(r.start),
1161+
end: char::from(r.end),
1162+
}
1163+
})))
1164+
}
11231165
}
11241166

11251167
/// An iterator over all ranges in a byte character class.
@@ -1936,6 +1978,44 @@ impl Iterator for LookSetIter {
19361978
}
19371979
}
19381980

1981+
/// Given a sequence of HIR values where each value corresponds to a Unicode
1982+
/// class (or an all-ASCII byte class), return a single Unicode class
1983+
/// corresponding to the union of the classes found.
1984+
fn class_chars(hirs: &[Hir]) -> Option<Class> {
1985+
let mut cls = ClassUnicode::new(vec![]);
1986+
for hir in hirs.iter() {
1987+
match *hir.kind() {
1988+
HirKind::Class(Class::Unicode(ref cls2)) => {
1989+
cls.union(cls2);
1990+
}
1991+
HirKind::Class(Class::Bytes(ref cls2)) => {
1992+
cls.union(&cls2.to_unicode_class()?);
1993+
}
1994+
_ => return None,
1995+
};
1996+
}
1997+
Some(Class::Unicode(cls))
1998+
}
1999+
2000+
/// Given a sequence of HIR values where each value corresponds to a byte class
2001+
/// (or an all-ASCII Unicode class), return a single byte class corresponding
2002+
/// to the union of the classes found.
2003+
fn class_bytes(hirs: &[Hir]) -> Option<Class> {
2004+
let mut cls = ClassBytes::new(vec![]);
2005+
for hir in hirs.iter() {
2006+
match *hir.kind() {
2007+
HirKind::Class(Class::Unicode(ref cls2)) => {
2008+
cls.union(&cls2.to_byte_class()?);
2009+
}
2010+
HirKind::Class(Class::Bytes(ref cls2)) => {
2011+
cls.union(cls2);
2012+
}
2013+
_ => return None,
2014+
};
2015+
}
2016+
Some(Class::Bytes(cls))
2017+
}
2018+
19392019
/// Given a sequence of HIR values where each value corresponds to a literal
19402020
/// that is a single `char`, return that sequence of `char`s. Otherwise return
19412021
/// None. No deduplication is done.

regex-syntax/src/hir/translate.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1963,6 +1963,45 @@ mod tests {
19631963
);
19641964
}
19651965

1966+
// Checks that the HIR translator rewrites an alternation made up solely of
// classes, such as '[a-z]|[A-Z]', into the single class '[A-Za-z]'. Such an
// alternation is always equivalent to one class that is the union of its
// branches. (The exception is when some branches match invalid UTF-8 while
// others match non-ASCII Unicode.)
#[test]
fn cat_class_flattened() {
    let ascii_letters = t(r"[a-z]|[A-Z]");
    assert_eq!(ascii_letters, hir_uclass(&[('A', 'Z'), ('a', 'z')]));

    // The union of every individual letter category should be exactly the
    // one big letter property.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"(?x)
            \p{Lowercase_Letter}
            |\p{Uppercase_Letter}
            |\p{Titlecase_Letter}
            |\p{Modifier_Letter}
            |\p{Other_Letter}
        "),
        hir_uclass_query(ClassQuery::Binary("letter"))
    );

    // A byte class that can truly match invalid UTF-8 cannot be merged with
    // Unicode classes, so the alternation is left as it is.
    let unmergeable = t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]");
    assert_eq!(
        unmergeable,
        hir_alt(vec![
            hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
            hir_bclass(&[(b'\x90', b'\xFF')]),
            hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
        ])
    );

    // Byte classes by themselves can always be merged, even when some are
    // ASCII and others match invalid UTF-8.
    let all_bytes = t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]");
    assert_eq!(
        all_bytes,
        hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]),
    );
}
2004+
19662005
#[test]
19672006
fn class_ascii() {
19682007
assert_eq!(

0 commit comments

Comments
 (0)