From a0b67a2724bf6f9c86747f69b3d7ff0849f8827a Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Thu, 1 Feb 2024 23:19:21 -0500 Subject: [PATCH 1/5] Cleanup regex_expressions.rs to remove _regexp_match function #9106 --- Cargo.toml | 1 + datafusion/physical-expr/Cargo.toml | 1 + .../physical-expr/src/regex_expressions.rs | 101 +++--------------- 3 files changed, 15 insertions(+), 88 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4c0e7bde26b8..e99618dd9f9e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ arrow-flight = { version = "50.0.0", features = ["flight-sql-experimental"] } arrow-ipc = { version = "50.0.0", default-features = false, features = ["lz4"] } arrow-ord = { version = "50.0.0", default-features = false } arrow-schema = { version = "50.0.0", default-features = false } +arrow-string = { version = "50.0.0", default-features = false } async-trait = "0.1.73" bigdecimal = "0.4.1" bytes = "1.4" diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index dc3ecdb14fb5..029f4565c04a 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -49,6 +49,7 @@ arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-ord = { workspace = true } arrow-schema = { workspace = true } +arrow-string = { workspace = true } base64 = { version = "0.21", optional = true } blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs index bdd272563e75..926b5f601bce 100644 --- a/datafusion/physical-expr/src/regex_expressions.rs +++ b/datafusion/physical-expr/src/regex_expressions.rs @@ -25,8 +25,7 @@ use arrow::array::{ new_null_array, Array, ArrayDataBuilder, ArrayRef, BufferBuilder, GenericStringArray, OffsetSizeTrait, }; -use arrow_array::builder::{GenericStringBuilder, ListBuilder}; -use arrow_schema::ArrowError; + use datafusion_common::{arrow_datafusion_err, plan_err}; use datafusion_common::{ cast::as_generic_string_array, internal_err, DataFusionError, Result, @@ -61,19 +60,20 @@ pub fn regexp_match(args: &[ArrayRef]) -> Result { 2 => { let values = as_generic_string_array::(&args[0])?; let regex = as_generic_string_array::(&args[1])?; - _regexp_match(values, regex, None).map_err(|e| arrow_datafusion_err!(e)) + arrow_string::regexp::regexp_match(values, regex, None) + .map_err(|e| arrow_datafusion_err!(e)) } 3 => { let values = as_generic_string_array::(&args[0])?; let regex = as_generic_string_array::(&args[1])?; - let flags = Some(as_generic_string_array::(&args[2])?); + let flags = as_generic_string_array::(&args[2])?; - match flags { - Some(f) if f.iter().any(|s| s == Some("g")) => { - plan_err!("regexp_match() does not support the \"global\" option") - }, - _ => _regexp_match(values, regex, flags).map_err(|e| arrow_datafusion_err!(e)), + if flags.iter().any(|s| s == Some("g")) { + return plan_err!("regexp_match() does not support the \"global\" option") } + + arrow_string::regexp::regexp_match(values, regex, Some(flags)) + .map_err(|e| arrow_datafusion_err!(e)) } other => internal_err!( "regexp_match was called with {other} arguments. It requires at least 2 and at most 3." @@ -81,83 +81,6 @@ pub fn regexp_match(args: &[ArrayRef]) -> Result { } } -/// TODO: Remove this once it is included in arrow-rs new release. -/// -fn _regexp_match( - array: &GenericStringArray, - regex_array: &GenericStringArray, - flags_array: Option<&GenericStringArray>, -) -> std::result::Result { - let mut patterns: std::collections::HashMap = - std::collections::HashMap::new(); - let builder: GenericStringBuilder = - GenericStringBuilder::with_capacity(0, 0); - let mut list_builder = ListBuilder::new(builder); - - let complete_pattern = match flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(value) => format!("(?{value}){pattern}"), - None => pattern.to_string(), - }) - }, - )) as Box>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; - - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - // Required for Postgres compatibility: - // SELECT regexp_match('foobarbequebaz', ''); = {""} - (Some(_), Some(pattern)) if pattern == *"" => { - list_builder.values().append_value(""); - list_builder.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re, - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {e:?}" - )) - })?; - patterns.insert(pattern.clone(), re); - patterns.get(&pattern).unwrap() - } - }; - match re.captures(value) { - Some(caps) => { - let mut iter = caps.iter(); - if caps.len() > 1 { - iter.next(); - } - for m in iter.flatten() { - list_builder.values().append_value(m.as_str()); - } - - list_builder.append(true); - } - None => list_builder.append(false), - } - } - _ => list_builder.append(false), - } - Ok(()) - }) - .collect::, ArrowError>>()?; - Ok(Arc::new(list_builder.finish())) -} - /// replace POSIX capture groups (like \1) with Rust Regex group (like ${1}) /// used by regexp_replace fn regex_replace_posix_groups(replacement: &str) -> String { @@ -284,7 +207,7 @@ fn _regexp_replace_early_abort( Ok(new_null_array(input_array.data_type(), input_array.len())) } -/// Special cased regex_replace implementation for the scenerio where +/// Special cased regex_replace implementation for the scenario where /// the pattern, replacement and flags are static (arrays that are derived /// from scalars). This means we can skip regex caching system and basically /// hold a single Regex object for the replace operation. This also speeds @@ -409,10 +332,12 @@ pub fn specialize_regexp_replace( #[cfg(test)] mod tests { - use super::*; use arrow::array::*; + use datafusion_common::ScalarValue; + use super::*; + #[test] fn test_case_sensitive_regexp_match() { let values = StringArray::from(vec!["abc"; 5]); From 25f690dcc54c0e077f9b877165af142fce174181 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Fri, 2 Feb 2024 09:27:21 -0500 Subject: [PATCH 2/5] Adding datafusion-cli Cargo.lock --- datafusion-cli/Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 4d5b3b711d33..94c57bd770ec 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1264,6 +1264,7 @@ dependencies = [ "arrow-buffer", "arrow-ord", "arrow-schema", + "arrow-string", "base64", "blake2", "blake3", From 2b851e82a9ba54f60e8f452bd8244f9f2cfd2ded Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Fri, 2 Feb 2024 19:09:33 -0500 Subject: [PATCH 3/5] Update datafusion/physical-expr/src/regex_expressions.rs Removed extraneous line as per code review suggestion Co-authored-by: Liang-Chi Hsieh --- datafusion/physical-expr/src/regex_expressions.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs index 926b5f601bce..4c7b15d93f56 100644 --- a/datafusion/physical-expr/src/regex_expressions.rs +++ b/datafusion/physical-expr/src/regex_expressions.rs @@ -333,7 +333,6 @@ pub fn specialize_regexp_replace( #[cfg(test)] mod tests { use arrow::array::*; - use datafusion_common::ScalarValue; use super::*; From b67442b89e7232df9820a1872f2aeaa44072de9e Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Fri, 2 Feb 2024 19:10:05 -0500 Subject: [PATCH 4/5] Update datafusion/physical-expr/src/regex_expressions.rs Removed extraneous lines as per code review Co-authored-by: Liang-Chi Hsieh --- datafusion/physical-expr/src/regex_expressions.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs index 4c7b15d93f56..fc99802ec248 100644 --- a/datafusion/physical-expr/src/regex_expressions.rs +++ b/datafusion/physical-expr/src/regex_expressions.rs @@ -334,7 +334,6 @@ pub fn specialize_regexp_replace( mod tests { use arrow::array::*; use datafusion_common::ScalarValue; - use super::*; #[test] From ff6983a435660a987e213005aadeefc0e6150550 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Fri, 2 Feb 2024 20:55:00 -0500 Subject: [PATCH 5/5] Make rustfmt happy again. --- datafusion/physical-expr/src/regex_expressions.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs index fc99802ec248..83733da86484 100644 --- a/datafusion/physical-expr/src/regex_expressions.rs +++ b/datafusion/physical-expr/src/regex_expressions.rs @@ -332,9 +332,9 @@ pub fn specialize_regexp_replace( #[cfg(test)] mod tests { + use super::*; use arrow::array::*; use datafusion_common::ScalarValue; - use super::*; #[test] fn test_case_sensitive_regexp_match() {