diff --git a/Cargo.toml b/Cargo.toml index 4c0e7bde26b8..e99618dd9f9e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ arrow-flight = { version = "50.0.0", features = ["flight-sql-experimental"] } arrow-ipc = { version = "50.0.0", default-features = false, features = ["lz4"] } arrow-ord = { version = "50.0.0", default-features = false } arrow-schema = { version = "50.0.0", default-features = false } +arrow-string = { version = "50.0.0", default-features = false } async-trait = "0.1.73" bigdecimal = "0.4.1" bytes = "1.4" diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 4d5b3b711d33..94c57bd770ec 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1264,6 +1264,7 @@ dependencies = [ "arrow-buffer", "arrow-ord", "arrow-schema", + "arrow-string", "base64", "blake2", "blake3", diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index dc3ecdb14fb5..029f4565c04a 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -49,6 +49,7 @@ arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-ord = { workspace = true } arrow-schema = { workspace = true } +arrow-string = { workspace = true } base64 = { version = "0.21", optional = true } blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs index bdd272563e75..83733da86484 100644 --- a/datafusion/physical-expr/src/regex_expressions.rs +++ b/datafusion/physical-expr/src/regex_expressions.rs @@ -25,8 +25,7 @@ use arrow::array::{ new_null_array, Array, ArrayDataBuilder, ArrayRef, BufferBuilder, GenericStringArray, OffsetSizeTrait, }; -use arrow_array::builder::{GenericStringBuilder, ListBuilder}; -use arrow_schema::ArrowError; + use datafusion_common::{arrow_datafusion_err, plan_err}; use datafusion_common::{ cast::as_generic_string_array, internal_err, DataFusionError, Result, @@ -61,19 +60,20 @@ pub fn regexp_match(args: &[ArrayRef]) -> Result { 2 => { let values = as_generic_string_array::(&args[0])?; let regex = as_generic_string_array::(&args[1])?; - _regexp_match(values, regex, None).map_err(|e| arrow_datafusion_err!(e)) + arrow_string::regexp::regexp_match(values, regex, None) + .map_err(|e| arrow_datafusion_err!(e)) } 3 => { let values = as_generic_string_array::(&args[0])?; let regex = as_generic_string_array::(&args[1])?; - let flags = Some(as_generic_string_array::(&args[2])?); + let flags = as_generic_string_array::(&args[2])?; - match flags { - Some(f) if f.iter().any(|s| s == Some("g")) => { - plan_err!("regexp_match() does not support the \"global\" option") - }, - _ => _regexp_match(values, regex, flags).map_err(|e| arrow_datafusion_err!(e)), + if flags.iter().any(|s| s == Some("g")) { + return plan_err!("regexp_match() does not support the \"global\" option") } + + arrow_string::regexp::regexp_match(values, regex, Some(flags)) + .map_err(|e| arrow_datafusion_err!(e)) } other => internal_err!( "regexp_match was called with {other} arguments. It requires at least 2 and at most 3." @@ -81,83 +81,6 @@ pub fn regexp_match(args: &[ArrayRef]) -> Result { } } -/// TODO: Remove this once it is included in arrow-rs new release. -/// -fn _regexp_match( - array: &GenericStringArray, - regex_array: &GenericStringArray, - flags_array: Option<&GenericStringArray>, -) -> std::result::Result { - let mut patterns: std::collections::HashMap = - std::collections::HashMap::new(); - let builder: GenericStringBuilder = - GenericStringBuilder::with_capacity(0, 0); - let mut list_builder = ListBuilder::new(builder); - - let complete_pattern = match flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(value) => format!("(?{value}){pattern}"), - None => pattern.to_string(), - }) - }, - )) as Box>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; - - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - // Required for Postgres compatibility: - // SELECT regexp_match('foobarbequebaz', ''); = {""} - (Some(_), Some(pattern)) if pattern == *"" => { - list_builder.values().append_value(""); - list_builder.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re, - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - ArrowError::ComputeError(format!( - "Regular expression did not compile: {e:?}" - )) - })?; - patterns.insert(pattern.clone(), re); - patterns.get(&pattern).unwrap() - } - }; - match re.captures(value) { - Some(caps) => { - let mut iter = caps.iter(); - if caps.len() > 1 { - iter.next(); - } - for m in iter.flatten() { - list_builder.values().append_value(m.as_str()); - } - - list_builder.append(true); - } - None => list_builder.append(false), - } - } - _ => list_builder.append(false), - } - Ok(()) - }) - .collect::, ArrowError>>()?; - Ok(Arc::new(list_builder.finish())) -} - /// replace POSIX capture groups (like \1) with Rust Regex group (like ${1}) /// used by regexp_replace fn regex_replace_posix_groups(replacement: &str) -> String { @@ -284,7 +207,7 @@ fn _regexp_replace_early_abort( Ok(new_null_array(input_array.data_type(), input_array.len())) } -/// Special cased regex_replace implementation for the scenerio where +/// Special cased regex_replace implementation for the scenario where /// the pattern, replacement and flags are static (arrays that are derived /// from scalars). This means we can skip regex caching system and basically /// hold a single Regex object for the replace operation. This also speeds