Skip to content

Commit

Permalink
Cleanup regex_expressions.rs to remove _regexp_match function (#9107)
Browse files Browse the repository at this point in the history
* Cleanup regex_expressions.rs to remove _regexp_match function #9106

* Adding datafusion-cli Cargo.lock

* Update datafusion/physical-expr/src/regex_expressions.rs

Removed extraneous line as per code review suggestion

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>

* Update datafusion/physical-expr/src/regex_expressions.rs

Removed extraneous lines as per code review

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>

* Make rustfmt happy again.

---------

Co-authored-by: Liang-Chi Hsieh <viirya@gmail.com>
  • Loading branch information
Omega359 and viirya authored Feb 3, 2024
1 parent 7641a32 commit 9d1502b
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 87 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ arrow-flight = { version = "50.0.0", features = ["flight-sql-experimental"] }
arrow-ipc = { version = "50.0.0", default-features = false, features = ["lz4"] }
arrow-ord = { version = "50.0.0", default-features = false }
arrow-schema = { version = "50.0.0", default-features = false }
arrow-string = { version = "50.0.0", default-features = false }
async-trait = "0.1.73"
bigdecimal = "0.4.1"
bytes = "1.4"
Expand Down
1 change: 1 addition & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion/physical-expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ arrow-array = { workspace = true }
arrow-buffer = { workspace = true }
arrow-ord = { workspace = true }
arrow-schema = { workspace = true }
arrow-string = { workspace = true }
base64 = { version = "0.21", optional = true }
blake2 = { version = "^0.10.2", optional = true }
blake3 = { version = "1.0", optional = true }
Expand Down
97 changes: 10 additions & 87 deletions datafusion/physical-expr/src/regex_expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ use arrow::array::{
new_null_array, Array, ArrayDataBuilder, ArrayRef, BufferBuilder, GenericStringArray,
OffsetSizeTrait,
};
use arrow_array::builder::{GenericStringBuilder, ListBuilder};
use arrow_schema::ArrowError;

use datafusion_common::{arrow_datafusion_err, plan_err};
use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
Expand Down Expand Up @@ -61,103 +60,27 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
2 => {
let values = as_generic_string_array::<T>(&args[0])?;
let regex = as_generic_string_array::<T>(&args[1])?;
_regexp_match(values, regex, None).map_err(|e| arrow_datafusion_err!(e))
arrow_string::regexp::regexp_match(values, regex, None)
.map_err(|e| arrow_datafusion_err!(e))
}
3 => {
let values = as_generic_string_array::<T>(&args[0])?;
let regex = as_generic_string_array::<T>(&args[1])?;
let flags = Some(as_generic_string_array::<T>(&args[2])?);
let flags = as_generic_string_array::<T>(&args[2])?;

match flags {
Some(f) if f.iter().any(|s| s == Some("g")) => {
plan_err!("regexp_match() does not support the \"global\" option")
},
_ => _regexp_match(values, regex, flags).map_err(|e| arrow_datafusion_err!(e)),
if flags.iter().any(|s| s == Some("g")) {
return plan_err!("regexp_match() does not support the \"global\" option")
}

arrow_string::regexp::regexp_match(values, regex, Some(flags))
.map_err(|e| arrow_datafusion_err!(e))
}
other => internal_err!(
"regexp_match was called with {other} arguments. It requires at least 2 and at most 3."
),
}
}

/// Matches each value of `array` against the regular expression paired with it
/// from `regex_array` and collects the captured strings of every row into one
/// list entry of the returned array.
///
/// When `flags_array` is supplied, a non-null flag string is prepended to the
/// row's pattern as an inline flag group (`(?flags)pattern`); a null flag
/// leaves the pattern unchanged. A null pattern stays null regardless of flags.
///
/// Per-row results:
/// * null value or null pattern: null list entry
/// * non-null value with an empty complete pattern: a list holding one empty
///   string
/// * pattern does not match: null list entry
/// * pattern matches: the text of each capture group that took part in the
///   match; the whole-match group is skipped when the pattern defines at
///   least one explicit group
///
/// # Errors
///
/// Returns `ArrowError::ComputeError` when a pattern fails to compile.
///
/// TODO: Remove this once it is included in arrow-rs new release.
/// <https://github.com/apache/arrow-rs/pull/5235>
fn _regexp_match<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
regex_array: &GenericStringArray<OffsetSize>,
flags_array: Option<&GenericStringArray<OffsetSize>>,
) -> std::result::Result<ArrayRef, ArrowError> {
// Cache of compiled regexes keyed by the complete pattern string, so a
// pattern repeated across rows is compiled only once.
let mut patterns: std::collections::HashMap<String, Regex> =
std::collections::HashMap::new();
let builder: GenericStringBuilder<OffsetSize> =
GenericStringBuilder::with_capacity(0, 0);
let mut list_builder = ListBuilder::new(builder);

// Per-row complete pattern (flags folded in). Both branches are boxed so
// they share a single iterator type.
let complete_pattern = match flags_array {
Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
|(pattern, flags)| {
pattern.map(|pattern| match flags {
Some(value) => format!("(?{value}){pattern}"),
None => pattern.to_string(),
})
},
)) as Box<dyn Iterator<Item = Option<String>>>,
None => Box::new(
regex_array
.iter()
.map(|pattern| pattern.map(|pattern| pattern.to_string())),
),
};

array
.iter()
.zip(complete_pattern)
.map(|(value, pattern)| {
match (value, pattern) {
// Required for Postgres compatibility:
// SELECT regexp_match('foobarbequebaz', ''); = {""}
(Some(_), Some(pattern)) if pattern == *"" => {
list_builder.values().append_value("");
list_builder.append(true);
}
(Some(value), Some(pattern)) => {
// Reuse the cached regex, compiling and caching it on first use.
let existing_pattern = patterns.get(&pattern);
let re = match existing_pattern {
Some(re) => re,
None => {
let re = Regex::new(pattern.as_str()).map_err(|e| {
ArrowError::ComputeError(format!(
"Regular expression did not compile: {e:?}"
))
})?;
patterns.insert(pattern.clone(), re);
patterns.get(&pattern).unwrap()
}
};
match re.captures(value) {
Some(caps) => {
let mut iter = caps.iter();
// More than one group means the pattern has explicit
// capture groups: drop group 0 (the whole match).
if caps.len() > 1 {
iter.next();
}
// `flatten` skips groups that did not participate.
for m in iter.flatten() {
list_builder.values().append_value(m.as_str());
}

list_builder.append(true);
}
// No match: null list entry.
None => list_builder.append(false),
}
}
// Null value or null pattern: null list entry.
_ => list_builder.append(false),
}
Ok(())
})
// Collecting into `Result` drives the iterator and stops at the first
// compile error, which `?` then returns.
.collect::<std::result::Result<Vec<()>, ArrowError>>()?;
Ok(Arc::new(list_builder.finish()))
}

/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
/// used by regexp_replace
fn regex_replace_posix_groups(replacement: &str) -> String {
Expand Down Expand Up @@ -284,7 +207,7 @@ fn _regexp_replace_early_abort<T: OffsetSizeTrait>(
Ok(new_null_array(input_array.data_type(), input_array.len()))
}

/// Special cased regex_replace implementation for the scenerio where
/// Special cased regex_replace implementation for the scenario where
/// the pattern, replacement and flags are static (arrays that are derived
/// from scalars). This means we can skip regex caching system and basically
/// hold a single Regex object for the replace operation. This also speeds
Expand Down

0 comments on commit 9d1502b

Please sign in to comment.