Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup regex_expressions.rs to remove _regexp_match function #9107

Merged
merged 5 commits into from
Feb 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ arrow-flight = { version = "50.0.0", features = ["flight-sql-experimental"] }
arrow-ipc = { version = "50.0.0", default-features = false, features = ["lz4"] }
arrow-ord = { version = "50.0.0", default-features = false }
arrow-schema = { version = "50.0.0", default-features = false }
arrow-string = { version = "50.0.0", default-features = false }
async-trait = "0.1.73"
bigdecimal = "0.4.1"
bytes = "1.4"
Expand Down
1 change: 1 addition & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion/physical-expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ arrow-array = { workspace = true }
arrow-buffer = { workspace = true }
arrow-ord = { workspace = true }
arrow-schema = { workspace = true }
arrow-string = { workspace = true }
base64 = { version = "0.21", optional = true }
blake2 = { version = "^0.10.2", optional = true }
blake3 = { version = "1.0", optional = true }
Expand Down
97 changes: 10 additions & 87 deletions datafusion/physical-expr/src/regex_expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ use arrow::array::{
new_null_array, Array, ArrayDataBuilder, ArrayRef, BufferBuilder, GenericStringArray,
OffsetSizeTrait,
};
use arrow_array::builder::{GenericStringBuilder, ListBuilder};
use arrow_schema::ArrowError;

Omega359 marked this conversation as resolved.
Show resolved Hide resolved
use datafusion_common::{arrow_datafusion_err, plan_err};
use datafusion_common::{
cast::as_generic_string_array, internal_err, DataFusionError, Result,
Expand Down Expand Up @@ -61,103 +60,27 @@ pub fn regexp_match<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
2 => {
let values = as_generic_string_array::<T>(&args[0])?;
let regex = as_generic_string_array::<T>(&args[1])?;
_regexp_match(values, regex, None).map_err(|e| arrow_datafusion_err!(e))
arrow_string::regexp::regexp_match(values, regex, None)
.map_err(|e| arrow_datafusion_err!(e))
}
3 => {
let values = as_generic_string_array::<T>(&args[0])?;
let regex = as_generic_string_array::<T>(&args[1])?;
let flags = Some(as_generic_string_array::<T>(&args[2])?);
let flags = as_generic_string_array::<T>(&args[2])?;

match flags {
Some(f) if f.iter().any(|s| s == Some("g")) => {
plan_err!("regexp_match() does not support the \"global\" option")
},
_ => _regexp_match(values, regex, flags).map_err(|e| arrow_datafusion_err!(e)),
if flags.iter().any(|s| s == Some("g")) {
return plan_err!("regexp_match() does not support the \"global\" option")
}

arrow_string::regexp::regexp_match(values, regex, Some(flags))
.map_err(|e| arrow_datafusion_err!(e))
}
other => internal_err!(
"regexp_match was called with {other} arguments. It requires at least 2 and at most 3."
),
}
}

/// TODO: Remove this once it is included in arrow-rs new release.
/// <https://github.com/apache/arrow-rs/pull/5235>
fn _regexp_match<OffsetSize: OffsetSizeTrait>(
array: &GenericStringArray<OffsetSize>,
regex_array: &GenericStringArray<OffsetSize>,
flags_array: Option<&GenericStringArray<OffsetSize>>,
) -> std::result::Result<ArrayRef, ArrowError> {
let mut patterns: std::collections::HashMap<String, Regex> =
std::collections::HashMap::new();
let builder: GenericStringBuilder<OffsetSize> =
GenericStringBuilder::with_capacity(0, 0);
let mut list_builder = ListBuilder::new(builder);

let complete_pattern = match flags_array {
Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
|(pattern, flags)| {
pattern.map(|pattern| match flags {
Some(value) => format!("(?{value}){pattern}"),
None => pattern.to_string(),
})
},
)) as Box<dyn Iterator<Item = Option<String>>>,
None => Box::new(
regex_array
.iter()
.map(|pattern| pattern.map(|pattern| pattern.to_string())),
),
};

array
.iter()
.zip(complete_pattern)
.map(|(value, pattern)| {
match (value, pattern) {
// Required for Postgres compatibility:
// SELECT regexp_match('foobarbequebaz', ''); = {""}
(Some(_), Some(pattern)) if pattern == *"" => {
list_builder.values().append_value("");
list_builder.append(true);
}
(Some(value), Some(pattern)) => {
let existing_pattern = patterns.get(&pattern);
let re = match existing_pattern {
Some(re) => re,
None => {
let re = Regex::new(pattern.as_str()).map_err(|e| {
ArrowError::ComputeError(format!(
"Regular expression did not compile: {e:?}"
))
})?;
patterns.insert(pattern.clone(), re);
patterns.get(&pattern).unwrap()
}
};
match re.captures(value) {
Some(caps) => {
let mut iter = caps.iter();
if caps.len() > 1 {
iter.next();
}
for m in iter.flatten() {
list_builder.values().append_value(m.as_str());
}

list_builder.append(true);
}
None => list_builder.append(false),
}
}
_ => list_builder.append(false),
}
Ok(())
})
.collect::<std::result::Result<Vec<()>, ArrowError>>()?;
Ok(Arc::new(list_builder.finish()))
}

/// replace POSIX capture groups (like \1) with Rust Regex group (like ${1})
/// used by regexp_replace
fn regex_replace_posix_groups(replacement: &str) -> String {
Expand Down Expand Up @@ -284,7 +207,7 @@ fn _regexp_replace_early_abort<T: OffsetSizeTrait>(
Ok(new_null_array(input_array.data_type(), input_array.len()))
}

/// Special cased regex_replace implementation for the scenerio where
/// Special cased regex_replace implementation for the scenario where
/// the pattern, replacement and flags are static (arrays that are derived
/// from scalars). This means we can skip regex caching system and basically
/// hold a single Regex object for the replace operation. This also speeds
Expand Down
Loading