diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs index cbd892672152..81a33361008f 100644 --- a/datafusion/core/tests/expr_api/mod.rs +++ b/datafusion/core/tests/expr_api/mod.rs @@ -37,14 +37,14 @@ mod simplification; fn test_octet_length() { #[rustfmt::skip] evaluate_expr_test( - octet_length(col("list")), + octet_length(col("id")), vec![ "+------+", "| expr |", "+------+", - "| 5 |", - "| 18 |", - "| 6 |", + "| 1 |", + "| 1 |", + "| 1 |", "+------+", ], ); diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index d1553b3315e7..320e1303a21b 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -125,6 +125,11 @@ pub enum TypeSignature { /// Fixed number of arguments of numeric types. /// See to know which type is considered numeric Numeric(usize), + /// Fixed number of arguments of all the same string types. + /// The precedence of type from high to low is Utf8View, LargeUtf8 and Utf8. + /// Null is considered as Utf8 by default + /// Dictionary with string value type is also handled. + String(usize), } #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] @@ -190,8 +195,11 @@ impl TypeSignature { .collect::>() .join(", ")] } + TypeSignature::String(num) => { + vec![format!("String({num})")] + } TypeSignature::Numeric(num) => { - vec![format!("Numeric({})", num)] + vec![format!("Numeric({num})")] } TypeSignature::Exact(types) | TypeSignature::Coercible(types) => { vec![Self::join_types(types, ", ")] @@ -280,6 +288,14 @@ impl Signature { } } + /// A specified number of string arguments + pub fn string(arg_count: usize, volatility: Volatility) -> Self { + Self { + type_signature: TypeSignature::String(arg_count), + volatility, + } + } + /// An arbitrary number of arguments of any type. 
pub fn variadic_any(volatility: Volatility) -> Self { Self { diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index e7c4f65a1b4e..6d66b8b4df44 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -959,7 +959,7 @@ fn string_concat_internal_coercion( /// based on the observation that StringArray to StringViewArray is cheap but not vice versa. /// /// Between Utf8 and LargeUtf8, we coerce to LargeUtf8. -fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { +pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { // If Utf8View is in any side, we coerce to Utf8View. diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 9000ac2538e6..143e00fa409e 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -26,8 +26,9 @@ use datafusion_common::{ utils::{coerced_fixed_size_list_to_list, list_ndims}, Result, }; -use datafusion_expr_common::signature::{ - ArrayFunctionSignature, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD, +use datafusion_expr_common::{ + signature::{ArrayFunctionSignature, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD}, + type_coercion::binary::string_coercion, }; use std::sync::Arc; @@ -176,6 +177,7 @@ fn is_well_supported_signature(type_signature: &TypeSignature) -> bool { type_signature, TypeSignature::UserDefined | TypeSignature::Numeric(_) + | TypeSignature::String(_) | TypeSignature::Coercible(_) | TypeSignature::Any(_) ) @@ -381,6 +383,67 @@ fn get_valid_types( .iter() .map(|valid_type| current_types.iter().map(|_| valid_type.clone()).collect()) .collect(), + TypeSignature::String(number) => { + if *number < 1 { + return plan_err!( + "The signature expected at least one argument but received {}", 
current_types.len() + ); + } + if *number != current_types.len() { + return plan_err!( + "The signature expected {} arguments but received {}", + number, + current_types.len() + ); + } + + fn coercion_rule( + lhs_type: &DataType, + rhs_type: &DataType, + ) -> Result<DataType> { + match (lhs_type, rhs_type) { + (DataType::Null, DataType::Null) => Ok(DataType::Utf8), + (DataType::Null, data_type) | (data_type, DataType::Null) => { + coercion_rule(data_type, &DataType::Utf8) + } + (DataType::Dictionary(_, lhs), DataType::Dictionary(_, rhs)) => { + coercion_rule(lhs, rhs) + } + (DataType::Dictionary(_, v), other) + | (other, DataType::Dictionary(_, v)) => coercion_rule(v, other), + _ => { + if let Some(coerced_type) = string_coercion(lhs_type, rhs_type) { + Ok(coerced_type) + } else { + plan_err!( + "{} and {} are not coercible to a common string type", + lhs_type, + rhs_type + ) + } + } + } + } + + // Length checked above, safe to unwrap + let mut coerced_type = current_types.first().unwrap().to_owned(); + for t in current_types.iter().skip(1) { + coerced_type = coercion_rule(&coerced_type, t)?; + } + + fn base_type_or_default_type(data_type: &DataType) -> DataType { + if data_type.is_null() { + DataType::Utf8 + } else if let DataType::Dictionary(_, v) = data_type { + base_type_or_default_type(v) + } else { + data_type.to_owned() + } + } + + vec![vec![base_type_or_default_type(&coerced_type); *number]] + } TypeSignature::Numeric(number) => { if *number < 1 { return plan_err!( diff --git a/datafusion/functions/src/macros.rs b/datafusion/functions/src/macros.rs index e47818bc86a4..e850673ef8af 100644 --- a/datafusion/functions/src/macros.rs +++ b/datafusion/functions/src/macros.rs @@ -284,7 +284,7 @@ macro_rules! 
make_math_binary_udf { use arrow::datatypes::DataType; use datafusion_common::{exec_err, DataFusionError, Result}; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; - use datafusion_expr::TypeSignature::*; + use datafusion_expr::TypeSignature; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] @@ -298,8 +298,8 @@ macro_rules! make_math_binary_udf { Self { signature: Signature::one_of( vec![ - Exact(vec![Float32, Float32]), - Exact(vec![Float64, Float64]), + TypeSignature::Exact(vec![Float32, Float32]), + TypeSignature::Exact(vec![Float64, Float64]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/math/nans.rs b/datafusion/functions/src/math/nans.rs index 2bd704a7de2e..b02839b40bd9 100644 --- a/datafusion/functions/src/math/nans.rs +++ b/datafusion/functions/src/math/nans.rs @@ -19,10 +19,9 @@ use arrow::datatypes::DataType; use datafusion_common::{exec_err, DataFusionError, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, TypeSignature}; use arrow::array::{ArrayRef, BooleanArray, Float32Array, Float64Array}; -use datafusion_expr::TypeSignature::*; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; @@ -43,7 +42,10 @@ impl IsNanFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Float32]), Exact(vec![Float64])], + vec![ + TypeSignature::Exact(vec![Float32]), + TypeSignature::Exact(vec![Float64]), + ], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/math/power.rs b/datafusion/functions/src/math/power.rs index 5b790fb56ddf..831f983d5916 100644 --- a/datafusion/functions/src/math/power.rs +++ b/datafusion/functions/src/math/power.rs @@ -25,10 +25,9 @@ use datafusion_common::{ }; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; -use datafusion_expr::{ColumnarValue, Expr, ScalarUDF}; +use 
datafusion_expr::{ColumnarValue, Expr, ScalarUDF, TypeSignature}; use arrow::array::{ArrayRef, Float64Array, Int64Array}; -use datafusion_expr::TypeSignature::*; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; @@ -52,7 +51,10 @@ impl PowerFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Int64, Int64]), Exact(vec![Float64, Float64])], + vec![ + TypeSignature::Exact(vec![Int64, Int64]), + TypeSignature::Exact(vec![Float64, Float64]), + ], Volatility::Immutable, ), aliases: vec![String::from("pow")], diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index e245ea9fa72f..a698913fff54 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -26,8 +26,7 @@ use datafusion_common::{ cast::as_generic_string_array, internal_err, DataFusionError, Result, }; use datafusion_expr::scalar_doc_sections::DOC_SECTION_REGEX; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Documentation}; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::{Arc, OnceLock}; @@ -87,10 +86,10 @@ impl RegexpLikeFunc { Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8, Utf8]), - Exact(vec![LargeUtf8, LargeUtf8]), - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), + TypeSignature::Exact(vec![Utf8, Utf8]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8]), + TypeSignature::Exact(vec![Utf8, Utf8, Utf8]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index 498b591620ee..bfec97f92c36 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs 
@@ -26,8 +26,7 @@ use datafusion_common::{arrow_datafusion_err, plan_err}; use datafusion_common::{ cast::as_generic_string_array, internal_err, DataFusionError, Result, }; -use datafusion_expr::ColumnarValue; -use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ColumnarValue, TypeSignature}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; @@ -53,10 +52,10 @@ impl RegexpMatchFunc { // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8, Utf8)`. // If that fails, it proceeds to `(LargeUtf8, Utf8)`. // TODO: Native support Utf8View for regexp_match. - Exact(vec![Utf8, Utf8]), - Exact(vec![LargeUtf8, LargeUtf8]), - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), + TypeSignature::Exact(vec![Utf8, Utf8]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8]), + TypeSignature::Exact(vec![Utf8, Utf8, Utf8]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index 3eb72a1fb5f5..bce8752af28b 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -33,7 +33,7 @@ use datafusion_common::{ }; use datafusion_expr::function::Hint; use datafusion_expr::ColumnarValue; -use datafusion_expr::TypeSignature::*; +use datafusion_expr::TypeSignature; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use regex::Regex; use std::any::Any; @@ -56,10 +56,10 @@ impl RegexpReplaceFunc { Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![Utf8View, Utf8, Utf8]), - Exact(vec![Utf8, Utf8, Utf8, Utf8]), - Exact(vec![Utf8View, Utf8, Utf8, Utf8]), + TypeSignature::Exact(vec![Utf8, Utf8, Utf8]), + TypeSignature::Exact(vec![Utf8View, Utf8, Utf8]), + TypeSignature::Exact(vec![Utf8, Utf8, Utf8, Utf8]), + 
TypeSignature::Exact(vec![Utf8View, Utf8, Utf8, Utf8]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs index 1e828d066786..8d61661f97b8 100644 --- a/datafusion/functions/src/string/ascii.rs +++ b/datafusion/functions/src/string/ascii.rs @@ -39,13 +39,8 @@ impl Default for AsciiFunc { impl AsciiFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8, Utf8View], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } diff --git a/datafusion/functions/src/string/bit_length.rs b/datafusion/functions/src/string/bit_length.rs index bd22c1504baf..7d162e7d411b 100644 --- a/datafusion/functions/src/string/bit_length.rs +++ b/datafusion/functions/src/string/bit_length.rs @@ -39,13 +39,8 @@ impl Default for BitLengthFunc { impl BitLengthFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index b2e79a7b8930..82b7599f0735 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -22,9 +22,9 @@ use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Documentation, Volatility}; -use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, +}; use std::any::Any; use std::sync::OnceLock; @@ -49,18 +49,9 @@ impl Default for BTrimFunc { impl BTrimFunc { pub fn new() -> Self { - use DataType::*; Self { signature: 
Signature::one_of( - vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. - // If that fails, it proceeds to `(Utf8, Utf8)`. - Exact(vec![Utf8View, Utf8View]), - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8View]), - Exact(vec![Utf8]), - ], + vec![TypeSignature::String(2), TypeSignature::String(1)], Volatility::Immutable, ), aliases: vec![String::from("trim")], diff --git a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs index 7fc1fa876c11..0f75731aa1c3 100644 --- a/datafusion/functions/src/string/contains.rs +++ b/datafusion/functions/src/string/contains.rs @@ -16,19 +16,17 @@ // under the License. use crate::utils::make_scalar_function; - use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray, StringViewArray}; +use arrow::compute::regexp_is_match; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{Boolean, LargeUtf8, Utf8, Utf8View}; use datafusion_common::exec_err; use datafusion_common::DataFusionError; use datafusion_common::Result; -use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, Signature, Volatility}; -use datafusion_expr::{Documentation, ScalarUDFImpl}; - -use arrow::compute::regexp_is_match; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; use std::any::Any; use std::sync::{Arc, OnceLock}; @@ -45,22 +43,8 @@ impl Default for ContainsFunc { impl ContainsFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::one_of( - vec![ - Exact(vec![Utf8View, Utf8View]), - Exact(vec![Utf8View, Utf8]), - Exact(vec![Utf8View, LargeUtf8]), - Exact(vec![Utf8, Utf8View]), - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8, LargeUtf8]), - Exact(vec![LargeUtf8, Utf8View]), - Exact(vec![LargeUtf8, Utf8]), - 
Exact(vec![LargeUtf8, LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(2, Volatility::Immutable), } } } @@ -132,39 +116,6 @@ pub fn contains(args: &[ArrayRef]) -> Result { Ok(Arc::new(res) as ArrayRef) } - (Utf8View, Utf8) => { - let mod_str = args[0].as_string_view(); - let match_str = args[1].as_string::(); - let res = regexp_is_match::< - StringViewArray, - GenericStringArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } - (Utf8View, LargeUtf8) => { - let mod_str = args[0].as_string_view(); - let match_str = args[1].as_string::(); - let res = regexp_is_match::< - StringViewArray, - GenericStringArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } - (Utf8, Utf8View) => { - let mod_str = args[0].as_string::(); - let match_str = args[1].as_string_view(); - let res = regexp_is_match::< - GenericStringArray, - StringViewArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } (Utf8, Utf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); @@ -176,39 +127,6 @@ pub fn contains(args: &[ArrayRef]) -> Result { Ok(Arc::new(res) as ArrayRef) } - (Utf8, LargeUtf8) => { - let mod_str = args[0].as_string::(); - let match_str = args[1].as_string::(); - let res = regexp_is_match::< - GenericStringArray, - GenericStringArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } - (LargeUtf8, Utf8View) => { - let mod_str = args[0].as_string::(); - let match_str = args[1].as_string_view(); - let res = regexp_is_match::< - GenericStringArray, - StringViewArray, - GenericStringArray, - >(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } - (LargeUtf8, Utf8) => { - let mod_str = args[0].as_string::(); - let match_str = args[1].as_string::(); - let res = regexp_is_match::< - GenericStringArray, - GenericStringArray, - GenericStringArray, - 
>(mod_str, match_str, None)?; - - Ok(Arc::new(res) as ArrayRef) - } (LargeUtf8, LargeUtf8) => { let mod_str = args[0].as_string::(); let match_str = args[1].as_string::(); @@ -225,95 +143,3 @@ pub fn contains(args: &[ArrayRef]) -> Result { } } } - -#[cfg(test)] -mod tests { - use crate::string::contains::ContainsFunc; - use crate::utils::test::test_function; - use arrow::array::Array; - use arrow::{array::BooleanArray, datatypes::DataType::Boolean}; - use datafusion_common::Result; - use datafusion_common::ScalarValue; - use datafusion_expr::ColumnarValue; - use datafusion_expr::ScalarUDFImpl; - #[test] - fn test_functions() -> Result<()> { - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::Scalar(ScalarValue::from("alphabet")), - ColumnarValue::Scalar(ScalarValue::from("alph")), - ], - Ok(Some(true)), - bool, - Boolean, - BooleanArray - ); - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::Scalar(ScalarValue::from("alphabet")), - ColumnarValue::Scalar(ScalarValue::from("dddddd")), - ], - Ok(Some(false)), - bool, - Boolean, - BooleanArray - ); - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::Scalar(ScalarValue::from("alphabet")), - ColumnarValue::Scalar(ScalarValue::from("pha")), - ], - Ok(Some(true)), - bool, - Boolean, - BooleanArray - ); - - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( - "Apache" - )))), - ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("pac")))), - ], - Ok(Some(true)), - bool, - Boolean, - BooleanArray - ); - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( - "Apache" - )))), - ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("ap")))), - ], - Ok(Some(false)), - bool, - Boolean, - BooleanArray - ); - test_function!( - ContainsFunc::new(), - &[ - ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( - "Apache" - )))), - 
ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from( - "DataFusion" - )))), - ], - Ok(Some(false)), - bool, - Boolean, - BooleanArray - ); - - Ok(()) - } -} diff --git a/datafusion/functions/src/string/ends_with.rs b/datafusion/functions/src/string/ends_with.rs index 786010764cc3..8c90cbc3b146 100644 --- a/datafusion/functions/src/string/ends_with.rs +++ b/datafusion/functions/src/string/ends_with.rs @@ -24,7 +24,6 @@ use arrow::datatypes::DataType; use crate::utils::make_scalar_function; use datafusion_common::{internal_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; @@ -42,17 +41,7 @@ impl Default for EndsWithFunc { impl EndsWithFunc { pub fn new() -> Self { Self { - signature: Signature::one_of( - vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. - // If that fails, it proceeds to `(Utf8, Utf8)`. 
- Exact(vec![DataType::Utf8View, DataType::Utf8View]), - Exact(vec![DataType::Utf8, DataType::Utf8]), - Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(2, Volatility::Immutable), } } } diff --git a/datafusion/functions/src/string/initcap.rs b/datafusion/functions/src/string/initcap.rs index ffd60bb6e979..78c95b9a5e35 100644 --- a/datafusion/functions/src/string/initcap.rs +++ b/datafusion/functions/src/string/initcap.rs @@ -41,13 +41,8 @@ impl Default for InitcapFunc { impl InitcapFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8, Utf8View], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs index 2f121426f1f8..558e71239f84 100644 --- a/datafusion/functions/src/string/levenshtein.rs +++ b/datafusion/functions/src/string/levenshtein.rs @@ -26,7 +26,6 @@ use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::utils::datafusion_strsim; use datafusion_common::{exec_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; @@ -44,14 +43,7 @@ impl Default for LevenshteinFunc { impl LevenshteinFunc { pub fn new() -> Self { Self { - signature: Signature::one_of( - vec![ - Exact(vec![DataType::Utf8View, DataType::Utf8View]), - Exact(vec![DataType::Utf8, DataType::Utf8]), - Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(2, Volatility::Immutable), } } } diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs index 25acfc276013..f82b11ca9051 100644 --- 
a/datafusion/functions/src/string/lower.rs +++ b/datafusion/functions/src/string/lower.rs @@ -39,13 +39,8 @@ impl Default for LowerFunc { impl LowerFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8, Utf8View], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index 1fcde9e97a1d..b64dcda7218e 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -25,8 +25,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Documentation, Volatility}; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; /// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed. @@ -49,18 +48,9 @@ impl Default for LtrimFunc { impl LtrimFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( - vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. - // If that fails, it proceeds to `(Utf8, Utf8)`. 
- Exact(vec![Utf8View, Utf8View]), - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8View]), - Exact(vec![Utf8]), - ], + vec![TypeSignature::String(2), TypeSignature::String(1)], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index 195a6c296c47..04094396fadc 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -39,13 +39,8 @@ impl Default for OctetLengthFunc { impl OctetLengthFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8, Utf8View], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } diff --git a/datafusion/functions/src/string/overlay.rs b/datafusion/functions/src/string/overlay.rs index ec33840a0b0e..3b31bc360851 100644 --- a/datafusion/functions/src/string/overlay.rs +++ b/datafusion/functions/src/string/overlay.rs @@ -27,8 +27,7 @@ use datafusion_common::cast::{ }; use datafusion_common::{exec_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Documentation, Volatility}; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; #[derive(Debug)] @@ -48,12 +47,12 @@ impl OverlayFunc { Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8View, Utf8View, Int64, Int64]), - Exact(vec![Utf8, Utf8, Int64, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]), - Exact(vec![Utf8View, Utf8View, Int64]), - Exact(vec![Utf8, Utf8, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64]), + TypeSignature::Exact(vec![Utf8View, Utf8View, Int64, Int64]), + TypeSignature::Exact(vec![Utf8, Utf8, Int64, Int64]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]), + TypeSignature::Exact(vec![Utf8View, Utf8View, Int64]), + 
TypeSignature::Exact(vec![Utf8, Utf8, Int64]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, Int64]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index 3abd1767bb0a..fda9c7a13df6 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -18,20 +18,18 @@ use std::any::Any; use std::sync::{Arc, OnceLock}; +use crate::string::common::StringArrayType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, OffsetSizeTrait, StringViewArray, }; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{Int64, LargeUtf8, Utf8, Utf8View}; - -use crate::string::common::StringArrayType; -use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Documentation, Volatility}; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; #[derive(Debug)] @@ -53,9 +51,9 @@ impl RepeatFunc { // Planner attempts coercion to the target type starting with the most preferred candidate. // For example, given input `(Utf8View, Int64)`, it first tries coercing to `(Utf8View, Int64)`. // If that fails, it proceeds to `(Utf8, Int64)`. 
- Exact(vec![Utf8View, Int64]), - Exact(vec![Utf8, Int64]), - Exact(vec![LargeUtf8, Int64]), + TypeSignature::Exact(vec![Utf8View, Int64]), + TypeSignature::Exact(vec![Utf8, Int64]), + TypeSignature::Exact(vec![LargeUtf8, Int64]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index 7c985b44ab9a..612cd7276bab 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -25,7 +25,6 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; @@ -42,16 +41,8 @@ impl Default for ReplaceFunc { impl ReplaceFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::one_of( - vec![ - Exact(vec![Utf8View, Utf8View, Utf8View]), - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(3, Volatility::Immutable), } } } diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index 6743ad99d3bc..1a27502a2082 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -25,8 +25,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Documentation, Volatility}; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; /// Returns the 
longest string with trailing characters removed. If the characters are not specified, whitespace is removed. @@ -49,18 +48,9 @@ impl Default for RtrimFunc { impl RtrimFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( - vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. - // If that fails, it proceeds to `(Utf8, Utf8)`. - Exact(vec![Utf8View, Utf8View]), - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8View]), - Exact(vec![Utf8]), - ], + vec![TypeSignature::String(2), TypeSignature::String(1)], Volatility::Immutable, ), } diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs index 2424103c84bf..2441798c38d4 100644 --- a/datafusion/functions/src/string/split_part.rs +++ b/datafusion/functions/src/string/split_part.rs @@ -25,8 +25,7 @@ use datafusion_common::cast::as_int64_array; use datafusion_common::ScalarValue; use datafusion_common::{exec_err, DataFusionError, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Documentation, Volatility}; +use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; use std::any::Any; use std::sync::{Arc, OnceLock}; @@ -50,15 +49,15 @@ impl SplitPartFunc { Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8View, Utf8View, Int64]), - Exact(vec![Utf8View, Utf8, Int64]), - Exact(vec![Utf8View, LargeUtf8, Int64]), - Exact(vec![Utf8, Utf8View, Int64]), - Exact(vec![Utf8, Utf8, Int64]), - Exact(vec![LargeUtf8, Utf8View, Int64]), - Exact(vec![LargeUtf8, Utf8, Int64]), - Exact(vec![Utf8, LargeUtf8, Int64]), - Exact(vec![LargeUtf8, LargeUtf8, Int64]), + TypeSignature::Exact(vec![Utf8View, Utf8View, Int64]), + 
TypeSignature::Exact(vec![Utf8View, Utf8, Int64]), + TypeSignature::Exact(vec![Utf8View, LargeUtf8, Int64]), + TypeSignature::Exact(vec![Utf8, Utf8View, Int64]), + TypeSignature::Exact(vec![Utf8, Utf8, Int64]), + TypeSignature::Exact(vec![LargeUtf8, Utf8View, Int64]), + TypeSignature::Exact(vec![LargeUtf8, Utf8, Int64]), + TypeSignature::Exact(vec![Utf8, LargeUtf8, Int64]), + TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, Int64]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index ff4bf01c993f..713b642d5e91 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -24,7 +24,6 @@ use arrow::datatypes::DataType; use crate::utils::make_scalar_function; use datafusion_common::{internal_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::*; use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; @@ -49,17 +48,7 @@ impl Default for StartsWithFunc { impl StartsWithFunc { pub fn new() -> Self { Self { - signature: Signature::one_of( - vec![ - // Planner attempts coercion to the target type starting with the most preferred candidate. - // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`. - // If that fails, it proceeds to `(Utf8, Utf8)`. 
- Exact(vec![DataType::Utf8View, DataType::Utf8View]), - Exact(vec![DataType::Utf8, DataType::Utf8]), - Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(2, Volatility::Immutable), } } } diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs index caef7f655222..bfcb2a86994d 100644 --- a/datafusion/functions/src/string/upper.rs +++ b/datafusion/functions/src/string/upper.rs @@ -38,13 +38,8 @@ impl Default for UpperFunc { impl UpperFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::uniform( - 1, - vec![Utf8, LargeUtf8, Utf8View], - Volatility::Immutable, - ), + signature: Signature::string(1, Volatility::Immutable), } } } diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs index eaff62c338a0..660adc7578a5 100644 --- a/datafusion/functions/src/unicode/strpos.rs +++ b/datafusion/functions/src/unicode/strpos.rs @@ -18,14 +18,12 @@ use std::any::Any; use std::sync::{Arc, OnceLock}; -use arrow::array::{ArrayRef, ArrowPrimitiveType, AsArray, PrimitiveArray}; -use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; - use crate::string::common::StringArrayType; use crate::utils::{make_scalar_function, utf8_to_int_type}; +use arrow::array::{ArrayRef, ArrowPrimitiveType, AsArray, PrimitiveArray}; +use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; use datafusion_common::{exec_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; @@ -44,20 +42,8 @@ impl Default for StrposFunc { impl StrposFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::one_of( - vec![ - Exact(vec![Utf8, Utf8]), - Exact(vec![Utf8, LargeUtf8]), - Exact(vec![LargeUtf8, Utf8]), - Exact(vec![LargeUtf8, 
LargeUtf8]), - Exact(vec![Utf8View, Utf8View]), - Exact(vec![Utf8View, Utf8]), - Exact(vec![Utf8View, LargeUtf8]), - ], - Volatility::Immutable, - ), + signature: Signature::string(2, Volatility::Immutable), aliases: vec![String::from("instr"), String::from("position")], } } diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 8820fffaeb47..0c2fa41e5bf8 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1906,12 +1906,8 @@ select position('' in '') ---- 1 - -query I +query error DataFusion error: Error during planning: Error during planning: Int64 and Int64 are not coercible to a common string select position(1 in 1) ----- -1 - query I select strpos('abc', 'c'); diff --git a/datafusion/sqllogictest/test_files/string/init_data.slt.part b/datafusion/sqllogictest/test_files/string/init_data.slt.part index d99401f10d20..096e3bb3b330 100644 --- a/datafusion/sqllogictest/test_files/string/init_data.slt.part +++ b/datafusion/sqllogictest/test_files/string/init_data.slt.part @@ -30,4 +30,3 @@ statement ok create table test_substr_base ( col1 VARCHAR ) as values ('foo'), ('hello🌏世界'), ('💩'), ('ThisIsAVeryLongASCIIString'), (''), (NULL); - diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index e7b55c9c1c8c..e01a40586fe0 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -92,6 +92,21 @@ FROM test_source; statement ok drop table test_source +######## +## StringView Function test +######## + +query error DataFusion error: Arrow error: Compute error: bit_length not supported for Utf8View +select bit_length(column1_utf8view) from test; + +query T +select btrim(column1_large_utf8) from test; +---- +Andrew +Xiangpeng +Raphael +NULL + ######## ## StringView to Other Types column ######## @@ 
-299,9 +314,8 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: starts_with(__common_expr_1, test.column2_utf8view) AS c1, starts_with(test.column1_utf8, test.column2_utf8) AS c3, starts_with(__common_expr_1, CAST(test.column2_large_utf8 AS Utf8View)) AS c4 -02)--Projection: CAST(test.column1_utf8 AS Utf8View) AS __common_expr_1, test.column1_utf8, test.column2_utf8, test.column2_large_utf8, test.column2_utf8view -03)----TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view] +01)Projection: starts_with(CAST(test.column1_utf8 AS Utf8View), test.column2_utf8view) AS c1, starts_with(test.column1_utf8, test.column2_utf8) AS c3, starts_with(CAST(test.column1_utf8 AS LargeUtf8), test.column2_large_utf8) AS c4 +02)--TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view] query BBB SELECT @@ -591,7 +605,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: contains(test.column1_utf8view, Utf8("foo")) AS c1, contains(test.column1_utf8view, test.column2_utf8view) AS c2, contains(test.column1_utf8view, test.column2_large_utf8) AS c3, contains(test.column1_utf8, test.column2_utf8view) AS c4, contains(test.column1_utf8, test.column2_utf8) AS c5, contains(test.column1_utf8, test.column2_large_utf8) AS c6, contains(test.column1_large_utf8, test.column1_utf8view) AS c7, contains(test.column1_large_utf8, test.column2_utf8) AS c8, contains(test.column1_large_utf8, test.column2_large_utf8) AS c9 +01)Projection: contains(test.column1_utf8view, Utf8View("foo")) AS c1, contains(test.column1_utf8view, test.column2_utf8view) AS c2, contains(test.column1_utf8view, CAST(test.column2_large_utf8 AS Utf8View)) AS c3, contains(CAST(test.column1_utf8 AS Utf8View), test.column2_utf8view) AS c4, contains(test.column1_utf8, test.column2_utf8) AS c5, contains(CAST(test.column1_utf8 AS LargeUtf8), test.column2_large_utf8) AS c6, contains(CAST(test.column1_large_utf8 AS Utf8View), 
test.column1_utf8view) AS c7, contains(test.column1_large_utf8, CAST(test.column2_utf8 AS LargeUtf8)) AS c8, contains(test.column1_large_utf8, test.column2_large_utf8) AS c9 02)--TableScan: test projection=[column1_utf8, column2_utf8, column1_large_utf8, column2_large_utf8, column1_utf8view, column2_utf8view] ## Ensure no casts for ENDS_WITH @@ -835,7 +849,7 @@ EXPLAIN SELECT FROM test; ---- logical_plan -01)Projection: strpos(test.column1_utf8view, Utf8("f")) AS c, strpos(test.column1_utf8view, test.column2_utf8view) AS c2 +01)Projection: strpos(test.column1_utf8view, Utf8View("f")) AS c, strpos(test.column1_utf8view, test.column2_utf8view) AS c2 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for SUBSTR