Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Specialize Prefix/Suffix Match for Like/ILike between Array and Scalar for StringViewArray #6231

Merged
merged 11 commits into from
Aug 25, 2024
84 changes: 84 additions & 0 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, S
use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
use arrow_schema::{ArrowError, DataType};
use core::str;
use num::ToPrimitive;
use std::any::Any;
use std::fmt::Debug;
Expand Down Expand Up @@ -278,6 +279,22 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
T::Native::from_bytes_unchecked(b)
}

/// Returns the bytes at index `i`
/// # Safety
/// Caller is responsible for ensuring that the index is within the bounds of the array
pub unsafe fn bytes_unchecked(&self, idx: usize) -> &[u8] {
xinlifoobar marked this conversation as resolved.
Show resolved Hide resolved
let v = self.views.get_unchecked(idx);
let len = *v as u32;
if len <= 12 {
Self::inline_value(v, len as usize)
} else {
let view = ByteView::from(*v);
let data = self.buffers.get_unchecked(view.buffer_index as usize);
let offset = view.offset as usize;
data.get_unchecked(offset..offset + len as usize)
}
}

/// Returns the inline value of the view.
///
/// # Safety
Expand All @@ -294,6 +311,21 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
ArrayIter::new(self)
}

/// Returns an iterator over the bytes of this array.
pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
self.views.iter().map(move |v| {
let len = *v as u32;
if len <= 12 {
unsafe { Self::inline_value(v, len as usize) }
} else {
let view = ByteView::from(*v);
let data = &self.buffers[view.buffer_index as usize];
let offset = view.offset as usize;
unsafe { data.get_unchecked(offset..offset + len as usize) }
}
})
}

/// Returns a zero-copy slice of this array with the indicated offset and length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
Expand Down Expand Up @@ -667,6 +699,58 @@ impl StringViewArray {
None => true,
})
}

/// Returns an iterator over the prefix bytes of this array with respect to the prefix length.
/// If the prefix length is larger than the string length, it will return the empty string.
pub fn prefix_iter(&self, prefix_len: usize) -> impl Iterator<Item = &str> {
self.views().into_iter().map(move |v| {
let len = (*v as u32) as usize;

if len < prefix_len {
return "";
}

let b = if prefix_len <= 4 || len <= 12 {
unsafe { StringViewArray::inline_value(v, prefix_len) }
} else {
let view = ByteView::from(*v);
let data = unsafe {
self.data_buffers()
.get_unchecked(view.buffer_index as usize)
};
let offset = view.offset as usize;
unsafe { data.get_unchecked(offset..offset + prefix_len) }
};

unsafe { str::from_utf8_unchecked(b) }
})
}

/// Returns an iterator over the suffix bytes of this array with respect to the suffix length.
/// If the suffix length is larger than the string length, it will return the empty string.
pub fn suffix_iter(&self, suffix_len: usize) -> impl Iterator<Item = &str> {
self.views().into_iter().map(move |v| {
let len = (*v as u32) as usize;

if len < suffix_len {
return "";
}

let b = if len <= 12 {
unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
} else {
let view = ByteView::from(*v);
let data = unsafe {
self.data_buffers()
.get_unchecked(view.buffer_index as usize)
};
let offset = view.offset as usize;
unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
xinlifoobar marked this conversation as resolved.
Show resolved Hide resolved
};

unsafe { str::from_utf8_unchecked(b) }
})
}
}

impl From<Vec<&str>> for StringViewArray {
Expand Down
91 changes: 75 additions & 16 deletions arrow-string/src/predicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use arrow_array::{ArrayAccessor, BooleanArray};
use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray};
use arrow_schema::ArrowError;
use memchr::memchr2;
use memchr::memmem::Finder;
Expand Down Expand Up @@ -114,21 +114,80 @@ impl<'a> Predicate<'a> {
Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| {
haystack.eq_ignore_ascii_case(v) != negate
}),
Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| {
finder.find(haystack.as_bytes()).is_some() != negate
}),
Predicate::StartsWith(v) => BooleanArray::from_unary(array, |haystack| {
starts_with(haystack, v, equals_kernel) != negate
}),
Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, |haystack| {
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
}),
Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| {
ends_with(haystack, v, equals_kernel) != negate
}),
Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, |haystack| {
ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
}),
Predicate::Contains(finder) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.bytes_iter()
.map(|haystack| finder.find(haystack).is_some() != negate)
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looking more carefully at BooleanArray::from_unary it will use the ArrayAccessor impl for StringViewArray

impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
type Item = &'a T::Native;
fn value(&self, index: usize) -> Self::Item {
GenericByteViewArray::value(self, index)
}
unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
GenericByteViewArray::value_unchecked(self, index)
}
}

It isn't clear to me that how calling bytes_iter() would make this faster as the code for value_unchecked is the same as butes_iter

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the test should be we'll try benchmarking and see if this improves things

Copy link
Contributor Author

@xinlifoobar xinlifoobar Aug 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ya, I thought the only differences between bytes_iter and ArrayAccessor is

For bytes_iter

self.views().has_next()? -> self.views().next() -> value_unchecked()

For ArrayAccessor

index = index + 1 -> self.views.get_unchecked(idx) -> str(value_unchecked()).as_bytes()

There are merely differences between the indexing operations and iterator methods. The benchmark also indicates in %5 ranges.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made a PR here to refine this documentation: #6306

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another difference is that bytes_iterator() iterates over all array slots, including those that are null

finder.find(haystack.as_bytes()).is_some() != negate
})
}
}
Predicate::StartsWith(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
alamb marked this conversation as resolved.
Show resolved Hide resolved
.prefix_iter(v.len())
.map(|haystack| starts_with(haystack, v, equals_kernel) != negate)
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
starts_with(haystack, v, equals_kernel) != negate
})
}
}
Predicate::IStartsWithAscii(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.prefix_iter(v.len())
.map(|haystack| {
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
})
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
})
}
}
Predicate::EndsWith(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.suffix_iter(v.len())
xinlifoobar marked this conversation as resolved.
Show resolved Hide resolved
.map(|haystack| starts_with(haystack, v, equals_kernel) != negate)
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
ends_with(haystack, v, equals_kernel) != negate
})
}
}
Predicate::IEndsWithAscii(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
BooleanArray::from(
string_view_array
.suffix_iter(v.len())
.map(|haystack| {
starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
})
.collect::<Vec<_>>(),
)
} else {
BooleanArray::from_unary(array, |haystack| {
ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
})
}
}
Predicate::Regex(v) => {
BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate)
}
Expand Down
4 changes: 4 additions & 0 deletions arrow/benches/comparison_kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,10 @@ fn add_benchmark(c: &mut Criterion) {
b.iter(|| bench_like_utf8view_scalar(&string_view_left, "xxxx%"))
});

c.bench_function("like_utf8view scalar starts with more than 4 bytes", |b| {
b.iter(|| bench_like_utf8view_scalar(&string_view_left, "xxxxxx%"))
});

c.bench_function("like_utf8view scalar complex", |b| {
b.iter(|| bench_like_utf8view_scalar(&string_view_left, "%xx_xx%xxx"))
});
Expand Down
Loading