From 894e797966b855cceed5e3a49cbfa67929f24441 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Sat, 17 Aug 2024 22:27:11 +0800 Subject: [PATCH 1/9] v2 impl --- arrow-array/src/array/byte_view_array.rs | 29 +++++ arrow-string/src/predicate.rs | 145 ++++++++++++++++++++--- 2 files changed, 155 insertions(+), 19 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 42f945838a45..037a7c600275 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -261,6 +261,21 @@ impl GenericByteViewArray { unsafe { self.value_unchecked(i) } } + /// Returns the inline view data at index `i` + pub unsafe fn prefix_bytes_unchecked(&self, prefix_len: usize, idx: usize) -> &[u8] { + let v = self.views.get_unchecked(idx); + let len = (*v as u32) as usize; + + if prefix_len <= 4 || (prefix_len <= 12 && len <= 12) { + Self::inline_value(v, prefix_len) + } else { + let view = ByteView::from(*v); + let data = self.buffers.get_unchecked(view.buffer_index as usize); + let offset = view.offset as usize; + data.get_unchecked(offset..offset + prefix_len) + } + } + /// Returns the element at index `i` /// # Safety /// Caller is responsible for ensuring that the index is within the bounds of the array @@ -278,6 +293,20 @@ impl GenericByteViewArray { T::Native::from_bytes_unchecked(b) } + /// Returns the bytes at index `i` + pub unsafe fn bytes_unchecked(&self, idx: usize) -> &[u8] { + let v = self.views.get_unchecked(idx); + let len = *v as u32; + if len <= 12 { + Self::inline_value(v, len as usize) + } else { + let view = ByteView::from(*v); + let data = self.buffers.get_unchecked(view.buffer_index as usize); + let offset = view.offset as usize; + data.get_unchecked(offset..offset + len as usize) + } + } + /// Returns the inline value of the view. /// /// # Safety diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index ec0c4827830c..0e8ab3530142 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::{ArrayAccessor, BooleanArray}; +use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray}; +use arrow_buffer::BooleanBuffer; use arrow_schema::ArrowError; use memchr::memchr2; use memchr::memmem::Finder; @@ -111,24 +112,130 @@ impl<'a> Predicate<'a> { Predicate::Eq(v) => BooleanArray::from_unary(array, |haystack| { (haystack.len() == v.len() && haystack == *v) != negate }), - Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { - haystack.eq_ignore_ascii_case(v) != negate - }), - Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| { - finder.find(haystack.as_bytes()).is_some() != negate - }), - Predicate::StartsWith(v) => BooleanArray::from_unary(array, |haystack| { - starts_with(haystack, v, equals_kernel) != negate - }), - Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, |haystack| { - starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate - }), - Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| { - ends_with(haystack, v, equals_kernel) != negate - }), - Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, |haystack| { - ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate - }), + Predicate::IEqAscii(v) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + let neddle_bytes = v.as_bytes(); + let null_buffer = string_view_array.logical_nulls(); + let boolean_buffer = + BooleanBuffer::collect_bool(string_view_array.len(), |i| { + unsafe { string_view_array.bytes_unchecked(i) } + .eq_ignore_ascii_case(neddle_bytes) + != negate + }); + + BooleanArray::new(boolean_buffer, null_buffer) + } else { + BooleanArray::from_unary(array, |haystack| { + haystack.eq_ignore_ascii_case(v) != negate + }) + } + } + Predicate::Contains(finder) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + let null_buffer = string_view_array.logical_nulls(); + let boolean_buffer = + BooleanBuffer::collect_bool(string_view_array.len(), |i| { + finder + .find(unsafe { string_view_array.bytes_unchecked(i) }) + .is_some() + != negate + }); + + BooleanArray::new(boolean_buffer, null_buffer) + } else { + BooleanArray::from_unary(array, |haystack| { + finder.find(haystack.as_bytes()).is_some() != negate + }) + } + } + Predicate::StartsWith(v) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + let needle_bytes = v.as_bytes(); + let needle_len = needle_bytes.len(); + let null_buffer = string_view_array.logical_nulls(); + let boolean_buffer = + BooleanBuffer::collect_bool(string_view_array.len(), |i| { + zip( + unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) }, + needle_bytes, + ) + .all(equals_kernel) + }); + + BooleanArray::new(boolean_buffer, null_buffer) + } else { + BooleanArray::from_unary(array, |haystack| { + starts_with(haystack, v, equals_kernel) != negate + }) + } + } + Predicate::IStartsWithAscii(v) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + let needle_bytes = v.as_bytes(); + let needle_len = needle_bytes.len(); + let null_buffer = string_view_array.logical_nulls(); + let boolean_buffer = + BooleanBuffer::collect_bool(string_view_array.len(), |i| { + zip( + unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) }, + needle_bytes, + ) + .all(equals_ignore_ascii_case_kernel) + }); + + BooleanArray::new(boolean_buffer, null_buffer) + } else { + BooleanArray::from_unary(array, |haystack| { + starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + }) + } + } + Predicate::EndsWith(v) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + let needle_bytes = v.as_bytes(); + let needle_len = needle_bytes.len(); + let null_buffer = string_view_array.logical_nulls(); + let boolean_buffer = + BooleanBuffer::collect_bool(string_view_array.len(), |i| { + zip( + unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) } + .iter() + .rev(), + needle_bytes.iter().rev(), + ) + .all(equals_kernel) + }); + + BooleanArray::new(boolean_buffer, null_buffer) + } else { + BooleanArray::from_unary(array, |haystack| { + ends_with(haystack, v, equals_kernel) != negate + }) + } + } + Predicate::IEndsWithAscii(v) => { + if let Some(string_view_array) = array.as_any().downcast_ref::() { + let needle_bytes = v.as_bytes(); + let needle_len = needle_bytes.len(); + let null_buffer = string_view_array.logical_nulls(); + let boolean_buffer = + BooleanBuffer::collect_bool(string_view_array.len(), |i| { + zip( + unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) } + .iter() + .rev(), + needle_bytes.iter().rev(), + ) + .all(equals_ignore_ascii_case_kernel) + }); + + BooleanArray::new(boolean_buffer, null_buffer) + } else { + BooleanArray::from_unary(array, |haystack| { + ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + }) + } + } Predicate::Regex(v) => { BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate) } From a80431f7d22328098c54ef36ce1e8890bb65eb36 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Thu, 15 Aug 2024 22:16:05 +0800 Subject: [PATCH 2/9] Add bench --- arrow/benches/comparison_kernels.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs index 5d18a62d13a1..0fce5725154c 100644 --- a/arrow/benches/comparison_kernels.rs +++ b/arrow/benches/comparison_kernels.rs @@ -244,6 +244,10 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_like_utf8view_scalar(&string_view_left, "xxxx%")) }); + c.bench_function("like_utf8view scalar starts with more than 4 bytes", |b| { + b.iter(|| bench_like_utf8view_scalar(&string_view_left, "xxxxxx%")) + }); + c.bench_function("like_utf8view scalar complex", |b| { b.iter(|| bench_like_utf8view_scalar(&string_view_left, "%xx_xx%xxx")) }); From 32d6dc2e1a2e13562bfc4ce59efadeb583b08582 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Sat, 17 Aug 2024 22:45:40 +0800 Subject: [PATCH 3/9] fix clippy --- arrow-array/src/array/byte_view_array.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 037a7c600275..3320d81a83f5 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -261,18 +261,22 @@ impl GenericByteViewArray { unsafe { self.value_unchecked(i) } } - /// Returns the inline view data at index `i` + /// Returns the inline view data at index `i`. + /// If the length of the view is less than `prefix_len`, it will return the length of the view. + /// # Safety + /// Caller is responsible for ensuring that the index is within the bounds of the array and + /// handles the returned slice correctly. Note the returned slice may be shorter than `prefix_len`. pub unsafe fn prefix_bytes_unchecked(&self, prefix_len: usize, idx: usize) -> &[u8] { let v = self.views.get_unchecked(idx); let len = (*v as u32) as usize; if prefix_len <= 4 || (prefix_len <= 12 && len <= 12) { - Self::inline_value(v, prefix_len) + Self::inline_value(v, prefix_len.min(len)) } else { let view = ByteView::from(*v); let data = self.buffers.get_unchecked(view.buffer_index as usize); let offset = view.offset as usize; - data.get_unchecked(offset..offset + prefix_len) + data.get_unchecked(offset..offset + prefix_len.min(len)) } } @@ -294,6 +298,8 @@ impl GenericByteViewArray { } /// Returns the bytes at index `i` + /// # Safety + /// Caller is responsible for ensuring that the index is within the bounds of the array pub unsafe fn bytes_unchecked(&self, idx: usize) -> &[u8] { let v = self.views.get_unchecked(idx); let len = *v as u32; From f6f8f55ba7bd63d6094468734b4a4b2f5a6f2d7c Mon Sep 17 00:00:00 2001 From: Xin Li Date: Sat, 17 Aug 2024 22:59:50 +0800 Subject: [PATCH 4/9] fix endswith --- arrow-string/src/predicate.rs | 59 +++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 0e8ab3530142..f1f8b616429d 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -155,11 +155,14 @@ impl<'a> Predicate<'a> { let null_buffer = string_view_array.logical_nulls(); let boolean_buffer = BooleanBuffer::collect_bool(string_view_array.len(), |i| { - zip( - unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) }, - needle_bytes, - ) - .all(equals_kernel) + let prefix_bytes = + unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) }; + + if prefix_bytes.len() != needle_len { + return negate; + } + + zip(prefix_bytes, needle_bytes).all(equals_kernel) != negate }); BooleanArray::new(boolean_buffer, null_buffer) @@ -176,11 +179,15 @@ impl<'a> Predicate<'a> { let null_buffer = string_view_array.logical_nulls(); let boolean_buffer = BooleanBuffer::collect_bool(string_view_array.len(), |i| { - zip( - unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) }, - needle_bytes, - ) - .all(equals_ignore_ascii_case_kernel) + let prefix_bytes = + unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) }; + + if prefix_bytes.len() != needle_len { + return negate; + } + + zip(prefix_bytes, needle_bytes).all(equals_ignore_ascii_case_kernel) + != negate }); BooleanArray::new(boolean_buffer, null_buffer) @@ -197,13 +204,15 @@ impl<'a> Predicate<'a> { let null_buffer = string_view_array.logical_nulls(); let boolean_buffer = BooleanBuffer::collect_bool(string_view_array.len(), |i| { - zip( - unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) } - .iter() - .rev(), - needle_bytes.iter().rev(), - ) - .all(equals_kernel) + let haystack_bytes = unsafe { string_view_array.bytes_unchecked(i) }; + + if haystack_bytes.len() < needle_len { + return negate; + } + + zip(haystack_bytes.iter().rev(), needle_bytes.iter().rev()) + .all(equals_kernel) + != negate }); BooleanArray::new(boolean_buffer, null_buffer) @@ -220,13 +229,15 @@ impl<'a> Predicate<'a> { let null_buffer = string_view_array.logical_nulls(); let boolean_buffer = BooleanBuffer::collect_bool(string_view_array.len(), |i| { - zip( - unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) } - .iter() - .rev(), - needle_bytes.iter().rev(), - ) - .all(equals_ignore_ascii_case_kernel) + let haystack_bytes = unsafe { string_view_array.bytes_unchecked(i) }; + + if haystack_bytes.len() < needle_len { + return negate; + } + + zip(haystack_bytes.iter().rev(), needle_bytes.iter().rev()) + .all(equals_ignore_ascii_case_kernel) + != negate }); BooleanArray::new(boolean_buffer, null_buffer) From 6bee6871f1d39c3948864fc38cabcb790ce83b03 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Sun, 18 Aug 2024 20:51:31 +0800 Subject: [PATCH 5/9] Finalize the prefix_v2 implementation --- arrow-array/src/array/byte_view_array.rs | 87 +++++++++++---- arrow-string/src/predicate.rs | 135 +++++++---------------- 2 files changed, 106 insertions(+), 116 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 3320d81a83f5..237316e0785c 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -24,6 +24,7 @@ use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, S use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder, ByteView}; use arrow_schema::{ArrowError, DataType}; +use core::str; use num::ToPrimitive; use std::any::Any; use std::fmt::Debug; @@ -261,25 +262,6 @@ impl GenericByteViewArray { unsafe { self.value_unchecked(i) } } - /// Returns the inline view data at index `i`. - /// If the length of the view is less than `prefix_len`, it will return the length of the view. - /// # Safety - /// Caller is responsible for ensuring that the index is within the bounds of the array and - /// handles the returned slice correctly. Note the returned slice may be shorter than `prefix_len`. - pub unsafe fn prefix_bytes_unchecked(&self, prefix_len: usize, idx: usize) -> &[u8] { - let v = self.views.get_unchecked(idx); - let len = (*v as u32) as usize; - - if prefix_len <= 4 || (prefix_len <= 12 && len <= 12) { - Self::inline_value(v, prefix_len.min(len)) - } else { - let view = ByteView::from(*v); - let data = self.buffers.get_unchecked(view.buffer_index as usize); - let offset = view.offset as usize; - data.get_unchecked(offset..offset + prefix_len.min(len)) - } - } - /// Returns the element at index `i` /// # Safety /// Caller is responsible for ensuring that the index is within the bounds of the array @@ -329,6 +311,21 @@ impl GenericByteViewArray { ArrayIter::new(self) } + /// Returns an iterator over the bytes of this array. + pub fn bytes_iter(&self) -> impl Iterator { + self.views.iter().map(move |v| { + let len = *v as u32; + if len <= 12 { + unsafe { Self::inline_value(v, len as usize) } + } else { + let view = ByteView::from(*v); + let data = &self.buffers[view.buffer_index as usize]; + let offset = view.offset as usize; + unsafe { data.get_unchecked(offset..offset + len as usize) } + } + }) + } + /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { Self { @@ -702,6 +699,58 @@ impl StringViewArray { None => true, }) } + + /// Returns an iterator over the prefix bytes of this array with respect to the prefix length. + /// If the prefix length is larger than the string length, it will return the empty string. + pub fn prefix_iter(&self, prefix_len: usize) -> impl Iterator { + self.views().into_iter().map(move |v| { + let len = (*v as u32) as usize; + + if len < prefix_len { + return ""; + } + + let b = if prefix_len <= 4 || len <= 12 { + unsafe { StringViewArray::inline_value(v, prefix_len) } + } else { + let view = ByteView::from(*v); + let data = unsafe { + self.data_buffers() + .get_unchecked(view.buffer_index as usize) + }; + let offset = view.offset as usize; + unsafe { data.get_unchecked(offset..offset + prefix_len) } + }; + + unsafe { str::from_utf8_unchecked(b) } + }) + } + + /// Returns an iterator over the suffix bytes of this array with respect to the suffix length. + /// If the suffix length is larger than the string length, it will return the empty string. + pub fn suffix_iter(&self, suffix_len: usize) -> impl Iterator { + self.views().into_iter().map(move |v| { + let len = (*v as u32) as usize; + + if len < suffix_len { + return ""; + } + + let b = if len <= 12 { + unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] } + } else { + let view = ByteView::from(*v); + let data = unsafe { + self.data_buffers() + .get_unchecked(view.buffer_index as usize) + }; + let offset = view.offset as usize; + unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) } + }; + + unsafe { str::from_utf8_unchecked(b) } + }) + } } impl From> for StringViewArray { diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index f1f8b616429d..4927ecda34b8 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray}; -use arrow_buffer::BooleanBuffer; +use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray}; use arrow_schema::ArrowError; use memchr::memchr2; use memchr::memmem::Finder; @@ -112,36 +111,17 @@ impl<'a> Predicate<'a> { Predicate::Eq(v) => BooleanArray::from_unary(array, |haystack| { (haystack.len() == v.len() && haystack == *v) != negate }), - Predicate::IEqAscii(v) => { - if let Some(string_view_array) = array.as_any().downcast_ref::() { - let neddle_bytes = v.as_bytes(); - let null_buffer = string_view_array.logical_nulls(); - let boolean_buffer = - BooleanBuffer::collect_bool(string_view_array.len(), |i| { - unsafe { string_view_array.bytes_unchecked(i) } - .eq_ignore_ascii_case(neddle_bytes) - != negate - }); - - BooleanArray::new(boolean_buffer, null_buffer) - } else { - BooleanArray::from_unary(array, |haystack| { - haystack.eq_ignore_ascii_case(v) != negate - }) - } - } + Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| { + haystack.eq_ignore_ascii_case(v) != negate + }), Predicate::Contains(finder) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { - let null_buffer = string_view_array.logical_nulls(); - let boolean_buffer = - BooleanBuffer::collect_bool(string_view_array.len(), |i| { - finder - .find(unsafe { string_view_array.bytes_unchecked(i) }) - .is_some() - != negate - }); - - BooleanArray::new(boolean_buffer, null_buffer) + BooleanArray::from( + string_view_array + .bytes_iter() + .map(|haystack| finder.find(haystack).is_some() != negate) + .collect::>(), + ) } else { BooleanArray::from_unary(array, |haystack| { finder.find(haystack.as_bytes()).is_some() != negate @@ -150,22 +130,12 @@ impl<'a> Predicate<'a> { } Predicate::StartsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { - let needle_bytes = v.as_bytes(); - let needle_len = needle_bytes.len(); - let null_buffer = string_view_array.logical_nulls(); - let boolean_buffer = - BooleanBuffer::collect_bool(string_view_array.len(), |i| { - let prefix_bytes = - unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) }; - - if prefix_bytes.len() != needle_len { - return negate; - } - - zip(prefix_bytes, needle_bytes).all(equals_kernel) != negate - }); - - BooleanArray::new(boolean_buffer, null_buffer) + BooleanArray::from( + string_view_array + .prefix_iter(v.len()) + .map(|haystack| starts_with(haystack, v, equals_kernel) != negate) + .collect::>(), + ) } else { BooleanArray::from_unary(array, |haystack| { starts_with(haystack, v, equals_kernel) != negate @@ -174,23 +144,14 @@ impl<'a> Predicate<'a> { } Predicate::IStartsWithAscii(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { - let needle_bytes = v.as_bytes(); - let needle_len = needle_bytes.len(); - let null_buffer = string_view_array.logical_nulls(); - let boolean_buffer = - BooleanBuffer::collect_bool(string_view_array.len(), |i| { - let prefix_bytes = - unsafe { string_view_array.prefix_bytes_unchecked(needle_len, i) }; - - if prefix_bytes.len() != needle_len { - return negate; - } - - zip(prefix_bytes, needle_bytes).all(equals_ignore_ascii_case_kernel) - != negate - }); - - BooleanArray::new(boolean_buffer, null_buffer) + BooleanArray::from( + string_view_array + .prefix_iter(v.len()) + .map(|haystack| { + starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + }) + .collect::>(), + ) } else { BooleanArray::from_unary(array, |haystack| { starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate @@ -199,23 +160,12 @@ impl<'a> Predicate<'a> { } Predicate::EndsWith(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { - let needle_bytes = v.as_bytes(); - let needle_len = needle_bytes.len(); - let null_buffer = string_view_array.logical_nulls(); - let boolean_buffer = - BooleanBuffer::collect_bool(string_view_array.len(), |i| { - let haystack_bytes = unsafe { string_view_array.bytes_unchecked(i) }; - - if haystack_bytes.len() < needle_len { - return negate; - } - - zip(haystack_bytes.iter().rev(), needle_bytes.iter().rev()) - .all(equals_kernel) - != negate - }); - - BooleanArray::new(boolean_buffer, null_buffer) + BooleanArray::from( + string_view_array + .suffix_iter(v.len()) + .map(|haystack| ends_with(haystack, v, equals_kernel) != negate) + .collect::>(), + ) } else { BooleanArray::from_unary(array, |haystack| { ends_with(haystack, v, equals_kernel) != negate @@ -224,23 +174,14 @@ impl<'a> Predicate<'a> { } Predicate::IEndsWithAscii(v) => { if let Some(string_view_array) = array.as_any().downcast_ref::() { - let needle_bytes = v.as_bytes(); - let needle_len = needle_bytes.len(); - let null_buffer = string_view_array.logical_nulls(); - let boolean_buffer = - BooleanBuffer::collect_bool(string_view_array.len(), |i| { - let haystack_bytes = unsafe { string_view_array.bytes_unchecked(i) }; - - if haystack_bytes.len() < needle_len { - return negate; - } - - zip(haystack_bytes.iter().rev(), needle_bytes.iter().rev()) - .all(equals_ignore_ascii_case_kernel) - != negate - }); - - BooleanArray::new(boolean_buffer, null_buffer) + BooleanArray::from( + string_view_array + .suffix_iter(v.len()) + .map(|haystack| { + ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + }) + .collect::>(), + ) } else { BooleanArray::from_unary(array, |haystack| { ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate From 332290512ca1abccbd8812c89275b04079c0353f Mon Sep 17 00:00:00 2001 From: Xin Li Date: Mon, 19 Aug 2024 17:35:23 +0800 Subject: [PATCH 6/9] stop reverse string for ends_with --- arrow-string/src/predicate.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 4927ecda34b8..73a0cce9744f 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -163,7 +163,7 @@ impl<'a> Predicate<'a> { BooleanArray::from( string_view_array .suffix_iter(v.len()) - .map(|haystack| ends_with(haystack, v, equals_kernel) != negate) + .map(|haystack| starts_with(haystack, v, equals_kernel) != negate) .collect::>(), ) } else { @@ -178,7 +178,7 @@ impl<'a> Predicate<'a> { string_view_array .suffix_iter(v.len()) .map(|haystack| { - ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate }) .collect::>(), ) From cd5886fd8d4cf3fe231d281c2bdb805e72c4e585 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 20 Aug 2024 13:49:17 +0800 Subject: [PATCH 7/9] Fix comments --- arrow-array/src/array/byte_view_array.rs | 116 ++++++++++------------- arrow-string/src/like.rs | 21 ++++ arrow-string/src/predicate.rs | 44 ++++++--- 3 files changed, 101 insertions(+), 80 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 237316e0785c..aa512036a28d 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -279,22 +279,6 @@ impl GenericByteViewArray { T::Native::from_bytes_unchecked(b) } - /// Returns the bytes at index `i` - /// # Safety - /// Caller is responsible for ensuring that the index is within the bounds of the array - pub unsafe fn bytes_unchecked(&self, idx: usize) -> &[u8] { - let v = self.views.get_unchecked(idx); - let len = *v as u32; - if len <= 12 { - Self::inline_value(v, len as usize) - } else { - let view = ByteView::from(*v); - let data = self.buffers.get_unchecked(view.buffer_index as usize); - let offset = view.offset as usize; - data.get_unchecked(offset..offset + len as usize) - } - } - /// Returns the inline value of the view. /// /// # Safety @@ -326,6 +310,54 @@ impl GenericByteViewArray { }) } + /// Returns an iterator over the prefix bytes of this array with respect to the prefix length. + /// If the prefix length is larger than the string length, it will return the empty string. + pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator { + self.views().into_iter().map(move |v| { + let len = (*v as u32) as usize; + + if len < prefix_len { + return &[] as &[u8]; + } + + if prefix_len <= 4 || len <= 12 { + unsafe { StringViewArray::inline_value(v, prefix_len) } + } else { + let view = ByteView::from(*v); + let data = unsafe { + self.data_buffers() + .get_unchecked(view.buffer_index as usize) + }; + let offset = view.offset as usize; + unsafe { data.get_unchecked(offset..offset + prefix_len) } + } + }) + } + + /// Returns an iterator over the suffix bytes of this array with respect to the suffix length. + /// If the suffix length is larger than the string length, it will return the empty string. + pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator { + self.views().into_iter().map(move |v| { + let len = (*v as u32) as usize; + + if len < suffix_len { + return &[] as &[u8]; + } + + if len <= 12 { + unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] } + } else { + let view = ByteView::from(*v); + let data = unsafe { + self.data_buffers() + .get_unchecked(view.buffer_index as usize) + }; + let offset = view.offset as usize; + unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) } + } + }) + } + /// Returns a zero-copy slice of this array with the indicated offset and length. pub fn slice(&self, offset: usize, length: usize) -> Self { Self { @@ -699,58 +731,6 @@ impl StringViewArray { None => true, }) } - - /// Returns an iterator over the prefix bytes of this array with respect to the prefix length. - /// If the prefix length is larger than the string length, it will return the empty string. - pub fn prefix_iter(&self, prefix_len: usize) -> impl Iterator { - self.views().into_iter().map(move |v| { - let len = (*v as u32) as usize; - - if len < prefix_len { - return ""; - } - - let b = if prefix_len <= 4 || len <= 12 { - unsafe { StringViewArray::inline_value(v, prefix_len) } - } else { - let view = ByteView::from(*v); - let data = unsafe { - self.data_buffers() - .get_unchecked(view.buffer_index as usize) - }; - let offset = view.offset as usize; - unsafe { data.get_unchecked(offset..offset + prefix_len) } - }; - - unsafe { str::from_utf8_unchecked(b) } - }) - } - - /// Returns an iterator over the suffix bytes of this array with respect to the suffix length. - /// If the suffix length is larger than the string length, it will return the empty string. - pub fn suffix_iter(&self, suffix_len: usize) -> impl Iterator { - self.views().into_iter().map(move |v| { - let len = (*v as u32) as usize; - - if len < suffix_len { - return ""; - } - - let b = if len <= 12 { - unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] } - } else { - let view = ByteView::from(*v); - let data = unsafe { - self.data_buffers() - .get_unchecked(view.buffer_index as usize) - }; - let offset = view.offset as usize; - unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) } - }; - - unsafe { str::from_utf8_unchecked(b) } - }) - } } impl From> for StringViewArray { diff --git a/arrow-string/src/like.rs b/arrow-string/src/like.rs index e878e241853a..4626be1362e9 100644 --- a/arrow-string/src/like.rs +++ b/arrow-string/src/like.rs @@ -989,6 +989,27 @@ mod tests { vec![false, true, true, false, false, false, false, true, true, true, true] ); + // 😈 is four bytes long. + test_utf8_scalar!( + test_uff8_array_like_multibyte, + vec![ + "sdlkdfFooßsdfs", + "sdlkdfFooSSdggs", + "sdlkdfFoosssdsd", + "FooS", + "Foos", + "ffooSS", + "ffooß", + "😃sadlksffofsSsh😈klF", + "😱slgffoesSsh😈klF", + "FFKoSS", + "longer than 12 bytes FFKoSS", + ], + "%Ssh😈klF", + like, + vec![false, false, false, false, false, false, false, true, true, false, false] + ); + test_utf8_scalar!( test_utf8_array_ilike_scalar_one, vec![ diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 73a0cce9744f..91a4e187541b 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -132,8 +132,10 @@ impl<'a> Predicate<'a> { if let Some(string_view_array) = array.as_any().downcast_ref::() { BooleanArray::from( string_view_array - .prefix_iter(v.len()) - .map(|haystack| starts_with(haystack, v, equals_kernel) != negate) + .prefix_bytes_iter(v.len()) + .map(|haystack| { + starts_with_bytes(haystack, v.as_bytes(), equals_kernel) != negate + }) .collect::>(), ) } else { @@ -146,9 +148,13 @@ impl<'a> Predicate<'a> { if let Some(string_view_array) = array.as_any().downcast_ref::() { BooleanArray::from( string_view_array - .prefix_iter(v.len()) + .prefix_bytes_iter(v.len()) .map(|haystack| { - starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + starts_with_bytes( + haystack, + v.as_bytes(), + equals_ignore_ascii_case_kernel, + ) != negate }) .collect::>(), ) @@ -162,8 +168,10 @@ impl<'a> Predicate<'a> { if let Some(string_view_array) = array.as_any().downcast_ref::() { BooleanArray::from( string_view_array - .suffix_iter(v.len()) - .map(|haystack| starts_with(haystack, v, equals_kernel) != negate) + .suffix_bytes_iter(v.len()) + .map(|haystack| { + starts_with_bytes(haystack, v.as_bytes(), equals_kernel) != negate + }) .collect::>(), ) } else { @@ -176,9 +184,13 @@ impl<'a> Predicate<'a> { if let Some(string_view_array) = array.as_any().downcast_ref::() { BooleanArray::from( string_view_array - .suffix_iter(v.len()) + .suffix_bytes_iter(v.len()) .map(|haystack| { - starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate + starts_with_bytes( + haystack, + v.as_bytes(), + equals_ignore_ascii_case_kernel, + ) != negate }) .collect::>(), ) @@ -195,16 +207,24 @@ impl<'a> Predicate<'a> { } } -/// This is faster than `str::starts_with` for small strings. -/// See for more details. -fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { +fn starts_with_bytes( + haystack: &[u8], + needle: &[u8], + byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, +) -> bool { if needle.len() > haystack.len() { false } else { - zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel) + zip(haystack, needle).all(byte_eq_kernel) } } +/// This is faster than `str::starts_with` for small strings. +/// See for more details. +fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { + starts_with_bytes(haystack.as_bytes(), needle.as_bytes(), byte_eq_kernel) +} + /// This is faster than `str::ends_with` for small strings. /// See for more details. fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { From 7eb3c8323071245d77296f09971536ede4684cbc Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 20 Aug 2024 13:55:54 +0800 Subject: [PATCH 8/9] fix bad comment --- arrow-array/src/array/byte_view_array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index aa512036a28d..69bd8f01e942 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -311,7 +311,7 @@ impl GenericByteViewArray { } /// Returns an iterator over the prefix bytes of this array with respect to the prefix length. - /// If the prefix length is larger than the string length, it will return the empty string. + /// If the prefix length is larger than the string length, it will return the empty slice. pub fn prefix_bytes_iter(&self, prefix_len: usize) -> impl Iterator { self.views().into_iter().map(move |v| { let len = (*v as u32) as usize; @@ -335,7 +335,7 @@ impl GenericByteViewArray { } /// Returns an iterator over the suffix bytes of this array with respect to the suffix length. - /// If the suffix length is larger than the string length, it will return the empty string. + /// If the suffix length is larger than the string length, it will return the empty slice. pub fn suffix_bytes_iter(&self, suffix_len: usize) -> impl Iterator { self.views().into_iter().map(move |v| { let len = (*v as u32) as usize; From 4a27f96ecce8987ec0b203cad52ef97139154299 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Wed, 21 Aug 2024 15:13:38 +0800 Subject: [PATCH 9/9] Correct equals sematics --- arrow-string/src/predicate.rs | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs index 91a4e187541b..f559088e6c96 100644 --- a/arrow-string/src/predicate.rs +++ b/arrow-string/src/predicate.rs @@ -134,7 +134,7 @@ impl<'a> Predicate<'a> { string_view_array .prefix_bytes_iter(v.len()) .map(|haystack| { - starts_with_bytes(haystack, v.as_bytes(), equals_kernel) != negate + equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate }) .collect::>(), ) @@ -150,7 +150,7 @@ impl<'a> Predicate<'a> { string_view_array .prefix_bytes_iter(v.len()) .map(|haystack| { - starts_with_bytes( + equals_bytes( haystack, v.as_bytes(), equals_ignore_ascii_case_kernel, @@ -170,7 +170,7 @@ impl<'a> Predicate<'a> { string_view_array .suffix_bytes_iter(v.len()) .map(|haystack| { - starts_with_bytes(haystack, v.as_bytes(), equals_kernel) != negate + equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate }) .collect::>(), ) @@ -186,7 +186,7 @@ impl<'a> Predicate<'a> { string_view_array .suffix_bytes_iter(v.len()) .map(|haystack| { - starts_with_bytes( + equals_bytes( haystack, v.as_bytes(), equals_ignore_ascii_case_kernel, @@ -207,24 +207,19 @@ impl<'a> Predicate<'a> { } } -fn starts_with_bytes( - haystack: &[u8], - needle: &[u8], - byte_eq_kernel: impl Fn((&u8, &u8)) -> bool, -) -> bool { - if needle.len() > haystack.len() { - false - } else { - zip(haystack, needle).all(byte_eq_kernel) - } +fn equals_bytes(lhs: &[u8], rhs: &[u8], byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { + lhs.len() == rhs.len() && zip(lhs, rhs).all(byte_eq_kernel) } /// This is faster than `str::starts_with` for small strings. /// See for more details. fn starts_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool { - starts_with_bytes(haystack.as_bytes(), needle.as_bytes(), byte_eq_kernel) + if needle.len() > haystack.len() { + false + } else { + zip(haystack.as_bytes(), needle.as_bytes()).all(byte_eq_kernel) + } } - /// This is faster than `str::ends_with` for small strings. /// See for more details. fn ends_with(haystack: &str, needle: &str, byte_eq_kernel: impl Fn((&u8, &u8)) -> bool) -> bool {