apache · alamb · Aug 25, 2024 · Aug 17, 2024 · Aug 15, 2024 · Aug 17, 2024
diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs
@@ -24,6 +24,7 @@ use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, S
 use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
 use arrow_schema::{ArrowError, DataType};
+use core::str;
 use num::ToPrimitive;
 use std::any::Any;
 use std::fmt::Debug;
@@ -278,6 +279,22 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         T::Native::from_bytes_unchecked(b)
     }
 
+    /// Returns the bytes at index `i`
+    /// # Safety
+    /// Caller is responsible for ensuring that the index is within the bounds of the array
+    pub unsafe fn bytes_unchecked(&self, idx: usize) -> &[u8] {
+        let v = self.views.get_unchecked(idx);
+        let len = *v as u32;
+        if len <= 12 {
+            Self::inline_value(v, len as usize)
+        } else {
+            let view = ByteView::from(*v);
+            let data = self.buffers.get_unchecked(view.buffer_index as usize);
+            let offset = view.offset as usize;
+            data.get_unchecked(offset..offset + len as usize)
+        }
+    }
+
     /// Returns the inline value of the view.
     ///
     /// # Safety
@@ -294,6 +311,21 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         ArrayIter::new(self)
     }
 
+    /// Returns an iterator over the bytes of this array.
+    pub fn bytes_iter(&self) -> impl Iterator<Item = &[u8]> {
+        self.views.iter().map(move |v| {
+            let len = *v as u32;
+            if len <= 12 {
+                unsafe { Self::inline_value(v, len as usize) }
+            } else {
+                let view = ByteView::from(*v);
+                let data = &self.buffers[view.buffer_index as usize];
+                let offset = view.offset as usize;
+                unsafe { data.get_unchecked(offset..offset + len as usize) }
+            }
+        })
+    }
+
     /// Returns a zero-copy slice of this array with the indicated offset and length.
     pub fn slice(&self, offset: usize, length: usize) -> Self {
         Self {
@@ -667,6 +699,58 @@ impl StringViewArray {
             None => true,
         })
     }
+
+    /// Returns an iterator over the prefix bytes of this array with respect to the prefix length.
+    /// If the prefix length is larger than the string length, it will return the empty string.
+    pub fn prefix_iter(&self, prefix_len: usize) -> impl Iterator<Item = &str> {
+        self.views().into_iter().map(move |v| {
+            let len = (*v as u32) as usize;
+
+            if len < prefix_len {
+                return "";
+            }
+
+            let b = if prefix_len <= 4 || len <= 12 {
+                unsafe { StringViewArray::inline_value(v, prefix_len) }
+            } else {
+                let view = ByteView::from(*v);
+                let data = unsafe {
+                    self.data_buffers()
+                        .get_unchecked(view.buffer_index as usize)
+                };
+                let offset = view.offset as usize;
+                unsafe { data.get_unchecked(offset..offset + prefix_len) }
+            };
+
+            unsafe { str::from_utf8_unchecked(b) }
+        })
+    }
+
+    /// Returns an iterator over the suffix bytes of this array with respect to the suffix length.
+    /// If the suffix length is larger than the string length, it will return the empty string.
+    pub fn suffix_iter(&self, suffix_len: usize) -> impl Iterator<Item = &str> {
+        self.views().into_iter().map(move |v| {
+            let len = (*v as u32) as usize;
+
+            if len < suffix_len {
+                return "";
+            }
+
+            let b = if len <= 12 {
+                unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] }
+            } else {
+                let view = ByteView::from(*v);
+                let data = unsafe {
+                    self.data_buffers()
+                        .get_unchecked(view.buffer_index as usize)
+                };
+                let offset = view.offset as usize;
+                unsafe { data.get_unchecked(offset + len - suffix_len..offset + len) }
+            };
+
+            unsafe { str::from_utf8_unchecked(b) }
+        })
+    }
 }
 
 impl From<Vec<&str>> for StringViewArray {

diff --git a/arrow-string/src/predicate.rs b/arrow-string/src/predicate.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow_array::{ArrayAccessor, BooleanArray};
+use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray};
 use arrow_schema::ArrowError;
 use memchr::memchr2;
 use memchr::memmem::Finder;
@@ -114,21 +114,80 @@ impl<'a> Predicate<'a> {
             Predicate::IEqAscii(v) => BooleanArray::from_unary(array, |haystack| {
                 haystack.eq_ignore_ascii_case(v) != negate
             }),
-            Predicate::Contains(finder) => BooleanArray::from_unary(array, |haystack| {
-                finder.find(haystack.as_bytes()).is_some() != negate
-            }),
-            Predicate::StartsWith(v) => BooleanArray::from_unary(array, |haystack| {
-                starts_with(haystack, v, equals_kernel) != negate
-            }),
-            Predicate::IStartsWithAscii(v) => BooleanArray::from_unary(array, |haystack| {
-                starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
-            }),
-            Predicate::EndsWith(v) => BooleanArray::from_unary(array, |haystack| {
-                ends_with(haystack, v, equals_kernel) != negate
-            }),
-            Predicate::IEndsWithAscii(v) => BooleanArray::from_unary(array, |haystack| {
-                ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
-            }),
+            Predicate::Contains(finder) => {
+                if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .bytes_iter()
+                            .map(|haystack| finder.find(haystack).is_some() != negate)
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {
 impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> { 
     type Item = &'a T::Native; 
     fn value(&self, index: usize) -> Self::Item { 
         GenericByteViewArray::value(self, index) 
     } 
     unsafe fn value_unchecked(&self, index: usize) -> Self::Item { 
         GenericByteViewArray::value_unchecked(self, index) 
     } 
 } 
 impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> { 
     type Item = &'a T::Native; 
  
     fn value(&self, index: usize) -> Self::Item { 
         GenericByteViewArray::value(self, index) 
     } 
  
     unsafe fn value_unchecked(&self, index: usize) -> Self::Item { 
         GenericByteViewArray::value_unchecked(self, index) 
     } 
 } 
+                        finder.find(haystack.as_bytes()).is_some() != negate
+                    })
+                }
+            }
+            Predicate::StartsWith(v) => {
+                if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .prefix_iter(v.len())
+                            .map(|haystack| starts_with(haystack, v, equals_kernel) != negate)
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {
+                        starts_with(haystack, v, equals_kernel) != negate
+                    })
+                }
+            }
+            Predicate::IStartsWithAscii(v) => {
+                if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .prefix_iter(v.len())
+                            .map(|haystack| {
+                                starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
+                            })
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {
+                        starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
+                    })
+                }
+            }
+            Predicate::EndsWith(v) => {
+                if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .suffix_iter(v.len())
+                            .map(|haystack| starts_with(haystack, v, equals_kernel) != negate)
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {
+                        ends_with(haystack, v, equals_kernel) != negate
+                    })
+                }
+            }
+            Predicate::IEndsWithAscii(v) => {
+                if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
+                    BooleanArray::from(
+                        string_view_array
+                            .suffix_iter(v.len())
+                            .map(|haystack| {
+                                starts_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
+                            })
+                            .collect::<Vec<_>>(),
+                    )
+                } else {
+                    BooleanArray::from_unary(array, |haystack| {
+                        ends_with(haystack, v, equals_ignore_ascii_case_kernel) != negate
+                    })
+                }
+            }
             Predicate::Regex(v) => {
                 BooleanArray::from_unary(array, |haystack| v.is_match(haystack) != negate)
             }

diff --git a/arrow/benches/comparison_kernels.rs b/arrow/benches/comparison_kernels.rs
@@ -244,6 +244,10 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bench_like_utf8view_scalar(&string_view_left, "xxxx%"))
     });
 
+    c.bench_function("like_utf8view scalar starts with more than 4 bytes", |b| {
+        b.iter(|| bench_like_utf8view_scalar(&string_view_left, "xxxxxx%"))
+    });
+
     c.bench_function("like_utf8view scalar complex", |b| {
         b.iter(|| bench_like_utf8view_scalar(&string_view_left, "%xx_xx%xxx"))
     });