Rephrase doc comment #6421

Merged
merged 6 commits into from Sep 19, 2024
Changes from 3 commits
17 changes: 11 additions & 6 deletions arrow-arith/src/arity.rs
@@ -329,8 +329,10 @@ where
Ok(Ok(PrimitiveArray::<T>::from(array_data)))
}

/// Applies the provided fallible binary operation across `a` and `b`, returning any error,
/// and collecting the results into a [`PrimitiveArray`]. If any index is null in either `a`
/// Applies the provided fallible binary operation across `a` and `b`.
///
/// This will return any error encountered, otherwise collecting the results into
/// a [`PrimitiveArray`]. If any index is null in either `a`
/// or `b`, the corresponding index in the result will also be null
///
/// Like [`try_unary`] the function is only evaluated for non-null indices
@@ -381,12 +383,15 @@
}

/// Applies the provided fallible binary operation across `a` and `b` by mutating the mutable
/// [`PrimitiveArray`] `a` with the results, returning any error. If any index is null in
/// either `a` or `b`, the corresponding index in the result will also be null
/// [`PrimitiveArray`] `a` with the results.
///
/// Like [`try_unary`] the function is only evaluated for non-null indices
/// This will return any error encountered, otherwise collecting the results into a [`PrimitiveArray`] as the return
/// value. If any index is null in either `a` or `b`, the corresponding index in the result will
/// also be null.
///
/// Like [`try_unary`] the function is only evaluated for non-null indices.
///
/// See [`binary_mut`] for errors and buffer reuse information
/// See [`binary_mut`] for errors and buffer reuse information.
pub fn try_binary_mut<T, F>(
a: PrimitiveArray<T>,
b: &PrimitiveArray<T>,
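For readers skimming this diff, here is a minimal sketch of the `try_binary` behavior the rephrased comment describes, assuming the public `arrow-arith`/`arrow-array` APIs documented in this file (the array values and overflow example are illustrative):

```rust
use arrow_arith::arity::try_binary;
use arrow_array::{types::Int32Type, Int32Array, PrimitiveArray};
use arrow_schema::ArrowError;

fn checked_add_demo() {
    let a = Int32Array::from(vec![Some(1), None, Some(i32::MAX)]);
    let b = Int32Array::from(vec![Some(10), Some(20), Some(1)]);
    // The op runs only where both inputs are non-null; index 1 stays null.
    // The overflow at index 2 surfaces as an Err from try_binary itself.
    let result: Result<PrimitiveArray<Int32Type>, ArrowError> = try_binary(&a, &b, |x, y| {
        x.checked_add(y)
            .ok_or_else(|| ArrowError::ComputeError("overflow".to_string()))
    });
    assert!(result.is_err());
}
```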
8 changes: 6 additions & 2 deletions arrow-arith/src/temporal.rs
@@ -666,6 +666,7 @@ impl<T: Datelike> ChronoDateExt for T {

/// Parse the given string into a string representing a fixed offset that is correct as of the given
/// UTC NaiveDateTime.
///
/// Note that the offset is a function of time and can vary depending on whether daylight savings is
/// in effect or not. e.g. Australia/Sydney is +10:00 or +11:00 depending on DST.
#[deprecated(note = "Use arrow_array::timezone::Tz instead")]
@@ -811,6 +812,7 @@
}

/// Extracts the day of a given temporal array as an array of integers.
///
/// If the given array isn't a temporal primitive or dictionary array,
/// an `Err` will be returned.
#[deprecated(since = "51.0.0", note = "Use `date_part` instead")]
@@ -828,7 +830,8 @@
date_part_primitive(array, DatePart::Day)
}

/// Extracts the day of year of a given temporal array as an array of integers
/// Extracts the day of year of a given temporal array as an array of integers.
///
/// The day of year ranges from 1 to 366.
/// If the given array isn't a temporal primitive or dictionary array,
/// an `Err` will be returned.
@@ -837,7 +840,8 @@ pub fn doy_dyn(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
date_part(array, DatePart::DayOfYear)
}

/// Extracts the day of year of a given temporal primitive array as an array of integers
/// Extracts the day of year of a given temporal primitive array as an array of integers.
///
/// The day of year ranges from 1 to 366.
#[deprecated(since = "51.0.0", note = "Use `date_part` instead")]
pub fn doy<T>(array: &PrimitiveArray<T>) -> Result<Int32Array, ArrowError>
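Since several functions in this file are deprecated in favor of `date_part`, a short sketch of the replacement call (dates and expected values are illustrative):

```rust
use arrow_arith::temporal::{date_part, DatePart};
use arrow_array::{Array, Date32Array, Int32Array};
use arrow_schema::ArrowError;

fn day_of_year_demo() -> Result<(), ArrowError> {
    // 1970-01-01 and 1970-02-01, as days since the UNIX epoch
    let dates = Date32Array::from(vec![0, 31]);
    let doy = date_part(&dates, DatePart::DayOfYear)?;
    let doy = doy.as_any().downcast_ref::<Int32Array>().unwrap();
    assert_eq!(doy.values(), &[1, 32]);
    Ok(())
}
```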
3 changes: 2 additions & 1 deletion arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -512,7 +512,8 @@ fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {
u128::from_le_bytes(view_buffer)
}

/// Create a view based on the given data, block id and offset
/// Create a view based on the given data, block id and offset.
///
/// Note that the code below has been carefully checked against the generated x86_64 assembly: <https://godbolt.org/z/685YPsd5G>
/// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes a function call
/// (i.e., is not inlined) and slows things down.
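For orientation, a hedged sketch of the 16-byte view layout the comment above refers to; `make_inlined_view_sketch` is an illustrative reimplementation based on the Arrow view spec, not the crate's private function, and it makes no attempt at the codegen tricks the real code is tuned for:

```rust
/// Pack a short byte slice (LEN <= 12) into a view u128:
/// bytes 0..4 hold the length (little-endian), bytes 4..4+LEN hold the
/// data inline, and the remainder is zero. Longer values instead store a
/// 4-byte prefix plus a (block id, offset) pair pointing into a buffer.
fn make_inlined_view_sketch<const LEN: usize>(data: &[u8]) -> u128 {
    debug_assert!(LEN <= 12);
    let mut view_buffer = [0u8; 16];
    view_buffer[0..4].copy_from_slice(&(LEN as u32).to_le_bytes());
    view_buffer[4..4 + LEN].copy_from_slice(&data[..LEN]);
    u128::from_le_bytes(view_buffer)
}
```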
1 change: 1 addition & 0 deletions arrow-array/src/ffi_stream.rs
@@ -275,6 +275,7 @@ fn get_error_code(err: &ArrowError) -> i32 {
}

/// A `RecordBatchReader` which imports Arrays from `FFI_ArrowArrayStream`.
///
/// Struct used to fetch `RecordBatch` from the C Stream Interface.
/// Its main responsibility is to expose `RecordBatchReader` functionality
/// that requires [FFI_ArrowArrayStream].
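A sketch of the consumer side the comment describes, assuming `ArrowArrayStreamReader::try_new` and the reader's `Iterator`/`RecordBatchReader` impls from this module (how the `FFI_ArrowArrayStream` was produced is out of scope here):

```rust
use arrow_array::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream};
use arrow_array::RecordBatchReader;
use arrow_schema::ArrowError;

fn consume(stream: FFI_ArrowArrayStream) -> Result<usize, ArrowError> {
    // Wrap the C stream; the reader exposes the schema up front...
    let reader = ArrowArrayStreamReader::try_new(stream)?;
    let _schema = reader.schema();
    // ...and then yields RecordBatches as a fallible iterator.
    let mut rows = 0;
    for batch in reader {
        rows += batch?.num_rows();
    }
    Ok(rows)
}
```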
4 changes: 3 additions & 1 deletion arrow-array/src/types.rs
@@ -50,7 +50,9 @@ impl BooleanType {
pub const DATA_TYPE: DataType = DataType::Boolean;
}

/// Trait for [primitive values], bridging the dynamic-typed nature of Arrow
/// Trait for [primitive values]
///
/// This trait bridges the dynamic-typed nature of Arrow
/// (via [`DataType`]) with the static-typed nature of rust types
/// ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`].
///
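A small sketch of the dynamic/static bridge the rephrased doc describes, using the standard `arrow-array` types:

```rust
use arrow_array::types::Int32Type;
use arrow_array::{ArrowPrimitiveType, PrimitiveArray};
use arrow_schema::DataType;

fn bridge_demo() {
    // The static Rust native type (i32) and the dynamic Arrow type
    // (DataType::Int32) meet in the ArrowPrimitiveType impl for Int32Type.
    assert_eq!(Int32Type::DATA_TYPE, DataType::Int32);
    let array: PrimitiveArray<Int32Type> = PrimitiveArray::from(vec![1i32, 2, 3]);
    assert_eq!(array.value(1), 2);
}
```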
1 change: 1 addition & 0 deletions arrow-buffer/src/builder/null.rs
@@ -18,6 +18,7 @@
use crate::{BooleanBufferBuilder, MutableBuffer, NullBuffer};

/// Builder for creating the null bit buffer.
///
/// This builder only materializes the buffer when we append `false`.
/// If you only append `true`s to the builder, what you get will be
/// `None` when calling [`finish`](#method.finish).
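To make the materialization behavior concrete, a minimal sketch against the `NullBufferBuilder` API as exposed by `arrow-buffer`:

```rust
use arrow_buffer::NullBufferBuilder;

fn null_builder_demo() {
    // Appending only valid (`true`) entries never materializes a buffer...
    let mut all_valid = NullBufferBuilder::new(2);
    all_valid.append_non_null();
    all_valid.append_non_null();
    assert!(all_valid.finish().is_none());

    // ...while a single null forces the bitmap to materialize.
    let mut with_null = NullBufferBuilder::new(2);
    with_null.append_non_null();
    with_null.append_null();
    assert_eq!(with_null.finish().map(|b| b.null_count()), Some(1));
}
```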
6 changes: 4 additions & 2 deletions arrow-buffer/src/util/bit_mask.rs
@@ -20,8 +20,10 @@
use crate::bit_chunk_iterator::BitChunks;
use crate::bit_util::{ceil, get_bit, set_bit};

/// Sets all bits on `write_data` in the range `[offset_write..offset_write+len]` to be equal to the
/// bits in `data` in the range `[offset_read..offset_read+len]`
/// Utility function to set bits in a slice of bytes.
///
/// This sets all bits on `write_data` in the range `[offset_write..offset_write+len]`
/// to be equal to the bits in `data` in the range `[offset_read..offset_read+len]`,
/// and returns the number of `0` bits in `data[offset_read..offset_read+len]`.
pub fn set_bits(
write_data: &mut [u8],
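A worked example of `set_bits`, assuming the parameter order shown in this file (`write_data`, `data`, `offset_write`, `offset_read`, `len`) and that the function is reachable at `arrow_buffer::bit_mask` (path assumed):

```rust
use arrow_buffer::bit_mask::set_bits; // path assumed

fn set_bits_demo() {
    // Bits are LSB-first: data reads 0,1,0,1,0,1,0,1 from bit 0 upward.
    let data = [0b1010_1010u8];
    let mut write_data = [0u8; 1];
    // Copy 4 bits from read offset 2 into write offset 0.
    let zeros = set_bits(&mut write_data, &data, 0, 2, 4);
    // Bits 2..6 of `data` are 0,1,0,1: two zero bits were copied.
    assert_eq!(zeros, 2);
    assert_eq!(write_data[0], 0b0000_1010);
}
```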
8 changes: 5 additions & 3 deletions arrow-data/src/data.rs
@@ -161,9 +161,11 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
}
}

/// A generic representation of Arrow array data which encapsulates common attributes and
/// operations for Arrow array. Specific operations for different arrays types (e.g.,
/// primitive, list, struct) are implemented in `Array`.
/// A generic representation of Arrow array data which encapsulates common attributes
/// and operations for Arrow arrays.
///
/// Specific operations for different array types (e.g., primitive, list, struct)
/// are implemented in `Array`.
///
/// # Memory Layout
///
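A minimal sketch of constructing `ArrayData` directly via its builder (standard `arrow-data` API; values are illustrative):

```rust
use arrow_buffer::Buffer;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};

fn build_array_data() -> Result<ArrayData, ArrowError> {
    // Common attributes (type, length, buffers, nulls) live in ArrayData;
    // typed behavior is layered on top by the Array implementations.
    let values = Buffer::from_slice_ref([1i32, 2, 3]);
    ArrayData::builder(DataType::Int32)
        .len(3)
        .add_buffer(values)
        .build() // validates the buffers against DataType::Int32
}
```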
3 changes: 1 addition & 2 deletions arrow-flight/src/lib.rs
@@ -50,8 +50,7 @@ use std::{fmt, ops::Deref};

type ArrowResult<T> = std::result::Result<T, ArrowError>;

#[allow(clippy::derive_partial_eq_without_eq)]

#[allow(clippy::all)]
mod gen {
include!("arrow.flight.protocol.rs");
}
1 change: 1 addition & 0 deletions arrow-flight/src/sql/server.rs
@@ -979,6 +979,7 @@ fn arrow_error_to_status(err: arrow_schema::ArrowError) -> Status {

/// A wrapper around [`Streaming<FlightData>`] that allows "peeking" at the
/// message at the front of the stream without consuming it.
///
/// This is needed because sometimes the first message in the stream will contain
/// a [`FlightDescriptor`] in addition to potentially any data, and the dispatch logic
/// must inspect this information.
4 changes: 3 additions & 1 deletion arrow-ipc/src/writer.rs
@@ -710,7 +710,9 @@ fn into_zero_offset_run_array<R: RunEndIndexType>(
}

/// Keeps track of dictionaries that have been written, to avoid emitting the same dictionary
/// multiple times. Can optionally error if an update to an existing dictionary is attempted, which
/// multiple times.
///
/// Can optionally error if an update to an existing dictionary is attempted, which
/// isn't allowed in the `FileWriter`.
pub struct DictionaryTracker {
written: HashMap<i64, ArrayData>,
6 changes: 4 additions & 2 deletions arrow-select/src/filter.rs
@@ -42,8 +42,9 @@ use arrow_schema::*;
const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8;

/// An iterator of `(usize, usize)` each representing an interval
/// `[start, end)` whose slots of a bitmap [Buffer] are true. Each
/// interval corresponds to a contiguous region of memory to be
/// `[start, end)` whose slots of a bitmap [Buffer] are true.
///
/// Each interval corresponds to a contiguous region of memory to be
/// "taken" from an array to be filtered.
///
/// ## Notes:
@@ -117,6 +118,7 @@ fn filter_count(filter: &BooleanArray) -> usize {
pub type Filter<'a> = Box<dyn Fn(&ArrayData) -> ArrayData + 'a>;

/// Returns a prepared function optimized to filter multiple arrays.
///
/// Creating this function requires time, but using it is faster than [filter] when the
/// same filter needs to be applied to multiple arrays (e.g. a multi-column `RecordBatch`).
/// WARNING: the nulls of `filter` are ignored and the value on its slot is considered.
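A sketch of the prepared-filter flow documented here, assuming the `build_filter` API from this file (column contents are illustrative):

```rust
use arrow_array::{Array, BooleanArray, Int32Array};
use arrow_schema::ArrowError;
use arrow_select::filter::build_filter;

fn filter_two_columns(col_a: &Int32Array, col_b: &Int32Array) -> Result<(), ArrowError> {
    let predicate = BooleanArray::from(vec![true, false, true]);
    // Pay the preparation cost once...
    let prepared = build_filter(&predicate)?;
    // ...then apply the prepared closure to each column's ArrayData.
    let a = Int32Array::from(prepared(&col_a.to_data()));
    let b = Int32Array::from(prepared(&col_b.to_data()));
    assert_eq!(a.len(), b.len());
    Ok(())
}
```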
1 change: 1 addition & 0 deletions arrow-string/src/length.rs
@@ -45,6 +45,7 @@ fn bit_length_impl<P: ArrowPrimitiveType>(
}

/// Returns an array of Int32/Int64 denoting the length of each value in the array.
///
/// For list array, length is the number of elements in each list.
/// For string array and binary array, length is the number of bytes of each value.
///
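For the `length` kernel described above, a short usage sketch (string values are illustrative):

```rust
use arrow_array::{cast::AsArray, types::Int32Type, Array, StringArray};
use arrow_schema::ArrowError;
use arrow_string::length::length;

fn string_lengths_demo() -> Result<(), ArrowError> {
    let strings = StringArray::from(vec![Some("foo"), None, Some("hello")]);
    // For a StringArray the result is an Int32 array of byte lengths,
    // with nulls propagated from the input.
    let lens = length(&strings)?;
    let lens = lens.as_primitive::<Int32Type>();
    assert_eq!(lens.value(0), 3);
    assert!(lens.is_null(1));
    assert_eq!(lens.value(2), 5);
    Ok(())
}
```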
7 changes: 5 additions & 2 deletions arrow/src/pyarrow.rs
@@ -18,6 +18,7 @@
//! Pass Arrow objects from and to PyArrow, using Arrow's
//! [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
//! and [pyo3](https://docs.rs/pyo3/latest/pyo3/).
//!
//! For underlying implementation, see the [ffi] module.
//!
//! One can use these to write Python functions that take and return PyArrow
@@ -472,8 +473,10 @@ impl IntoPyArrow for ArrowArrayStreamReader {
}
}

/// A newtype wrapper. When wrapped around a type `T: FromPyArrow`, it
/// implements `FromPyObject` for the PyArrow objects. When wrapped around a
/// A newtype wrapper for types that implement [`FromPyArrow`] or [`IntoPyArrow`].
///
/// When wrapped around a type `T: FromPyArrow`, it
/// implements [`FromPyObject`] for the PyArrow objects. When wrapped around a
/// `T: IntoPyArrow`, it implements `IntoPy<PyObject>` for the wrapped type.
#[derive(Debug)]
pub struct PyArrowType<T>(pub T);
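A sketch of the wrapper in use with pyo3 (assuming a pyo3 version compatible with the crate; the function itself is illustrative):

```rust
use arrow::pyarrow::PyArrowType;
use arrow_data::ArrayData;
use pyo3::prelude::*;

/// Accepts a pyarrow array and returns it unchanged: `FromPyArrow` converts
/// the Python object on the way in, `IntoPyArrow` converts it back on the
/// way out.
#[pyfunction]
fn roundtrip(array: PyArrowType<ArrayData>) -> PyResult<PyArrowType<ArrayData>> {
    Ok(PyArrowType(array.0))
}
```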
4 changes: 3 additions & 1 deletion arrow/src/util/bench_util.rs
@@ -108,7 +108,9 @@ where
.collect()
}

/// Creates a random (but fixed-seeded) string array of a given size and null density, strings have a random length
/// Creates a random (but fixed-seeded) string array of a given size and null density.
///
/// Strings have a random length
/// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths,
/// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex.
pub fn create_string_array<Offset: OffsetSizeTrait>(
2 changes: 2 additions & 0 deletions parquet/src/basic.rs
@@ -39,6 +39,7 @@ pub use crate::format::{
// Mirrors `parquet::Type`

/// Types supported by Parquet.
///
/// These physical types are intended to be used in combination with the encodings to
/// control the on disk storage format.
/// For example INT16 is not included as a type since a good encoding of INT32
@@ -60,6 +61,7 @@ pub enum Type {
// Mirrors `parquet::ConvertedType`

/// Common types (converted types) used by frameworks when using Parquet.
///
/// This helps map between types in those frameworks to the base types in Parquet.
/// This is only metadata and not needed to read or write the data.
///
7 changes: 4 additions & 3 deletions parquet/src/bloom_filter/mod.rs
@@ -181,9 +181,10 @@ impl std::ops::IndexMut<usize> for Block {
}
}

/// A split block Bloom filter. The creation of this structure is based on the
/// [`crate::file::properties::BloomFilterProperties`] struct set via [`crate::file::properties::WriterProperties`] and
/// is thus hidden by default.
/// A split block Bloom filter.
///
/// The creation of this structure is based on the [`crate::file::properties::BloomFilterProperties`]
/// struct set via [`crate::file::properties::WriterProperties`] and is thus hidden by default.
#[derive(Debug, Clone)]
pub struct Sbbf(Vec<Block>);

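Since `Sbbf` construction is hidden behind the writer properties, a sketch of the user-facing side (property setters from the parquet crate's `WriterProperties` builder):

```rust
use parquet::file::properties::WriterProperties;

// Sbbf instances are created internally when bloom filters are enabled via
// the writer properties; user code only toggles the settings.
fn props_with_bloom_filter() -> WriterProperties {
    WriterProperties::builder()
        .set_bloom_filter_enabled(true)
        .set_bloom_filter_fpp(0.01) // target false-positive probability
        .build()
}
```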
4 changes: 4 additions & 0 deletions parquet/src/encodings/decoding.rs
@@ -273,6 +273,7 @@ pub struct PlainDecoderDetails {
}

/// Plain decoding that supports all types.
///
/// Values are encoded back to back. For native types, data is encoded as little endian.
/// Floating point types are encoded in IEEE.
/// See [`PlainEncoder`](crate::encoding::PlainEncoder) for more information.
@@ -333,6 +334,7 @@ impl<T: DataType> Decoder<T> for PlainDecoder<T> {
// RLE_DICTIONARY/PLAIN_DICTIONARY Decoding

/// Dictionary decoder.
///
/// The dictionary encoding builds a dictionary of values encountered in a given column.
/// The dictionary is stored in a dictionary page per column chunk.
/// See [`DictEncoder`](crate::encoding::DictEncoder) for more information.
@@ -824,6 +826,7 @@ where
// DELTA_LENGTH_BYTE_ARRAY Decoding

/// Delta length byte array decoder.
///
/// Only applied to byte arrays to separate the length values and the data, the lengths
/// are encoded using DELTA_BINARY_PACKED encoding.
/// See [`DeltaLengthByteArrayEncoder`](crate::encoding::DeltaLengthByteArrayEncoder)
@@ -952,6 +955,7 @@ impl<T: DataType> Decoder<T> for DeltaLengthByteArrayDecoder<T> {
// DELTA_BYTE_ARRAY Decoding

/// Delta byte array decoder.
///
/// Prefix lengths are encoded using `DELTA_BINARY_PACKED` encoding, Suffixes are stored
/// using `DELTA_LENGTH_BYTE_ARRAY` encoding.
/// See [`DeltaByteArrayEncoder`](crate::encoding::DeltaByteArrayEncoder) for more
4 changes: 2 additions & 2 deletions parquet/src/lib.rs
@@ -116,8 +116,8 @@ pub mod basic;
///
/// [parquet.thrift]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
// see parquet/CONTRIBUTING.md for instructions on regenerating
#[allow(clippy::derivable_impls, clippy::match_single_binding)]
// Don't try and format auto generated code
// Don't run clippy or rustfmt on auto-generated code
#[allow(clippy::all)]
#[rustfmt::skip]
pub mod format;

Expand Down
3 changes: 2 additions & 1 deletion parquet/src/record/record_reader.rs
@@ -18,7 +18,8 @@
use super::super::errors::ParquetError;
use super::super::file::reader::RowGroupReader;

/// read up to `max_records` records from `row_group_reader` into `self`
/// Read up to `max_records` records from `row_group_reader` into `self`.
///
/// The type parameter `T` is used to work around the rust orphan rule
/// when implementing on types such as `Vec<T>`.
pub trait RecordReader<T> {
13 changes: 10 additions & 3 deletions parquet/src/record/record_writer.rs
@@ -20,16 +20,23 @@ use crate::schema::types::TypePtr;
use super::super::errors::ParquetError;
use super::super::file::writer::SerializedRowGroupWriter;

/// `write_to_row_group` writes from `self` into `row_group_writer`
/// `schema` builds the schema used by `row_group_writer`
/// Trait describing how to write a record (the implementor) to a row group writer.
///
/// The [`parquet_derive`] crate provides a derive macro [`ParquetRecordWriter`] for this trait
/// for unnested structs.
///
/// The type parameter `T` is used to work around the rust orphan rule
/// when implementing on types such as `&[T]`.
///
/// [`parquet_derive`]: https://crates.io/crates/parquet_derive
/// [`ParquetRecordWriter`]: https://docs.rs/parquet_derive/53.0.0/parquet_derive/derive.ParquetRecordWriter.html
pub trait RecordWriter<T> {
/// Writes from `self` into `row_group_writer`.
fn write_to_row_group<W: std::io::Write + Send>(
&self,
row_group_writer: &mut SerializedRowGroupWriter<W>,
) -> Result<(), ParquetError>;

/// Generated schema
/// Generated schema used by `row_group_writer`
fn schema(&self) -> Result<TypePtr, ParquetError>;
}
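To ground the new trait description, a hedged end-to-end sketch of the derive-based flow it references (struct and field names are illustrative):

```rust
use std::{fs::File, sync::Arc};

use parquet::errors::ParquetError;
use parquet::file::{properties::WriterProperties, writer::SerializedFileWriter};
use parquet::record::RecordWriter;
use parquet_derive::ParquetRecordWriter;

#[derive(ParquetRecordWriter)]
struct Sample {
    id: i64,
    name: String,
}

fn write_rows(rows: &[Sample], file: File) -> Result<(), ParquetError> {
    // schema() derives the parquet schema from the struct fields
    let schema = rows.schema()?;
    let props = Arc::new(WriterProperties::builder().build());
    let mut writer = SerializedFileWriter::new(file, schema, props)?;
    let mut row_group = writer.next_row_group()?;
    rows.write_to_row_group(&mut row_group)?;
    row_group.close()?;
    writer.close()?;
    Ok(())
}
```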
10 changes: 6 additions & 4 deletions parquet_derive/src/lib.rs
@@ -29,8 +29,9 @@ use ::syn::{parse_macro_input, Data, DataStruct, DeriveInput};

mod parquet_field;

/// Derive flat, simple RecordWriter implementations. Works by parsing
/// a struct tagged with `#[derive(ParquetRecordWriter)]` and emitting
/// Derive flat, simple RecordWriter implementations.
///
/// Works by parsing a struct tagged with `#[derive(ParquetRecordWriter)]` and emitting
/// the correct writing code for each field of the struct. Column writers
/// are generated in the order they are defined.
///
@@ -143,8 +144,9 @@ pub fn parquet_record_writer(input: proc_macro::TokenStream) -> proc_macro::Toke
}).into()
}

/// Derive flat, simple RecordReader implementations. Works by parsing
/// a struct tagged with `#[derive(ParquetRecordReader)]` and emitting
/// Derive flat, simple RecordReader implementations.
///
/// Works by parsing a struct tagged with `#[derive(ParquetRecordReader)]` and emitting
/// the correct reading code for each field of the struct. Column readers
/// are generated by matching names in the schema to the names in the struct.
///
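And the matching reader side for `ParquetRecordReader`, sketched under the same assumptions (the derive implements `RecordReader` for `Vec<Sample>`):

```rust
use std::fs::File;

use parquet::errors::ParquetError;
use parquet::file::reader::{FileReader, SerializedFileReader};
use parquet::record::RecordReader;
use parquet_derive::ParquetRecordReader;

#[derive(ParquetRecordReader)]
struct Sample {
    id: i64,
    name: String,
}

fn read_rows(file: File) -> Result<Vec<Sample>, ParquetError> {
    let reader = SerializedFileReader::new(file)?;
    let mut row_group = reader.get_row_group(0)?;
    let mut rows: Vec<Sample> = Vec::new();
    // Read up to 100 records from the first row group into `rows`
    rows.read_from_row_group(&mut *row_group, 100)?;
    Ok(rows)
}
```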