From 13902836eaecc2c99d603b4cf41281e6d4671a85 Mon Sep 17 00:00:00 2001
From: Ruihang Xia
Date: Thu, 19 Sep 2024 18:55:09 +0800
Subject: [PATCH] Rephrase doc comment (#6421)

* docs: rephrase some

Signed-off-by: Ruihang Xia

* fix all warnings

Signed-off-by: Ruihang Xia

* big letter at the beginning

Signed-off-by: Ruihang Xia

* Apply suggestions from code review

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>

* Update arrow/src/pyarrow.rs

Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>

* Update arrow-array/src/types.rs

Co-authored-by: Matthijs Brobbel

---------

Signed-off-by: Ruihang Xia
Co-authored-by: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com>
Co-authored-by: Matthijs Brobbel
---
 arrow-arith/src/arity.rs                      | 17 +++++++++++------
 arrow-arith/src/temporal.rs                   |  8 ++++++--
 .../src/builder/generic_bytes_view_builder.rs |  3 ++-
 arrow-array/src/ffi_stream.rs                 |  1 +
 arrow-array/src/types.rs                      |  4 +++-
 arrow-buffer/src/builder/null.rs              |  1 +
 arrow-buffer/src/util/bit_mask.rs             |  6 ++++--
 arrow-data/src/data.rs                        |  8 +++++---
 arrow-flight/src/lib.rs                       |  3 +--
 arrow-flight/src/sql/server.rs                |  1 +
 arrow-ipc/src/writer.rs                       |  4 +++-
 arrow-select/src/filter.rs                    |  6 ++++--
 arrow-string/src/length.rs                    |  1 +
 arrow/src/pyarrow.rs                          |  7 +++++--
 arrow/src/util/bench_util.rs                  |  4 +++-
 parquet/src/basic.rs                          |  2 ++
 parquet/src/bloom_filter/mod.rs               |  7 ++++---
 parquet/src/encodings/decoding.rs             |  4 ++++
 parquet/src/lib.rs                            |  4 ++--
 parquet/src/record/record_reader.rs           |  3 ++-
 parquet/src/record/record_writer.rs           | 13 ++++++++++---
 parquet_derive/src/lib.rs                     | 10 ++++++----
 22 files changed, 81 insertions(+), 36 deletions(-)

diff --git a/arrow-arith/src/arity.rs b/arrow-arith/src/arity.rs
index 9cf4453d7fca..bb983e1225ac 100644
--- a/arrow-arith/src/arity.rs
+++ b/arrow-arith/src/arity.rs
@@ -332,8 +332,10 @@ where
     Ok(Ok(PrimitiveArray::<T>::from(array_data)))
 }
 
-/// Applies the provided fallible binary operation across `a` and `b`, returning any error,
-/// and collecting the results into a [`PrimitiveArray`]. If any index is null in either `a`
+/// Applies the provided fallible binary operation across `a` and `b`.
+///
+/// This will return any error encountered, or collect the results into
+/// a [`PrimitiveArray`]. If any index is null in either `a`
 /// or `b`, the corresponding index in the result will also be null
 ///
 /// Like [`try_unary`] the function is only evaluated for non-null indices
@@ -384,12 +386,15 @@ where
 }
 
 /// Applies the provided fallible binary operation across `a` and `b` by mutating the mutable
-/// [`PrimitiveArray`] `a` with the results, returning any error. If any index is null in
-/// either `a` or `b`, the corresponding index in the result will also be null
+/// [`PrimitiveArray`] `a` with the results.
 ///
-/// Like [`try_unary`] the function is only evaluated for non-null indices
+/// Returns any error encountered, or collects the results into a [`PrimitiveArray`] as the
+/// return value. If any index is null in either `a` or `b`, the corresponding index in the
+/// result will also be null.
+///
+/// Like [`try_unary`] the function is only evaluated for non-null indices.
 ///
-/// See [`binary_mut`] for errors and buffer reuse information
+/// See [`binary_mut`] for errors and buffer reuse information.
 pub fn try_binary_mut<T, F>(
     a: PrimitiveArray<T>,
     b: &PrimitiveArray<T>,
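To make the rewritten contract concrete, here is a minimal sketch (not part of the patch) of `try_binary` skipping null slots and surfacing an error from the fallible op. It assumes arrow 53-era `arrow-arith`, `arrow-array`, and `arrow-schema` APIs; the overflow message is illustrative:

```rust
use arrow_arith::arity::try_binary;
use arrow_array::types::Int32Type;
use arrow_array::{Int32Array, PrimitiveArray};
use arrow_schema::ArrowError;

fn main() {
    let a = Int32Array::from(vec![Some(1), None, Some(i32::MAX)]);
    let b = Int32Array::from(vec![Some(10), Some(20), Some(1)]);

    // The op is evaluated only where both inputs are valid: index 1 is
    // null in `a`, so it stays null without calling the closure.
    let result: Result<PrimitiveArray<Int32Type>, ArrowError> =
        try_binary(&a, &b, |x, y| {
            x.checked_add(y)
                .ok_or_else(|| ArrowError::ComputeError("overflow".to_string()))
        });

    // The overflow at index 2 is returned as an error instead of wrapping.
    assert!(result.is_err());
}
```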
diff --git a/arrow-arith/src/temporal.rs b/arrow-arith/src/temporal.rs
index 5f3eeb325104..09d690d3237c 100644
--- a/arrow-arith/src/temporal.rs
+++ b/arrow-arith/src/temporal.rs
@@ -666,6 +666,7 @@ impl<T: Datelike> ChronoDateExt for T {
 
 /// Parse the given string into a string representing a fixed offset that is correct as of the given
 /// UTC NaiveDateTime.
+///
 /// Note that the offset is a function of time and can vary depending on whether daylight savings is
 /// in effect or not. e.g. Australia/Sydney is +10:00 or +11:00 depending on DST.
 #[deprecated(note = "Use arrow_array::timezone::Tz instead")]
@@ -811,6 +812,7 @@ where
 }
 
 /// Extracts the day of a given temporal array as an array of integers.
+///
 /// If the given array isn't a temporal primitive or dictionary array,
 /// an `Err` will be returned.
 #[deprecated(since = "51.0.0", note = "Use `date_part` instead")]
@@ -828,7 +830,8 @@ where
     date_part_primitive(array, DatePart::Day)
 }
 
-/// Extracts the day of year of a given temporal array as an array of integers
+/// Extracts the day of year of a given temporal array as an array of integers.
+///
 /// The day of year ranges from 1 to 366.
 /// If the given array isn't a temporal primitive or dictionary array,
 /// an `Err` will be returned.
@@ -837,7 +840,8 @@ pub fn doy_dyn(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
     date_part(array, DatePart::DayOfYear)
 }
 
-/// Extracts the day of year of a given temporal primitive array as an array of integers
+/// Extracts the day of year of a given temporal primitive array as an array of integers.
+///
 /// The day of year ranges from 1 to 366
 #[deprecated(since = "51.0.0", note = "Use `date_part` instead")]
 pub fn doy<T>(array: &PrimitiveArray<T>) -> Result<Int32Array, ArrowError>
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs
index 3a9cf17c028e..09277c679c16 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -515,7 +515,8 @@ fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 {
     u128::from_le_bytes(view_buffer)
 }
 
-/// Create a view based on the given data, block id and offset
+/// Create a view based on the given data, block id and offset.
+///
 /// Note that the code below is carefully examined with x86_64 assembly code:
 /// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes a function call (i.e., is not inlined),
 /// which slows things down.
diff --git a/arrow-array/src/ffi_stream.rs b/arrow-array/src/ffi_stream.rs
index 6f3405ead7b0..db44ebad1c22 100644
--- a/arrow-array/src/ffi_stream.rs
+++ b/arrow-array/src/ffi_stream.rs
@@ -275,6 +275,7 @@ fn get_error_code(err: &ArrowError) -> i32 {
 }
 
 /// A `RecordBatchReader` which imports Arrays from `FFI_ArrowArrayStream`.
+///
 /// Struct used to fetch `RecordBatch` from the C Stream Interface.
 /// Its main responsibility is to expose `RecordBatchReader` functionality
 /// that requires [FFI_ArrowArrayStream].
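Since the temporal hunks above sit next to several `#[deprecated]` kernels, a short sketch of the `date_part` replacement may help; it assumes arrow 53-era crates, and the concrete day count is only an illustration (19723 days after the Unix epoch is 2024-01-01):

```rust
use arrow_arith::temporal::{date_part, DatePart};
use arrow_array::cast::AsArray;
use arrow_array::types::Int32Type;
use arrow_array::{Array, Date32Array};

fn main() -> Result<(), arrow_schema::ArrowError> {
    // Date32 stores days since the Unix epoch; 19723 = 2024-01-01.
    let dates = Date32Array::from(vec![Some(19723), None]);

    // `date_part` replaces the deprecated `day`/`doy`/`doy_dyn` kernels.
    let doy = date_part(&dates, DatePart::DayOfYear)?;
    let doy = doy.as_primitive::<Int32Type>();

    assert_eq!(doy.value(0), 1); // January 1st
    assert!(doy.is_null(1)); // nulls propagate
    Ok(())
}
```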
diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs
index 550d1aadf3fa..b39c9c40311b 100644
--- a/arrow-array/src/types.rs
+++ b/arrow-array/src/types.rs
@@ -50,7 +50,9 @@ impl BooleanType {
     pub const DATA_TYPE: DataType = DataType::Boolean;
 }
 
-/// Trait for [primitive values], bridging the dynamic-typed nature of Arrow
+/// Trait for [primitive values].
+///
+/// This trait bridges the dynamic-typed nature of Arrow
 /// (via [`DataType`]) with the static-typed nature of Rust types
 /// ([`ArrowNativeType`]) for all types that implement [`ArrowNativeType`].
 ///
diff --git a/arrow-buffer/src/builder/null.rs b/arrow-buffer/src/builder/null.rs
index a1cea6ef2cca..ce5e1dc34aa0 100644
--- a/arrow-buffer/src/builder/null.rs
+++ b/arrow-buffer/src/builder/null.rs
@@ -18,6 +18,7 @@
 use crate::{BooleanBufferBuilder, MutableBuffer, NullBuffer};
 
 /// Builder for creating the null bit buffer.
+///
 /// This builder only materializes the buffer when we append `false`.
 /// If you only append `true`s to the builder, [`finish`](#method.finish)
 /// will return `None`.
diff --git a/arrow-buffer/src/util/bit_mask.rs b/arrow-buffer/src/util/bit_mask.rs
index 2074f0fab988..e9c80e097f82 100644
--- a/arrow-buffer/src/util/bit_mask.rs
+++ b/arrow-buffer/src/util/bit_mask.rs
@@ -19,8 +19,10 @@
 
 use crate::bit_util::ceil;
 
-/// Sets all bits on `write_data` in the range `[offset_write..offset_write+len]` to be equal to the
-/// bits in `data` in the range `[offset_read..offset_read+len]`
+/// Utility function to set bits in a slice of bytes.
+///
+/// This will set all bits on `write_data` in the range `[offset_write..offset_write+len]`
+/// to be equal to the bits in `data` in the range `[offset_read..offset_read+len]`,
 /// returning the number of `0` bits in `data[offset_read..offset_read+len]`.
 /// `offset_write`, `offset_read`, and `len` are in terms of bits.
 pub fn set_bits(
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index a14bc4873628..33cbc897a6c1 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -161,9 +161,11 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
     }
 }
 
-/// A generic representation of Arrow array data which encapsulates common attributes and
-/// operations for Arrow array. Specific operations for different arrays types (e.g.,
-/// primitive, list, struct) are implemented in `Array`.
+/// A generic representation of Arrow array data which encapsulates common attributes
+/// and operations for Arrow arrays.
+///
+/// Specific operations for different array types (e.g., primitive, list, struct)
+/// are implemented in `Array`.
 ///
 /// # Memory Layout
 ///
diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs
index 1180264e5ddd..ff9e387dab0b 100644
--- a/arrow-flight/src/lib.rs
+++ b/arrow-flight/src/lib.rs
@@ -50,8 +50,7 @@ use std::{fmt, ops::Deref};
 
 type ArrowResult<T> = std::result::Result<T, ArrowError>;
 
-#[allow(clippy::derive_partial_eq_without_eq)]
-
+#[allow(clippy::all)]
 mod gen {
     include!("arrow.flight.protocol.rs");
 }
diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs
index e348367a91eb..37b2885b5aff 100644
--- a/arrow-flight/src/sql/server.rs
+++ b/arrow-flight/src/sql/server.rs
@@ -979,6 +979,7 @@ fn arrow_error_to_status(err: arrow_schema::ArrowError) -> Status {
 
 /// A wrapper around [`Streaming`] that allows "peeking" at the
 /// message at the front of the stream without consuming it.
+///
 /// This is needed because sometimes the first message in the stream will contain
 /// a [`FlightDescriptor`] in addition to potentially any data, and the dispatch logic
 /// must inspect this information.
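The `NullBufferBuilder` behaviour documented in the null.rs hunk above (nothing is materialized until a `false` arrives) can be demonstrated directly. A minimal sketch, assuming arrow 53-era `arrow-buffer`:

```rust
use arrow_buffer::NullBufferBuilder;

fn main() {
    // Only valid (`true`) entries appended: no bitmap is materialized,
    // so `finish` returns `None`.
    let mut all_valid = NullBufferBuilder::new(3);
    all_valid.append_non_null();
    all_valid.append_non_null();
    all_valid.append_non_null();
    assert!(all_valid.finish().is_none());

    // A single `false` (null) forces the bitmap to materialize.
    let mut with_null = NullBufferBuilder::new(3);
    with_null.append_non_null();
    with_null.append_null();
    with_null.append_non_null();
    let nulls = with_null.finish().expect("bitmap was materialized");
    assert_eq!(nulls.null_count(), 1);
}
```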
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index b09dcdc5029b..6ef70cdeaa2c 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -710,7 +710,9 @@ fn into_zero_offset_run_array<R: RunEndIndexType>(
 }
 
 /// Keeps track of dictionaries that have been written, to avoid emitting the same dictionary
-/// multiple times. Can optionally error if an update to an existing dictionary is attempted, which
+/// multiple times.
+///
+/// Can optionally error if an update to an existing dictionary is attempted, which
 /// isn't allowed in the `FileWriter`.
 pub struct DictionaryTracker {
     written: HashMap<i64, ArrayData>,
diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index e07b03d1f276..e59ad50dd3f9 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -42,8 +42,9 @@ use arrow_schema::*;
 const FILTER_SLICES_SELECTIVITY_THRESHOLD: f64 = 0.8;
 
 /// An iterator of `(usize, usize)` each representing an interval
-/// `[start, end)` whose slots of a bitmap [Buffer] are true. Each
-/// interval corresponds to a contiguous region of memory to be
+/// `[start, end)` whose slots of a bitmap [Buffer] are true.
+///
+/// Each interval corresponds to a contiguous region of memory to be
 /// "taken" from an array to be filtered.
 ///
 /// ## Notes:
@@ -117,6 +118,7 @@ fn filter_count(filter: &BooleanArray) -> usize {
 pub type Filter<'a> = Box<dyn Fn(&ArrayData) -> ArrayData + 'a>;
 
 /// Returns a prepared function optimized to filter multiple arrays.
+///
 /// Creating this function requires time, but using it is faster than [filter] when the
 /// same filter needs to be applied to multiple arrays (e.g. a multi-column `RecordBatch`).
 /// WARNING: the nulls of `filter` are ignored and the value in its slot is considered.
diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs
index 82fb2e0d109b..97f876a9f953 100644
--- a/arrow-string/src/length.rs
+++ b/arrow-string/src/length.rs
@@ -45,6 +45,7 @@ fn bit_length_impl(
 }
 
 /// Returns an array of Int32/Int64 denoting the length of each value in the array.
+///
 /// For list array, length is the number of elements in each list.
 /// For string array and binary array, length is the number of bytes of each value.
 ///
diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs
index a7b593799835..6ff6df01c454 100644
--- a/arrow/src/pyarrow.rs
+++ b/arrow/src/pyarrow.rs
@@ -18,6 +18,7 @@
 //! Pass Arrow objects from and to PyArrow, using Arrow's
 //! [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
 //! and [pyo3](https://docs.rs/pyo3/latest/pyo3/).
+//!
 //! For the underlying implementation, see the [ffi] module.
 //!
 //! One can use these to write Python functions that take and return PyArrow
@@ -472,8 +473,10 @@ impl IntoPyArrow for ArrowArrayStreamReader {
     }
 }
 
-/// A newtype wrapper. When wrapped around a type `T: FromPyArrow`, it
-/// implements `FromPyObject` for the PyArrow objects. When wrapped around a
+/// A newtype wrapper for types implementing [`FromPyArrow`] or [`IntoPyArrow`].
+///
+/// When wrapped around a type `T: FromPyArrow`, it
+/// implements [`FromPyObject`] for the PyArrow objects. When wrapped around a
 /// `T: IntoPyArrow`, it implements `IntoPy<PyObject>` for the wrapped type.
 #[derive(Debug)]
 pub struct PyArrowType<T>(pub T);
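The prepared-filter doc in the arrow-select hunk above is easiest to read next to a usage sketch. This one assumes the arrow 53-era `build_filter` API (newer code may prefer `FilterBuilder`); the data is illustrative:

```rust
use arrow_array::{Array, BooleanArray, Int32Array, StringArray};
use arrow_select::filter::build_filter;

fn main() -> Result<(), arrow_schema::ArrowError> {
    let predicate = BooleanArray::from(vec![true, false, true]);

    // Build once, then reuse across every column of a batch.
    let filter = build_filter(&predicate)?;

    let ints = Int32Array::from(vec![1, 2, 3]);
    let strs = StringArray::from(vec!["a", "b", "c"]);

    let ints = Int32Array::from(filter(&ints.to_data()));
    let strs = StringArray::from(filter(&strs.to_data()));

    assert_eq!(ints.value(0), 1);
    assert_eq!(ints.value(1), 3);
    assert_eq!(strs.value(1), "c");
    Ok(())
}
```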
diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs
index 2561c925aaec..cd615aa73383 100644
--- a/arrow/src/util/bench_util.rs
+++ b/arrow/src/util/bench_util.rs
@@ -108,7 +108,9 @@ where
         .collect()
 }
 
-/// Creates a random (but fixed-seeded) string array of a given size and null density, strings have a random length
+/// Creates a random (but fixed-seeded) string array of a given size and null density.
+///
+/// Strings have a random length
 /// between 0 and 400 alphanumeric characters. `0..400` is chosen to cover a wide range of common string lengths,
 /// which have a dramatic impact on performance of some queries, e.g. LIKE/ILIKE/regex.
 pub fn create_string_array<Offset: OffsetSizeTrait>(
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index 02c2f44f60c3..8fde542f59c8 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -39,6 +39,7 @@ pub use crate::format::{
 // Mirrors `parquet::Type`
 
 /// Types supported by Parquet.
+///
 /// These physical types are intended to be used in combination with the encodings to
 /// control the on disk storage format.
 /// For example INT16 is not included as a type since a good encoding of INT32
@@ -60,6 +61,7 @@ pub enum Type {
 // Mirrors `parquet::ConvertedType`
 
 /// Common types (converted types) used by frameworks when using Parquet.
+///
 /// This helps map types in those frameworks to the base types in Parquet.
 /// This is only metadata and not needed to read or write the data.
 ///
diff --git a/parquet/src/bloom_filter/mod.rs b/parquet/src/bloom_filter/mod.rs
index a8d68d4b6442..f98111416f6a 100644
--- a/parquet/src/bloom_filter/mod.rs
+++ b/parquet/src/bloom_filter/mod.rs
@@ -181,9 +181,10 @@ impl std::ops::IndexMut<usize> for Block {
     }
 }
 
-/// A split block Bloom filter. The creation of this structure is based on the
-/// [`crate::file::properties::BloomFilterProperties`] struct set via [`crate::file::properties::WriterProperties`] and
-/// is thus hidden by default.
+/// A split block Bloom filter.
+///
+/// The creation of this structure is based on the [`crate::file::properties::BloomFilterProperties`]
+/// struct set via [`crate::file::properties::WriterProperties`] and is thus hidden by default.
 #[derive(Debug, Clone)]
 pub struct Sbbf(Vec<Block>);
 
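As the `Sbbf` doc above notes, user code never builds the filter directly; it only opts in through writer properties. A hedged sketch of that configuration path, assuming the parquet 53-era properties API (the method names here are from memory and worth double-checking; the column name is hypothetical):

```rust
use parquet::file::properties::WriterProperties;
use parquet::schema::types::ColumnPath;

fn main() {
    // The Sbbf itself is constructed internally during writing; these
    // properties merely enable and tune it.
    let props = WriterProperties::builder()
        .set_bloom_filter_enabled(true)
        .set_bloom_filter_fpp(0.01) // target false-positive probability
        .set_bloom_filter_ndv(1_000) // expected number of distinct values
        .build();

    let col = ColumnPath::from("id"); // hypothetical column name
    assert!(props.bloom_filter_properties(&col).is_some());
}
```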
diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs
index b5217d02ff09..e7f437304b7a 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -273,6 +273,7 @@ pub struct PlainDecoderDetails {
 }
 
 /// Plain decoding that supports all types.
+///
 /// Values are encoded back to back. For native types, data is encoded as little endian.
 /// Floating point types are encoded in IEEE.
 /// See [`PlainEncoder`](crate::encoding::PlainEncoder) for more information.
@@ -333,6 +334,7 @@ impl<T: DataType> Decoder<T> for PlainDecoder<T> {
 // RLE_DICTIONARY/PLAIN_DICTIONARY Decoding
 
 /// Dictionary decoder.
+///
 /// The dictionary encoding builds a dictionary of values encountered in a given column.
 /// The dictionary is stored in a dictionary page per column chunk.
 /// See [`DictEncoder`](crate::encoding::DictEncoder) for more information.
@@ -824,6 +826,7 @@ where
 }
 
 // DELTA_LENGTH_BYTE_ARRAY Decoding
 
 /// Delta length byte array decoder.
+///
 /// Only applied to byte arrays to separate the length values and the data; the lengths
 /// are encoded using DELTA_BINARY_PACKED encoding.
 /// See [`DeltaLengthByteArrayEncoder`](crate::encoding::DeltaLengthByteArrayEncoder)
@@ -952,6 +955,7 @@ impl<T: DataType> Decoder<T> for DeltaLengthByteArrayDecoder<T> {
 
 // DELTA_BYTE_ARRAY Decoding
 
 /// Delta byte array decoder.
+///
 /// Prefix lengths are encoded using `DELTA_BINARY_PACKED` encoding. Suffixes are stored
 /// using `DELTA_LENGTH_BYTE_ARRAY` encoding.
 /// See [`DeltaByteArrayEncoder`](crate::encoding::DeltaByteArrayEncoder) for more
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 543c629d3425..a54d4a427635 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -116,8 +116,8 @@ pub mod basic;
 ///
 /// [parquet.thrift]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
 // see parquet/CONTRIBUTING.md for instructions on regenerating
-#[allow(clippy::derivable_impls, clippy::match_single_binding)]
-// Don't try and format auto generated code
+// Don't lint or format auto-generated code
+#[allow(clippy::all)]
 #[rustfmt::skip]
 pub mod format;
 
diff --git a/parquet/src/record/record_reader.rs b/parquet/src/record/record_reader.rs
index bcfeb95dcdf4..cfaf14a3d6f8 100644
--- a/parquet/src/record/record_reader.rs
+++ b/parquet/src/record/record_reader.rs
@@ -18,7 +18,8 @@
 use super::super::errors::ParquetError;
 use super::super::file::reader::RowGroupReader;
 
-/// read up to `max_records` records from `row_group_reader` into `self`
+/// Read up to `max_records` records from `row_group_reader` into `self`.
+///
 /// The type parameter `T` is used to work around the Rust orphan rule
 /// when implementing on types such as `Vec<T>`.
 pub trait RecordReader<T> {
diff --git a/parquet/src/record/record_writer.rs b/parquet/src/record/record_writer.rs
index 0b2b95ef7dea..56e0aa490e4d 100644
--- a/parquet/src/record/record_writer.rs
+++ b/parquet/src/record/record_writer.rs
@@ -20,16 +20,23 @@ use crate::schema::types::TypePtr;
 use super::super::errors::ParquetError;
 use super::super::file::writer::SerializedRowGroupWriter;
 
-/// `write_to_row_group` writes from `self` into `row_group_writer`
-/// `schema` builds the schema used by `row_group_writer`
+/// Trait describing how to write a record (the implementor) to a row group writer.
+///
+/// The [`parquet_derive`] crate provides a derive macro [`ParquetRecordWriter`] for this
+/// trait for unnested structs.
+///
 /// The type parameter `T` is used to work around the Rust orphan rule
 /// when implementing on types such as `&[T]`.
+///
+/// [`parquet_derive`]: https://crates.io/crates/parquet_derive
+/// [`ParquetRecordWriter`]: https://docs.rs/parquet_derive/53.0.0/parquet_derive/derive.ParquetRecordWriter.html
 pub trait RecordWriter<T> {
+    /// Writes from `self` into `row_group_writer`.
     fn write_to_row_group<W: std::io::Write + Send>(
         &self,
         row_group_writer: &mut SerializedRowGroupWriter<W>,
     ) -> Result<(), ParquetError>;
 
-    /// Generated schema
+    /// Generated schema used by `row_group_writer`
     fn schema(&self) -> Result<TypePtr, ParquetError>;
 }
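To ground the rewritten `RecordWriter` docs, here is a hedged end-to-end sketch using the derive macro they now link to; it assumes matching parquet/parquet_derive versions, and the struct, file path, and data are illustrative:

```rust
use std::sync::Arc;

use parquet::file::properties::WriterProperties;
use parquet::file::writer::SerializedFileWriter;
use parquet::record::RecordWriter;
use parquet_derive::ParquetRecordWriter;

#[derive(ParquetRecordWriter)]
struct Row {
    id: i64,
    name: String,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let rows = vec![
        Row { id: 1, name: "a".into() },
        Row { id: 2, name: "b".into() },
    ];
    // The derived impl is on `&[Row]`, per the orphan-rule note above.
    let rows = &rows[..];

    let file = std::fs::File::create("/tmp/rows.parquet")?;
    let props = Arc::new(WriterProperties::builder().build());
    let mut writer = SerializedFileWriter::new(file, rows.schema()?, props)?;

    let mut row_group = writer.next_row_group()?;
    rows.write_to_row_group(&mut row_group)?;
    row_group.close()?;
    writer.close()?;
    Ok(())
}
```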
diff --git a/parquet_derive/src/lib.rs b/parquet_derive/src/lib.rs
index 9c93e2cca978..038d8fa446e5 100644
--- a/parquet_derive/src/lib.rs
+++ b/parquet_derive/src/lib.rs
@@ -29,8 +29,9 @@ use ::syn::{parse_macro_input, Data, DataStruct, DeriveInput};
 
 mod parquet_field;
 
-/// Derive flat, simple RecordWriter implementations. Works by parsing
-/// a struct tagged with `#[derive(ParquetRecordWriter)]` and emitting
+/// Derive flat, simple RecordWriter implementations.
+///
+/// Works by parsing a struct tagged with `#[derive(ParquetRecordWriter)]` and emitting
 /// the correct writing code for each field of the struct. Column writers
 /// are generated in the order they are defined.
 ///
@@ -143,8 +144,9 @@ pub fn parquet_record_writer(input: proc_macro::TokenStream) -> proc_macro::Toke
     }).into()
 }
 
-/// Derive flat, simple RecordReader implementations. Works by parsing
-/// a struct tagged with `#[derive(ParquetRecordReader)]` and emitting
+/// Derive flat, simple RecordReader implementations.
+///
+/// Works by parsing a struct tagged with `#[derive(ParquetRecordReader)]` and emitting
 /// the correct reading code for each field of the struct. Column readers
 /// are generated by matching names in the schema to the names in the struct.
 ///
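For symmetry with the writer sketch earlier, a matching `ParquetRecordReader` example, under the same assumptions and reading the file that sketch produced:

```rust
use parquet::file::reader::{FileReader, SerializedFileReader};
use parquet::record::RecordReader;
use parquet_derive::ParquetRecordReader;

#[derive(ParquetRecordReader)]
struct Row {
    id: i64,
    name: String,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = std::fs::File::open("/tmp/rows.parquet")?;
    let reader = SerializedFileReader::new(file)?;
    let mut row_group = reader.get_row_group(0)?;

    // Reads up to `max_records` (here 2) records into the Vec,
    // per the `RecordReader` doc rewritten in this patch.
    let mut rows: Vec<Row> = Vec::new();
    rows.read_from_row_group(&mut *row_group, 2)?;
    assert_eq!(rows.len(), 2);
    Ok(())
}
```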