From b309525a318be829596b59c2b4fe3248f97de440 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 17 Sep 2024 11:18:51 -0400 Subject: [PATCH] Minor: improve `GroupsAccumulatorAdapter` docs (#12502) --- .../src/aggregate/groups_accumulator.rs | 52 +++++++++++++++++-- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs index b5eb36c3fac7..92dd91bd86bc 100644 --- a/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs +++ b/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs @@ -42,6 +42,52 @@ use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator}; /// they are not as fast as a specialized `GroupsAccumulator`. This /// interface bridges the gap so the group by operator only operates /// in terms of [`Accumulator`]. +/// +/// Internally, this adapter creates a new [`Accumulator`] for each group which +/// stores the state for that group. This both requires an allocation for each +/// Accumulator, internal indices, as well as whatever internal allocations the +/// Accumulator itself requires. +/// +/// For example, a `MinAccumulator` that computes the minimum string value with +/// a [`ScalarValue::Utf8`]. That will require at least two allocations per group +/// (one for the `MinAccumulator` and one for the `ScalarValue::Utf8`). +/// +/// ```text +/// ┌─────────────────────────────────┐ +/// │MinAccumulator { │ +/// ┌─────▶│ min: ScalarValue::Utf8("A") │───────┐ +/// │ │} │ │ +/// │ └─────────────────────────────────┘ └───────▶ "A" +/// ┌─────┐ │ ┌─────────────────────────────────┐ +/// │ 0 │─────┘ │MinAccumulator { │ +/// ├─────┤ ┌─────▶│ min: ScalarValue::Utf8("Z") │───────────────▶ "Z" +/// │ 1 │─────┘ │} │ +/// └─────┘ └─────────────────────────────────┘ ... +/// ... ... +/// ┌─────┐ ┌────────────────────────────────┐ +/// │ N-2 │ │MinAccumulator { │ +/// ├─────┤ │ min: ScalarValue::Utf8("A") │────────────────▶ "A" +/// │ N-1 │─────┐ │} │ +/// └─────┘ │ └────────────────────────────────┘ +/// │ ┌────────────────────────────────┐ ┌───────▶ "Q" +/// │ │MinAccumulator { │ │ +/// └─────▶│ min: ScalarValue::Utf8("Q") │────────┘ +/// │} │ +/// └────────────────────────────────┘ +/// +/// +/// Logical group Current Min/Max value for that group stored +/// number as a ScalarValue which points to an +/// indivdually allocated String +/// +///``` +/// +/// # Optimizations +/// +/// The adapter minimizes the number of calls to [`Accumulator::update_batch`] +/// by first collecting the input rows for each group into a contiguous array +/// using [`compute::take`] +/// pub struct GroupsAccumulatorAdapter { factory: Box Result> + Send>, @@ -61,9 +107,9 @@ struct AccumulatorState { /// [`Accumulator`] that stores the per-group state accumulator: Box, - // scratch space: indexes in the input array that will be fed to - // this accumulator. Stores indexes as `u32` to match the arrow - // `take` kernel input. + /// scratch space: indexes in the input array that will be fed to + /// this accumulator. Stores indexes as `u32` to match the arrow + /// `take` kernel input. indices: Vec, }