Skip to content

Commit

Permalink
refactor(11367): remove layer of indirection using the conditional pa…
Browse files Browse the repository at this point in the history
…rquet feature, instead rely upon regression testing
  • Loading branch information
wiedld committed Jul 22, 2024
1 parent 38124ba commit d972416
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 90 deletions.
100 changes: 12 additions & 88 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -314,85 +314,6 @@ config_namespace! {
}
}

/// Parquet writer defaults used when the `parquet` feature is enabled.
///
/// The values follow the default writer settings of the extern parquet crate,
/// mostly by re-exporting that crate's own constants.
#[cfg(feature = "parquet")]
mod parquet_defaults {
    use parquet::{basic::Compression, file::properties as props};

    // ---- plain `usize` limits -------------------------------------------

    /// Default value for [`props::WriterProperties::data_page_size_limit`]
    pub const DEFAULT_PAGE_SIZE: usize = props::DEFAULT_PAGE_SIZE;

    /// Default value for [`props::WriterProperties::write_batch_size`]
    pub const DEFAULT_WRITE_BATCH_SIZE: usize = props::DEFAULT_WRITE_BATCH_SIZE;

    /// Default value for [`props::WriterProperties::dictionary_page_size_limit`]
    pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize =
        props::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT;

    /// Default value for [`props::WriterProperties::data_page_row_count_limit`]
    pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize =
        props::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT;

    /// Default value for [`props::WriterProperties::max_row_group_size`]
    pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = props::DEFAULT_MAX_ROW_GROUP_SIZE;

    // ---- settings expressed as strings or options ------------------------

    /// Default value for [`props::WriterProperties::writer_version`]
    pub const DEFAULT_WRITER_VERSION: &str = "1.0";

    /// Default value for [`props::WriterProperties::dictionary_enabled`]
    pub const DEFAULT_DICTIONARY_ENABLED: Option<bool> =
        Some(props::DEFAULT_DICTIONARY_ENABLED);

    /// Default value for [`props::WriterProperties::max_statistics_size`]
    pub const DEFAULT_MAX_STATISTICS_SIZE: Option<usize> =
        Some(props::DEFAULT_MAX_STATISTICS_SIZE);

    /// Default value for [`props::WriterProperties::column_index_truncate_length`]
    pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> =
        props::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH;

    // TODO: discuss if we want datafusion to use these defaults from the extern parquet
    // refer to https://github.com/apache/datafusion/issues/11367
    //
    // The constants below are marked `allow(dead_code)` pending that discussion.

    /// Default value for [`props::WriterProperties::statistics_enabled`]
    #[allow(dead_code)]
    pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page");

    /// Default value for [`props::BloomFilterProperties::fpp`]
    #[allow(dead_code)]
    pub const DEFAULT_BLOOM_FILTER_FPP: Option<f64> =
        Some(props::DEFAULT_BLOOM_FILTER_FPP);

    /// Default value for [`props::BloomFilterProperties::ndv`]
    #[allow(dead_code)]
    pub const DEFAULT_BLOOM_FILTER_NDV: Option<u64> =
        Some(props::DEFAULT_BLOOM_FILTER_NDV);

    /// Default value for [`props::WriterProperties::compression`]
    #[allow(dead_code)]
    pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
}

/// Hand-maintained copy of the extern parquet writer settings, compiled in
/// when the `parquet` feature is not enabled so that the crate still builds.
///
/// The copy is required because [`ParquetOptions`] is extended with the
/// `config_namespace` macro, which does not handle internal configuration macros.
#[cfg(not(feature = "parquet"))]
mod parquet_defaults {
    // ---- plain `usize` limits -------------------------------------------

    /// 1024 * 1024.
    pub const DEFAULT_PAGE_SIZE: usize = 1_048_576;
    pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1_024;
    /// Same value as [`DEFAULT_PAGE_SIZE`].
    pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
    pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
    /// 1024 * 1024.
    pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1_048_576;

    // ---- settings expressed as strings or options ------------------------

    pub const DEFAULT_WRITER_VERSION: &str = "1.0";
    pub const DEFAULT_DICTIONARY_ENABLED: Option<bool> = Some(true);
    pub const DEFAULT_MAX_STATISTICS_SIZE: Option<usize> = Some(4_096);
    pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);

    // TODO: discuss if we want datafusion to use these defaults from the extern parquet
    // refer to https://github.com/apache/datafusion/issues/11367
    //
    // The constants below are marked `allow(dead_code)` pending that discussion.

    #[allow(dead_code)]
    pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page");
    #[allow(dead_code)]
    pub const DEFAULT_BLOOM_FILTER_FPP: Option<f64> = Some(0.05_f64);
    #[allow(dead_code)]
    pub const DEFAULT_BLOOM_FILTER_NDV: Option<u64> = Some(1_000_000);
}

config_namespace! {
/// Options for reading and writing parquet files
///
Expand Down Expand Up @@ -436,28 +357,31 @@ config_namespace! {
// and map to parquet::file::properties::WriterProperties

/// (writing) Sets best effort maximum size of data page in bytes
pub data_pagesize_limit: usize, default = parquet_defaults::DEFAULT_PAGE_SIZE
pub data_pagesize_limit: usize, default = 1024 * 1024

/// (writing) Sets write_batch_size in bytes
pub write_batch_size: usize, default = parquet_defaults::DEFAULT_WRITE_BATCH_SIZE
pub write_batch_size: usize, default = 1024

/// (writing) Sets parquet writer version
/// valid values are "1.0" and "2.0"
pub writer_version: String, default = parquet_defaults::DEFAULT_WRITER_VERSION.to_string()
pub writer_version: String, default = "1.0".to_string()

/// (writing) Sets default parquet compression codec.
/// Valid values are: uncompressed, snappy, gzip(level),
/// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
/// These values are not case sensitive. If NULL, uses
/// default parquet writer setting
///
/// Note that this default setting is not the same as
/// the default parquet writer setting.
pub compression: Option<String>, default = Some("zstd(3)".into())

/// (writing) Sets if dictionary encoding is enabled. If NULL, uses
/// default parquet writer setting
pub dictionary_enabled: Option<bool>, default = parquet_defaults::DEFAULT_DICTIONARY_ENABLED
pub dictionary_enabled: Option<bool>, default = Some(true)

/// (writing) Sets best effort maximum dictionary page size, in bytes
pub dictionary_page_size_limit: usize, default = parquet_defaults::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
pub dictionary_page_size_limit: usize, default = 1024 * 1024

/// (writing) Sets if statistics are enabled for any column
/// Valid values are: "none", "chunk", and "page"
Expand All @@ -467,21 +391,21 @@ config_namespace! {

/// (writing) Sets max statistics size for any column. If NULL, uses
/// default parquet writer setting
pub max_statistics_size: Option<usize>, default = parquet_defaults::DEFAULT_MAX_STATISTICS_SIZE
pub max_statistics_size: Option<usize>, default = Some(4096)

/// (writing) Target maximum number of rows in each row group (defaults to 1M
/// rows). Writing larger row groups requires more memory to write, but
/// can get better compression and be faster to read.
pub max_row_group_size: usize, default = parquet_defaults::DEFAULT_MAX_ROW_GROUP_SIZE
pub max_row_group_size: usize, default = 1024 * 1024

/// (writing) Sets "created by" property
pub created_by: String, default = concat!("datafusion version ", env!("CARGO_PKG_VERSION")).into()

/// (writing) Sets column index truncate length
pub column_index_truncate_length: Option<usize>, default = parquet_defaults::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH
pub column_index_truncate_length: Option<usize>, default = Some(64)

/// (writing) Sets best effort maximum number of rows in data page
pub data_page_row_count_limit: usize, default = parquet_defaults::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT
pub data_page_row_count_limit: usize, default = 20_000

/// (writing) Sets default encoding for any column.
/// Valid values are: plain, plain_dictionary, rle,
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/information_schema.slt
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ datafusion.execution.parquet.bloom_filter_ndv NULL (writing) Sets bloom filter n
datafusion.execution.parquet.bloom_filter_on_read true (writing) Use any available bloom filters when reading parquet files
datafusion.execution.parquet.bloom_filter_on_write false (writing) Write bloom filters for all columns when creating parquet files
datafusion.execution.parquet.column_index_truncate_length 64 (writing) Sets column index truncate length
datafusion.execution.parquet.compression zstd(3) (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting
datafusion.execution.parquet.compression zstd(3) (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting.
datafusion.execution.parquet.created_by datafusion (writing) Sets "created by" property
datafusion.execution.parquet.data_page_row_count_limit 20000 (writing) Sets best effort maximum number of rows in data page
datafusion.execution.parquet.data_pagesize_limit 1048576 (writing) Sets best effort maximum size of data page in bytes
Expand Down
2 changes: 1 addition & 1 deletion docs/source/user-guide/configs.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
| datafusion.execution.parquet.data_pagesize_limit | 1048576 | (writing) Sets best effort maximum size of data page in bytes |
| datafusion.execution.parquet.write_batch_size | 1024 | (writing) Sets write_batch_size in bytes |
| datafusion.execution.parquet.writer_version | 1.0 | (writing) Sets parquet writer version valid values are "1.0" and "2.0" |
| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting |
| datafusion.execution.parquet.compression | zstd(3) | (writing) Sets default parquet compression codec. Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting Note that this default setting is not the same as the default parquet writer setting. |
| datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting |
| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes |
| datafusion.execution.parquet.statistics_enabled | NULL | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting |
Expand Down

0 comments on commit d972416

Please sign in to comment.