Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make ZSTD default compression for Parquet writes #4726

Merged
merged 4 commits into from
Jun 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ object ParquetAvroIO {
private[scio] val DefaultSchema = null
private[scio] val DefaultNumShards = 0
private[scio] val DefaultSuffix = ".parquet"
private[scio] val DefaultCompression = CompressionCodecName.GZIP
private[scio] val DefaultCompression = CompressionCodecName.ZSTD
private[scio] val DefaultConfiguration = null
private[scio] val DefaultShardNameTemplate = null
private[scio] val DefaultTempDirectory = null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ object ParquetExampleIO {
object WriteParam {
private[tensorflow] val DefaultNumShards = 0
private[tensorflow] val DefaultSuffix = ".parquet"
private[tensorflow] val DefaultCompression = CompressionCodecName.GZIP
private[tensorflow] val DefaultCompression = CompressionCodecName.ZSTD
private[tensorflow] val DefaultConfiguration = null
private[tensorflow] val DefaultShardNameTemplate = null
private[tensorflow] val DefaultTempDirectory = null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ object ParquetTypeIO {
object WriteParam {
private[scio] val DefaultNumShards = 0
private[scio] val DefaultSuffix = ".parquet"
private[scio] val DefaultCompression = CompressionCodecName.GZIP
private[scio] val DefaultCompression = CompressionCodecName.ZSTD
private[scio] val DefaultConfiguration = null
private[scio] val DefaultShardNameTemplate = null
private[scio] val DefaultTempDirectory = null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
* Avro records.
*/
public class ParquetAvroFileOperations<ValueT> extends FileOperations<ValueT> {
static final CompressionCodecName DEFAULT_COMPRESSION = CompressionCodecName.GZIP;
static final CompressionCodecName DEFAULT_COMPRESSION = CompressionCodecName.ZSTD;
private final SerializableSchemaSupplier schemaSupplier;
private final CompressionCodecName compression;
private final SerializableConfiguration conf;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName
import java.nio.channels.{ReadableByteChannel, WritableByteChannel}

object ParquetTypeFileOperations {
val DefaultCompression = CompressionCodecName.GZIP
val DefaultCompression = CompressionCodecName.ZSTD
val DefaultConfiguration: Configuration = null

def apply[T: Coder: ParquetType](): ParquetTypeFileOperations[T] = apply(DefaultCompression)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ public void testUncompressed() throws Exception {

@Test
public void testCompression() throws Exception {
test(Compression.GZIP);
test(Compression.ZSTD);
}

private void test(Compression compression) throws Exception {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,13 +217,13 @@ public void testDisplayData() {
MatcherAssert.assertThat(
displayData, hasDisplayItem("compression", Compression.UNCOMPRESSED.toString()));
MatcherAssert.assertThat(
displayData, hasDisplayItem("compressionCodecName", CompressionCodecName.GZIP.name()));
displayData, hasDisplayItem("compressionCodecName", CompressionCodecName.ZSTD.name()));
MatcherAssert.assertThat(displayData, hasDisplayItem("schema", USER_SCHEMA.getFullName()));
}

private void writeFile(ResourceId file) throws IOException {
final ParquetAvroFileOperations<GenericRecord> fileOperations =
ParquetAvroFileOperations.of(USER_SCHEMA, CompressionCodecName.GZIP);
ParquetAvroFileOperations.of(USER_SCHEMA, CompressionCodecName.ZSTD);
final FileOperations.Writer<GenericRecord> writer = fileOperations.createWriter(file);
for (GenericRecord record : USER_RECORDS) {
writer.write(record);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public void testUncompressed() throws Exception {

@Test
public void testCompressed() throws Exception {
test(Compression.GZIP);
test(Compression.ZSTD);
}

private void test(Compression compression) throws Exception {
Expand Down
4 changes: 3 additions & 1 deletion site/src/main/paradox/io/Parquet.md
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,9 @@ val parquetConf: Configuration = {
Here are some other recommended settings.

- `numShards` - This should be explicitly set so that the size of each output file is smaller than but close to `parquet.block.size`, i.e. 1 GiB. This guarantees that each file contains 1 row group only and reduces seeks.
- `compression` - `SNAPPY` and `GZIP` work out of the box. Snappy is less CPU intensive but has lower compression ratio. In our benchmarks GZIP seem to work better on GCS.
- `compression` - Parquet defaults to ZSTD compression with a level of 3; the compression level can be set to any integer from 1-22 using the configuration option `parquet.compression.codec.zstd.level`. `SNAPPY` and `GZIP` compression types also work out of the box; Snappy is less CPU intensive but has a lower compression ratio. In our benchmarks, GZIP seems to work better than Snappy on GCS.

A full list of Parquet configuration options can be found [here](https://github.com/apache/parquet-mr/blob/master/parquet-hadoop/README.md).

## Parquet Reads in Scio 0.12.0+

Expand Down