[HUDI-4319] Fixed Parquet's PLAIN_DICTIONARY encoding not being applied when bulk-inserting (#5966)
* Fixed Dictionary encoding config not being properly propagated to Parquet writer (making it unable to apply it, substantially bloating the storage footprint)
This commit is contained in:
@@ -18,21 +18,20 @@
|
||||
|
||||
package org.apache.hudi.common.table.log.block;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
||||
import org.apache.hudi.common.fs.inline.InLineFSUtils;
|
||||
import org.apache.hudi.common.fs.inline.InLineFileSystem;
|
||||
import org.apache.hudi.common.util.ClosableIterator;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ParquetReaderIterator;
|
||||
import org.apache.hudi.io.storage.HoodieAvroParquetConfig;
|
||||
import org.apache.hudi.io.storage.HoodieParquetStreamWriter;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
||||
import org.apache.hudi.common.fs.inline.InLineFSUtils;
|
||||
import org.apache.hudi.common.fs.inline.InLineFileSystem;
|
||||
import org.apache.hudi.common.util.ClosableIterator;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ParquetReaderIterator;
|
||||
import org.apache.hudi.io.storage.HoodieParquetConfig;
|
||||
import org.apache.hudi.io.storage.HoodieParquetStreamWriter;
|
||||
import org.apache.parquet.avro.AvroParquetReader;
|
||||
import org.apache.parquet.avro.AvroReadSupport;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
@@ -43,7 +42,6 @@ import org.apache.parquet.hadoop.util.HadoopInputFile;
|
||||
import org.apache.parquet.io.InputFile;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
@@ -97,8 +95,8 @@ public class HoodieParquetDataBlock extends HoodieDataBlock {
|
||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
||||
new AvroSchemaConverter().convert(writerSchema), writerSchema, Option.empty());
|
||||
|
||||
HoodieAvroParquetConfig avroParquetConfig =
|
||||
new HoodieAvroParquetConfig(
|
||||
HoodieParquetConfig<HoodieAvroWriteSupport> avroParquetConfig =
|
||||
new HoodieParquetConfig<>(
|
||||
writeSupport,
|
||||
compressionCodecName.get(),
|
||||
ParquetWriter.DEFAULT_BLOCK_SIZE,
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
|
||||
/**
|
||||
* ParquetConfig for writing avro records in Parquet files.
|
||||
*/
|
||||
public class HoodieAvroParquetConfig extends HoodieBaseParquetConfig<HoodieAvroWriteSupport> {
|
||||
|
||||
public HoodieAvroParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName,
|
||||
int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf,
|
||||
double compressionRatio) {
|
||||
super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio);
|
||||
}
|
||||
|
||||
public HoodieAvroParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName,
|
||||
int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf,
|
||||
double compressionRatio, boolean directoryEnabled) {
|
||||
super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, directoryEnabled);
|
||||
}
|
||||
}
|
||||
@@ -25,7 +25,7 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
* Base ParquetConfig to hold config params for writing to Parquet.
|
||||
* @param <T>
|
||||
*/
|
||||
public class HoodieBaseParquetConfig<T> {
|
||||
public class HoodieParquetConfig<T> {
|
||||
private final T writeSupport;
|
||||
private final CompressionCodecName compressionCodecName;
|
||||
private final int blockSize;
|
||||
@@ -35,13 +35,13 @@ public class HoodieBaseParquetConfig<T> {
|
||||
private final double compressionRatio;
|
||||
private final boolean dictionaryEnabled;
|
||||
|
||||
public HoodieBaseParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize,
|
||||
int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) {
|
||||
public HoodieParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize,
|
||||
int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) {
|
||||
this(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, false);
|
||||
}
|
||||
|
||||
public HoodieBaseParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize,
|
||||
int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio, boolean dictionaryEnabled) {
|
||||
public HoodieParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize,
|
||||
int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio, boolean dictionaryEnabled) {
|
||||
this.writeSupport = writeSupport;
|
||||
this.compressionCodecName = compressionCodecName;
|
||||
this.blockSize = blockSize;
|
||||
@@ -38,7 +38,7 @@ public class HoodieParquetStreamWriter<R extends IndexedRecord> implements AutoC
|
||||
private final HoodieAvroWriteSupport writeSupport;
|
||||
|
||||
public HoodieParquetStreamWriter(FSDataOutputStream outputStream,
|
||||
HoodieAvroParquetConfig parquetConfig) throws IOException {
|
||||
HoodieParquetConfig<HoodieAvroWriteSupport> parquetConfig) throws IOException {
|
||||
this.writeSupport = parquetConfig.getWriteSupport();
|
||||
this.writer = new Builder<R>(new OutputStreamBackedOutputFile(outputStream), writeSupport)
|
||||
.withWriteMode(ParquetFileWriter.Mode.CREATE)
|
||||
|
||||
Reference in New Issue
Block a user