[HUDI-2385] Make parquet dictionary encoding configurable (#3578)
Co-authored-by: leesf <leesf@apache.org>
This commit is contained in:
@@ -110,6 +110,11 @@ public class HoodieStorageConfig extends HoodieConfig {
|
||||
.defaultValue("gzip")
|
||||
.withDocumentation("Compression Codec for parquet files");
|
||||
|
||||
/**
 * Boolean config property registered under the key {@code hoodie.parquet.dictionary.enabled},
 * with a default value of {@code true}.
 */
public static final ConfigProperty<Boolean> PARQUET_DICTIONARY_ENABLED = ConfigProperty
|
||||
.key("hoodie.parquet.dictionary.enabled")
|
||||
.defaultValue(true)
|
||||
.withDocumentation("Whether to use dictionary encoding");
|
||||
|
||||
public static final ConfigProperty<String> HFILE_COMPRESSION_ALGORITHM_NAME = ConfigProperty
|
||||
.key("hoodie.hfile.compression.algorithm")
|
||||
.defaultValue("GZ")
|
||||
|
||||
@@ -1403,10 +1403,6 @@ public class HoodieWriteConfig extends HoodieConfig {
|
||||
return getInt(HoodieStorageConfig.LOGFILE_DATA_BLOCK_MAX_SIZE);
|
||||
}
|
||||
|
||||
/**
 * Returns the value stored for {@code HoodieStorageConfig.LOGFILE_MAX_SIZE}, read as a {@code long}.
 */
public long getLogFileMaxSize() {
|
||||
return getLong(HoodieStorageConfig.LOGFILE_MAX_SIZE);
|
||||
}
|
||||
|
||||
/**
 * Returns the value stored for {@code HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION},
 * read as a {@code double}.
 */
public double getParquetCompressionRatio() {
|
||||
return getDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION);
|
||||
}
|
||||
@@ -1415,6 +1411,14 @@ public class HoodieWriteConfig extends HoodieConfig {
|
||||
return CompressionCodecName.fromConf(getString(HoodieStorageConfig.PARQUET_COMPRESSION_CODEC_NAME));
|
||||
}
|
||||
|
||||
/**
 * Returns the value stored for {@code HoodieStorageConfig.PARQUET_DICTIONARY_ENABLED},
 * read as a {@code boolean}.
 */
public boolean parquetDictionaryEnabled() {
|
||||
return getBoolean(HoodieStorageConfig.PARQUET_DICTIONARY_ENABLED);
|
||||
}
|
||||
|
||||
/**
 * Returns the value stored for {@code HoodieStorageConfig.LOGFILE_MAX_SIZE}, read as a {@code long}.
 */
public long getLogFileMaxSize() {
|
||||
return getLong(HoodieStorageConfig.LOGFILE_MAX_SIZE);
|
||||
}
|
||||
|
||||
/**
 * Returns the value stored for
 * {@code HoodieStorageConfig.LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION}, read as a {@code double}.
 */
public double getLogFileToParquetCompressionRatio() {
|
||||
return getDouble(HoodieStorageConfig.LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION);
|
||||
}
|
||||
|
||||
@@ -33,4 +33,10 @@ public class HoodieAvroParquetConfig extends HoodieBaseParquetConfig<HoodieAvroW
|
||||
double compressionRatio) {
|
||||
super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio);
|
||||
}
|
||||
|
||||
/**
 * Creates an Avro parquet config with an explicit dictionary-encoding choice.
 *
 * <p>All arguments are forwarded unchanged to the {@code HoodieBaseParquetConfig} constructor.
 *
 * @param writeSupport         write support handed to the parquet writer
 * @param compressionCodecName compression codec for the written file
 * @param blockSize            parquet block size
 * @param pageSize             parquet page size
 * @param maxFileSize          maximum file size
 * @param hadoopConf           Hadoop configuration
 * @param compressionRatio     compression ratio
 * @param dictionaryEnabled    whether dictionary encoding should be used by the writer
 */
public HoodieAvroParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName,
    int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf,
    double compressionRatio, boolean dictionaryEnabled) {
  super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, dictionaryEnabled);
}
|
||||
}
|
||||
|
||||
@@ -27,15 +27,21 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
*/
|
||||
public class HoodieBaseParquetConfig<T> {
|
||||
private final T writeSupport;
|
||||
private CompressionCodecName compressionCodecName;
|
||||
private int blockSize;
|
||||
private int pageSize;
|
||||
private long maxFileSize;
|
||||
private Configuration hadoopConf;
|
||||
private double compressionRatio;
|
||||
private final CompressionCodecName compressionCodecName;
|
||||
private final int blockSize;
|
||||
private final int pageSize;
|
||||
private final long maxFileSize;
|
||||
private final Configuration hadoopConf;
|
||||
private final double compressionRatio;
|
||||
private final boolean dictionaryEnabled;
|
||||
|
||||
/**
 * Creates a config without an explicit dictionary-encoding choice.
 *
 * <p>Dictionary encoding is enabled for callers of this constructor. This keeps the behavior they
 * had when the writer was given parquet's {@code DEFAULT_IS_DICTIONARY_ENABLED}, and it agrees with
 * the {@code true} default of {@code hoodie.parquet.dictionary.enabled}. Callers that want to turn
 * it off must use the constructor that takes the {@code dictionaryEnabled} flag.
 *
 * @param writeSupport         write support handed to the parquet writer
 * @param compressionCodecName compression codec for the written file
 * @param blockSize            parquet block size
 * @param pageSize             parquet page size
 * @param maxFileSize          maximum file size
 * @param hadoopConf           Hadoop configuration
 * @param compressionRatio     compression ratio
 */
public HoodieBaseParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize,
    int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) {
  // Pass `true`, not `false`: a silent switch to "disabled" would change the encoding of files
  // written through this pre-existing constructor.
  this(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, true);
}
|
||||
|
||||
public HoodieBaseParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize,
|
||||
int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio, boolean dictionaryEnabled) {
|
||||
this.writeSupport = writeSupport;
|
||||
this.compressionCodecName = compressionCodecName;
|
||||
this.blockSize = blockSize;
|
||||
@@ -43,6 +49,7 @@ public class HoodieBaseParquetConfig<T> {
|
||||
this.maxFileSize = maxFileSize;
|
||||
this.hadoopConf = hadoopConf;
|
||||
this.compressionRatio = compressionRatio;
|
||||
this.dictionaryEnabled = dictionaryEnabled;
|
||||
}
|
||||
|
||||
public CompressionCodecName getCompressionCodecName() {
|
||||
@@ -72,4 +79,8 @@ public class HoodieBaseParquetConfig<T> {
|
||||
/**
 * Returns the write support instance held in the {@code writeSupport} field.
 */
public T getWriteSupport() {
|
||||
return writeSupport;
|
||||
}
|
||||
|
||||
/**
 * Returns the {@code dictionaryEnabled} flag this config was constructed with.
 */
public boolean dictionaryEnabled() {
|
||||
return dictionaryEnabled;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,7 +71,7 @@ public class HoodieFileWriterFactory {
|
||||
|
||||
HoodieAvroParquetConfig parquetConfig = new HoodieAvroParquetConfig(writeSupport, config.getParquetCompressionCodec(),
|
||||
config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
|
||||
hoodieTable.getHadoopConf(), config.getParquetCompressionRatio());
|
||||
hoodieTable.getHadoopConf(), config.getParquetCompressionRatio(), config.parquetDictionaryEnabled());
|
||||
|
||||
return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier, populateMetaFields);
|
||||
}
|
||||
|
||||
@@ -56,7 +56,7 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
|
||||
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
||||
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
|
||||
parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
|
||||
DEFAULT_IS_DICTIONARY_ENABLED, DEFAULT_IS_VALIDATING_ENABLED,
|
||||
parquetConfig.dictionaryEnabled(), DEFAULT_IS_VALIDATING_ENABLED,
|
||||
DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
|
||||
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
|
||||
this.fs =
|
||||
|
||||
Reference in New Issue
Block a user