
Adding config for parquet compression ratio

Author:    Nishith Agarwal
Date:      2018-03-23 21:50:11 -07:00
Committer: vinoth chandar
Parent:    48643795b8
Commit:    1b756db221
7 changed files with 28 additions and 7 deletions

HoodieStorageConfig.java

@@ -40,6 +40,9 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
// used to size data blocks in log file
public static final String LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = "hoodie.logfile.data.block.max.size";
public static final String DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES = String.valueOf(256*1024*1024); // 256 MB
public static final String PARQUET_COMPRESSION_RATIO = "hoodie.parquet.compression.ratio";
// Default compression ratio for parquet
public static final String DEFAULT_STREAM_COMPRESSION_RATIO = String.valueOf(0.1);
private HoodieStorageConfig(Properties props) {
super(props);
@@ -93,6 +96,11 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
return this;
}
public Builder parquetCompressionRatio(double parquetCompressionRatio) {
props.setProperty(PARQUET_COMPRESSION_RATIO, String.valueOf(parquetCompressionRatio));
return this;
}
public HoodieStorageConfig build() {
HoodieStorageConfig config = new HoodieStorageConfig(props);
setDefaultOnCondition(props, !props.containsKey(PARQUET_FILE_MAX_BYTES),
@@ -105,6 +113,8 @@ public class HoodieStorageConfig extends DefaultHoodieConfig {
LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES, DEFAULT_LOGFILE_DATA_BLOCK_SIZE_MAX_BYTES);
setDefaultOnCondition(props, !props.containsKey(LOGFILE_SIZE_MAX_BYTES),
LOGFILE_SIZE_MAX_BYTES, DEFAULT_LOGFILE_SIZE_MAX_BYTES);
setDefaultOnCondition(props, !props.containsKey(PARQUET_COMPRESSION_RATIO),
PARQUET_COMPRESSION_RATIO, DEFAULT_STREAM_COMPRESSION_RATIO);
return config;
}
}
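
The new knob is exposed through HoodieStorageConfig.Builder#parquetCompressionRatio and keyed by "hoodie.parquet.compression.ratio", falling back to 0.1 when nothing is set. A minimal sketch of wiring it up from a writer job, assuming the newBuilder()/withStorageConfig(...) entry points the rest of the config API exposes and a hypothetical basePath variable:

// Sketch only: package names assume the pre-Apache com.uber.hoodie layout.
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;

HoodieStorageConfig storageConfig = HoodieStorageConfig.newBuilder()
    .parquetCompressionRatio(0.15)   // pad the uncompressed byte budget by 15% instead of the default 10%
    .build();

HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
    .withPath(basePath)              // hypothetical base path
    .withStorageConfig(storageConfig)
    .build();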

HoodieWriteConfig.java

@@ -309,6 +309,10 @@ public class HoodieWriteConfig extends DefaultHoodieConfig {
return Integer.parseInt(props.getProperty(HoodieStorageConfig.LOGFILE_SIZE_MAX_BYTES));
}
public double getParquetCompressionRatio() {
return Double.valueOf(props.getProperty(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO));
}
/**
* metrics properties

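Since the ratio is stored as a plain property under "hoodie.parquet.compression.ratio", it can also be supplied through a raw Properties object and read back with the new getter. A small sketch, assuming the fromProperties(...) hook the other Hoodie config builders expose and a hypothetical basePath variable:

// Sketch only: fromProperties(...) is assumed, not part of this diff.
import java.util.Properties;

Properties props = new Properties();
props.setProperty("hoodie.parquet.compression.ratio", "0.15");

HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
    .withPath(basePath)
    .fromProperties(props)
    .build();

double ratio = writeConfig.getParquetCompressionRatio();   // 0.15
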
HoodieParquetConfig.java

@@ -28,16 +28,18 @@ public class HoodieParquetConfig {
private int pageSize;
private long maxFileSize;
private Configuration hadoopConf;
private double compressionRatio;
public HoodieParquetConfig(HoodieAvroWriteSupport writeSupport,
CompressionCodecName compressionCodecName, int blockSize, int pageSize, long maxFileSize,
Configuration hadoopConf) {
Configuration hadoopConf, double compressionRatio) {
this.writeSupport = writeSupport;
this.compressionCodecName = compressionCodecName;
this.blockSize = blockSize;
this.pageSize = pageSize;
this.maxFileSize = maxFileSize;
this.hadoopConf = hadoopConf;
this.compressionRatio = compressionRatio;
}
public HoodieAvroWriteSupport getWriteSupport() {
@@ -63,4 +65,8 @@ public class HoodieParquetConfig {
public Configuration getHadoopConf() {
return hadoopConf;
}
public double getCompressionRatio() {
return compressionRatio;
}
}
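
With the extra constructor argument, a HoodieParquetConfig now carries the compression ratio alongside the codec, block/page sizes, and target file size. A sketch of constructing one directly (for example in a test), where writeSupport is an assumed, previously built HoodieAvroWriteSupport and the sizes are illustrative rather than defaults:

// Sketch only: values below are illustrative, not defaults from this diff.
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

HoodieParquetConfig parquetConfig = new HoodieParquetConfig(
    writeSupport,
    CompressionCodecName.GZIP,
    ParquetWriter.DEFAULT_BLOCK_SIZE,
    ParquetWriter.DEFAULT_PAGE_SIZE,
    120 * 1024 * 1024,   // 120 MB target parquet file size (PARQUET_FILE_MAX_BYTES)
    new Configuration(),
    0.1);                // the ratio now travels with the rest of the parquet settings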

HoodieParquetWriter.java

@@ -39,10 +39,8 @@ import org.apache.spark.TaskContext;
public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
private static double STREAM_COMPRESSION_RATIO = 0.1;
private static AtomicLong recordIndex = new AtomicLong(1);
private final Path file;
private final HoodieWrapperFileSystem fs;
private final long maxFileSize;
@@ -75,7 +73,7 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
// We cannot accurately measure the snappy compressed output file size. We are choosing a conservative 10%
// TODO - compute this compression ratio dynamically by looking at the bytes written to the stream and the actual file size reported by HDFS
this.maxFileSize = parquetConfig.getMaxFileSize() + Math
.round(parquetConfig.getMaxFileSize() * STREAM_COMPRESSION_RATIO);
.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
this.writeSupport = parquetConfig.getWriteSupport();
this.commitTime = commitTime;
this.schema = schema;
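
Only the constant changes here; the sizing rule stays the same: the writer budgets maxFileSize plus round(maxFileSize * ratio) uncompressed bytes on the stream before it considers the parquet file full. A worked example with an assumed 120 MB target:

// Sketch of the sizing math above, with illustrative numbers.
long targetFileSize = 120L * 1024 * 1024;      // from PARQUET_FILE_MAX_BYTES, 120 MB here
double compressionRatio = 0.1;                 // hoodie.parquet.compression.ratio
long maxFileSize = targetFileSize + Math.round(targetFileSize * compressionRatio);
// maxFileSize == 138412032 bytes (132 MB): raising the ratio raises this uncompressed budget,
// lowering it makes the writer roll to a new file sooner.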

HoodieStorageWriterFactory.java

@@ -50,7 +50,8 @@ public class HoodieStorageWriterFactory {
HoodieParquetConfig parquetConfig =
new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
config.getParquetBlockSize(), config.getParquetPageSize(),
config.getParquetMaxFileSize(), hoodieTable.getHadoopConf());
config.getParquetMaxFileSize(), hoodieTable.getHadoopConf(),
config.getParquetCompressionRatio());
return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema);
}