1
0

[HUDI-4038] Avoid calling getDataSize after every record written (#5497)

- getDataSize has non-trivial overhead in the current ParquetWriter implementation, requiring traversal of already-composed Column Groups in memory. Instead, we can sample these calls to getDataSize to amortize its cost.

Co-authored-by: sivabalan <n.siva.b@gmail.com>
This commit is contained in:
Alexey Kudinkin
2022-05-11 05:08:31 -07:00
committed by GitHub
parent 4258a71517
commit 4a8589f222
12 changed files with 124 additions and 97 deletions

View File

@@ -41,7 +41,7 @@ import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.io.storage.HoodieAvroParquetConfig;
import org.apache.hudi.io.storage.HoodieOrcConfig;
import org.apache.hudi.io.storage.HoodieOrcWriter;
import org.apache.hudi.io.storage.HoodieParquetWriter;
import org.apache.hudi.io.storage.HoodieAvroParquetWriter;
import org.apache.hudi.metadata.HoodieTableMetadataWriter;
import org.apache.avro.Schema;
@@ -113,10 +113,9 @@ public class HoodieWriteableTestTable extends HoodieMetadataTestTable {
HoodieAvroParquetConfig config = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP,
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
new Configuration(), Double.parseDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue()));
try (HoodieParquetWriter writer = new HoodieParquetWriter(
currentInstantTime,
new Path(Paths.get(basePath, partition, fileName).toString()),
config, schema, contextSupplier, populateMetaFields)) {
try (HoodieAvroParquetWriter writer = new HoodieAvroParquetWriter<>(
new Path(Paths.get(basePath, partition, fileName).toString()), config, currentInstantTime,
contextSupplier, populateMetaFields)) {
int seqId = 1;
for (HoodieRecord record : records) {
GenericRecord avroRecord = (GenericRecord) ((HoodieRecordPayload) record.getData()).getInsertValue(schema).get();