1
0

[HUDI-4038] Avoid calling getDataSize after every record written (#5497)

- getDataSize has non-trivial overhead in the current ParquetWriter implementation, requiring traversal of already-composed Column Groups in memory. Instead, we can sample these calls to getDataSize to amortize its cost.

Co-authored-by: sivabalan <n.siva.b@gmail.com>
This commit is contained in:
Alexey Kudinkin
2022-05-11 05:08:31 -07:00
committed by GitHub
parent 4258a71517
commit 4a8589f222
12 changed files with 124 additions and 97 deletions

View File

@@ -41,7 +41,7 @@ import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.io.storage.HoodieAvroParquetConfig;
import org.apache.hudi.io.storage.HoodieOrcConfig;
import org.apache.hudi.io.storage.HoodieOrcWriter;
import org.apache.hudi.io.storage.HoodieParquetWriter;
import org.apache.hudi.io.storage.HoodieAvroParquetWriter;
import org.apache.hudi.metadata.HoodieTableMetadataWriter;
import org.apache.avro.Schema;
@@ -113,10 +113,9 @@ public class HoodieWriteableTestTable extends HoodieMetadataTestTable {
HoodieAvroParquetConfig config = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP,
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
new Configuration(), Double.parseDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue()));
try (HoodieParquetWriter writer = new HoodieParquetWriter(
currentInstantTime,
new Path(Paths.get(basePath, partition, fileName).toString()),
config, schema, contextSupplier, populateMetaFields)) {
try (HoodieAvroParquetWriter writer = new HoodieAvroParquetWriter<>(
new Path(Paths.get(basePath, partition, fileName).toString()), config, currentInstantTime,
contextSupplier, populateMetaFields)) {
int seqId = 1;
for (HoodieRecord record : records) {
GenericRecord avroRecord = (GenericRecord) ((HoodieRecordPayload) record.getData()).getInsertValue(schema).get();