1
0

[HUDI-4038] Avoid calling getDataSize after every record written (#5497)

- getDataSize has non-trivial overhead in the current ParquetWriter implementation, since it requires traversing the already-composed Column Groups in memory. Instead, we can sample the calls to getDataSize to amortize their cost.

Co-authored-by: sivabalan <n.siva.b@gmail.com>
This commit is contained in:
Alexey Kudinkin
2022-05-11 05:08:31 -07:00
committed by GitHub
parent 4258a71517
commit 4a8589f222
12 changed files with 124 additions and 97 deletions

View File

@@ -49,7 +49,7 @@ public class TestHoodieFileWriterFactory extends HoodieClientTestBase {
SparkTaskContextSupplier supplier = new SparkTaskContextSupplier();
HoodieFileWriter<IndexedRecord> parquetWriter = HoodieFileWriterFactory.getFileWriter(instantTime,
parquetPath, table, cfg, HoodieTestDataGenerator.AVRO_SCHEMA, supplier);
assertTrue(parquetWriter instanceof HoodieParquetWriter);
assertTrue(parquetWriter instanceof HoodieAvroParquetWriter);
// hfile format.
final Path hfilePath = new Path(basePath + "/partition/path/f1_1-0-1_000.hfile");

View File

@@ -419,7 +419,7 @@ public class TestCopyOnWriteActionExecutor extends HoodieClientTestBase {
List<HoodieRecord> records = new ArrayList<>();
// Approx 1150 records are written for block size of 64KB
for (int i = 0; i < 2000; i++) {
for (int i = 0; i < 2050; i++) {
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
+ "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
@@ -441,7 +441,8 @@ public class TestCopyOnWriteActionExecutor extends HoodieClientTestBase {
counts++;
}
}
assertEquals(5, counts, "If the number of records are more than 1150, then there should be a new file");
// canWrite is checked only once every 1000 records, so we expect 2 files with 1000 records each and a 3rd file with 50 records.
assertEquals(3, counts, "If the number of records are more than 1150, then there should be a new file");
}
@Test