
[HUDI-1911] Reuse the partition path and file group id for flink write data buffer (#2961)

Reuse the partition path and file group id across buffered records to reduce the memory footprint (see the sketch below).
Danny Chan
2021-05-18 17:47:22 +08:00
committed by GitHub
parent 46a2399a45
commit 7d2971d4e2
5 changed files with 89 additions and 24 deletions
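
The gist of the change: before it, every HoodieRecord sitting in the Flink write buffer carried its own copy of the partition path and file group id strings; after it, those shared fields live once per data bucket and only the per-record fields are buffered. A minimal sketch of that buffering idea in plain Java (DataItem and DataBucket are illustrative names here, not necessarily Hudi's actual class layout in StreamWriteFunction):

import java.util.ArrayList;
import java.util.List;

// Sketch only: per-record fields are buffered in a slim item ...
final class DataItem {
  final String recordKey; // per-record key
  final Object payload;   // per-record data (RowData in the Flink sink)

  DataItem(String recordKey, Object payload) {
    this.recordKey = recordKey;
    this.payload = payload;
  }
}

// ... while the fields shared by the whole bucket are stored exactly once.
final class DataBucket {
  final String partitionPath; // shared by every record in the bucket
  final String fileId;        // shared file group id
  final List<DataItem> items = new ArrayList<>();

  DataBucket(String partitionPath, String fileId) {
    this.partitionPath = partitionPath;
    this.fileId = fileId;
  }

  // Buffering a record copies neither the partition path nor the file id,
  // which is what shrinks the per-record footprint (424 -> 208 bytes per
  // the updated test comments below).
  void add(String recordKey, Object payload) {
    items.add(new DataItem(recordKey, payload));
  }
}

At flush time the full records can be rebuilt by joining each item with its bucket's partitionPath and fileId, so nothing is lost by not storing them per record.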

TestWriteCopyOnWrite.java

@@ -378,21 +378,21 @@ public class TestWriteCopyOnWrite {
   @Test
   public void testInsertWithMiniBatches() throws Exception {
     // reset the config option
-    conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.001); // 1Kb batch size
+    conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.0006); // 630 bytes batch size
     funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf);
     // open the function and ingest data
     funcWrapper.openFunction();
-    // Each record is 424 bytes. so 3 records expect to trigger a mini-batch write
+    // Each record is 208 bytes. so 4 records expect to trigger a mini-batch write
     for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
       funcWrapper.invoke(rowData);
     }
     Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
     assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
-    assertThat("2 records expect to flush out as a mini-batch",
+    assertThat("3 records expect to flush out as a mini-batch",
         dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
-        is(3));
+        is(2));
     // this triggers the data write and event send
     funcWrapper.checkpointFunction(1);
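
Why the numbers line up (taking the byte sizes in the code comments at face value): the new batch size is 0.0006 MB, roughly 630 bytes, and each buffered record now costs 208 bytes, so three records (624 bytes) fit under the threshold and the fourth arrival trips it; the three buffered records flush as a mini-batch and the last two of the five DATA_SET_INSERT_DUPLICATES rows stay in the buffer, hence is(2). With the old 424-byte records and the roughly 1 KB batch size the same logic flushed two and left three buffered, which is why the assertion message and the expected count swap between the old and new lines.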
@@ -439,12 +439,12 @@ public class TestWriteCopyOnWrite {
   @Test
   public void testInsertWithSmallBufferSize() throws Exception {
     // reset the config option
-    conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.001); // 1Kb buffer size
+    conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.0006); // 630 bytes buffer size
     funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf);
     // open the function and ingest data
     funcWrapper.openFunction();
-    // each record is 424 bytes. so 3 records expect to trigger buffer flush:
+    // each record is 208 bytes. so 4 records expect to trigger buffer flush:
     // flush the max size bucket once at a time.
     for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
       funcWrapper.invoke(rowData);
@@ -452,9 +452,9 @@ public class TestWriteCopyOnWrite {
     Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
     assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
-    assertThat("2 records expect to flush out as a mini-batch",
+    assertThat("3 records expect to flush out as a mini-batch",
         dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
-        is(3));
+        is(2));
     // this triggers the data write and event send
     funcWrapper.checkpointFunction(1);
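
The small-buffer test exercises the same arithmetic through WRITE_TASK_MAX_SIZE: assuming the write task reserves 200 MB and treats the remainder as the buffer (which is what the "630 bytes buffer size" comment implies for the value 200.0006), the flush pattern is identical: three records flushed, two left buffered.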
@@ -500,8 +500,9 @@ public class TestWriteCopyOnWrite {
   Map<String, String> getMiniBatchExpected() {
     Map<String, String> expected = new HashMap<>();
-    // the last 3 lines are merged
+    // the last 2 lines are merged
     expected.put("par1", "["
+        + "id1,par1,id1,Danny,23,1,par1, "
         + "id1,par1,id1,Danny,23,1,par1, "
         + "id1,par1,id1,Danny,23,1,par1]");
     return expected;

HoodieDataSourceITCase.java

@@ -401,6 +401,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
     Map<String, String> options = new HashMap<>();
     options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
     options.put(FlinkOptions.INDEX_GLOBAL_ENABLED.key(), "true");
+    options.put(FlinkOptions.INSERT_DROP_DUPS.key(), "true");
     String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
     streamTableEnv.executeSql(hoodieTableDDL);
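
For context on the one added line: FlinkOptions.INSERT_DROP_DUPS presumably mirrors the datasource writer's drop-duplicates-on-insert switch, so together with the global index the test ingests duplicate keys and expects them to be deduplicated on insert; the assertions that rely on it fall outside this excerpt.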