
[HUDI-1844] Add option to flush when total buckets memory exceeds the threshold (#2877)

The current code flushes based on per-bucket memory usage, but the
buckets may still take too much memory in total when bootstrapping
from history data.

When the total-memory threshold is hit, flush out the half of the
buckets with the bigger buffer sizes.
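
For illustration, a minimal sketch of the strategy described above: once the total buffer size crosses the threshold, flush the larger half of the buckets first, which frees the most memory per flush while smaller buckets keep batching. The `DataBucket` class, its field names, and the `flusher` callback are hypothetical stand-ins for this sketch, not the actual StreamWriteFunction internals.

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;
    import java.util.function.Consumer;

    // Hypothetical stand-in for a write bucket; the real type lives in
    // Hudi's StreamWriteFunction and uses different names.
    class DataBucket {
      final String partition;
      long bufferSizeBytes;

      DataBucket(String partition, long bufferSizeBytes) {
        this.partition = partition;
        this.bufferSizeBytes = bufferSizeBytes;
      }
    }

    class TotalSizeGuard {
      private final long maxTotalBytes;           // total-memory threshold
      private final Consumer<DataBucket> flusher; // writes a bucket out as a mini-batch

      TotalSizeGuard(long maxTotalBytes, Consumer<DataBucket> flusher) {
        this.maxTotalBytes = maxTotalBytes;
        this.flusher = flusher;
      }

      /** Flushes the bigger half of the buckets when the total buffer size exceeds the threshold. */
      void checkAndFlush(List<DataBucket> buckets) {
        long total = buckets.stream().mapToLong(b -> b.bufferSizeBytes).sum();
        if (total <= maxTotalBytes) {
          return; // still under the threshold, keep buffering
        }
        // Sort by buffer size, biggest first, then flush the top half.
        List<DataBucket> sorted = new ArrayList<>(buckets);
        sorted.sort(Comparator.comparingLong((DataBucket b) -> b.bufferSizeBytes).reversed());
        int half = (sorted.size() + 1) / 2;
        for (int i = 0; i < half; i++) {
          DataBucket bucket = sorted.get(i);
          flusher.accept(bucket);
          bucket.bufferSizeBytes = 0; // buffer is emptied after the flush
        }
      }
    }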
Author: Danny Chan
Date: 2021-04-25 23:06:53 +08:00
Committed by: GitHub
Parent: a5789c4067
Commit: 1b27259b53

4 changed files with 147 additions and 17 deletions


@@ -378,7 +378,7 @@ public class TestWriteCopyOnWrite {
   @Test
   public void testInsertWithMiniBatches() throws Exception {
     // reset the config option
-    conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.001); // 1Kb batch size
+    conf.setDouble(FlinkOptions.WRITE_BUCKET_SIZE, 0.001); // 1Kb bucket size
     funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf);

     // open the function and ingest data
// open the function and ingest data
@@ -436,6 +436,68 @@ public class TestWriteCopyOnWrite {
     checkWrittenData(tempFile, expected, 1);
   }

+  @Test
+  public void testInsertWithSmallBuffer() throws Exception {
+    // reset the config option
+    conf.setDouble(FlinkOptions.WRITE_BUFFER_SIZE, 0.001); // 1Kb buffer size
+    funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf);
+
+    // open the function and ingest data
+    funcWrapper.openFunction();
+    // each record is 424 bytes, so 3 records are expected to trigger a buffer flush:
+    // flush half of the buckets once at a time.
+    for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
+      funcWrapper.invoke(rowData);
+    }
+
+    Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
+    assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
+    assertThat("4 records expected to flush out as mini-batches",
+        dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
+        is(1));
+
+    // this triggers the data write and event send
+    funcWrapper.checkpointFunction(1);
+    dataBuffer = funcWrapper.getDataBuffer();
+    assertThat("All data should be flushed out", dataBuffer.size(), is(0));
+
+    for (int i = 0; i < 3; i++) {
+      final OperatorEvent event = funcWrapper.getNextEvent(); // handle the events one by one
+      assertThat("The operator expect to send an event", event, instanceOf(BatchWriteSuccessEvent.class));
+      funcWrapper.getCoordinator().handleEventFromOperator(0, event);
+    }
+    assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event");
+
+    String instant = funcWrapper.getWriteClient()
+        .getLastPendingInstant(getTableType());
+
+    funcWrapper.checkpointComplete(1);
+
+    Map<String, String> expected = getMiniBatchExpected();
+    checkWrittenData(tempFile, expected, 1);
+
+    // a new instant has already started
+    checkInflightInstant(funcWrapper.getWriteClient());
+    checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant);
+
+    // insert duplicates again
+    for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
+      funcWrapper.invoke(rowData);
+    }
+
+    funcWrapper.checkpointFunction(2);
+
+    for (int i = 0; i < 3; i++) {
+      final OperatorEvent event = funcWrapper.getNextEvent();
+      funcWrapper.getCoordinator().handleEventFromOperator(0, event);
+    }
+    funcWrapper.checkpointComplete(2);
+
+    // same as the original base file content
+    checkWrittenData(tempFile, expected, 1);
+  }
+
   Map<String, String> getMiniBatchExpected() {
     Map<String, String> expected = new HashMap<>();
     expected.put("par1", "[id1,par1,id1,Danny,23,1,par1, "


@@ -330,7 +330,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
     TableEnvironment tableEnv = execMode == ExecMode.BATCH ? batchTableEnv : streamTableEnv;
     Map<String, String> options = new HashMap<>();
     options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
-    options.put(FlinkOptions.WRITE_BATCH_SIZE.key(), "0.001");
+    options.put(FlinkOptions.WRITE_BUCKET_SIZE.key(), "0.001");
     String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
     tableEnv.executeSql(hoodieTableDDL);