1
0

[HUDI-2548] Flink streaming reader misses the rolling over file handles (#3787)

This commit is contained in:
Danny Chan
2021-10-14 10:36:18 +08:00
committed by GitHub
parent cff384d23f
commit abf3e3fe71
16 changed files with 225 additions and 126 deletions

View File

@@ -420,9 +420,9 @@ public class TestWriteCopyOnWrite {
Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
assertThat("3 records expect to flush out as a mini-batch",
assertThat("2 records expect to flush out as a mini-batch",
dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
is(3));
is(2));
// this triggers the data write and event send
funcWrapper.checkpointFunction(1);
@@ -483,9 +483,9 @@ public class TestWriteCopyOnWrite {
Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
assertThat("3 records expect to flush out as a mini-batch",
assertThat("2 records expect to flush out as a mini-batch",
dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
is(3));
is(2));
// this triggers the data write and event send
funcWrapper.checkpointFunction(1);
@@ -615,9 +615,9 @@ public class TestWriteCopyOnWrite {
Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
assertThat("3 records expect to flush out as a mini-batch",
assertThat("2 records expect to flush out as a mini-batch",
dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
is(3));
is(2));
// this triggers the data write and event send
funcWrapper.checkpointFunction(1);
@@ -665,6 +665,7 @@ public class TestWriteCopyOnWrite {
Map<String, String> expected = new HashMap<>();
// the last 2 lines are merged
expected.put("par1", "["
+ "id1,par1,id1,Danny,23,1,par1, "
+ "id1,par1,id1,Danny,23,1,par1, "
+ "id1,par1,id1,Danny,23,1,par1" + "]");
return expected;

View File

@@ -79,8 +79,11 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
streamTableEnv = TableEnvironmentImpl.create(settings);
streamTableEnv.getConfig().getConfiguration()
.setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1);
streamTableEnv.getConfig().getConfiguration()
.setString("execution.checkpointing.interval", "2s");
Configuration execConf = streamTableEnv.getConfig().getConfiguration();
execConf.setString("execution.checkpointing.interval", "2s");
// configure not to retry after failure
execConf.setString("restart-strategy", "fixed-delay");
execConf.setString("restart-strategy.fixed-delay.attempts", "0");
settings = EnvironmentSettings.newInstance().inBatchMode().build();
batchTableEnv = TableEnvironmentImpl.create(settings);
@@ -529,12 +532,37 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
}
@ParameterizedTest
@EnumSource(value = ExecMode.class)
void testUpsertWithMiniBatches(ExecMode execMode) {
@EnumSource(value = HoodieTableType.class)
void testStreamWriteAndReadWithMiniBatches(HoodieTableType tableType) throws Exception {
// create filesystem table named source
String createSource = TestConfigurations.getFileSourceDDL("source", 4);
streamTableEnv.executeSql(createSource);
String hoodieTableDDL = sql("t1")
.option(FlinkOptions.PATH, tempFile.getAbsolutePath())
.option(FlinkOptions.READ_AS_STREAMING, true)
.option(FlinkOptions.TABLE_TYPE, tableType)
.option(FlinkOptions.READ_START_COMMIT, "earliest")
.option(FlinkOptions.WRITE_BATCH_SIZE, 0.00001)
.noPartition()
.end();
streamTableEnv.executeSql(hoodieTableDDL);
String insertInto = "insert into t1 select * from source";
execInsertSql(streamTableEnv, insertInto);
// reading from the earliest commit instant.
List<Row> rows = execSelectSql(streamTableEnv, "select * from t1", 20);
assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT);
}
@ParameterizedTest
@MethodSource("executionModeAndTableTypeParams")
void testBatchUpsertWithMiniBatches(ExecMode execMode, HoodieTableType tableType) {
TableEnvironment tableEnv = execMode == ExecMode.BATCH ? batchTableEnv : streamTableEnv;
String hoodieTableDDL = sql("t1")
.option(FlinkOptions.PATH, tempFile.getAbsolutePath())
.option(FlinkOptions.WRITE_BATCH_SIZE, "0.001")
.option(FlinkOptions.TABLE_TYPE, tableType)
.end();
tableEnv.executeSql(hoodieTableDDL);
@@ -958,7 +986,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
try {
tableResult.getJobClient().get().getJobExecutionResult().get();
} catch (InterruptedException | ExecutionException ex) {
throw new RuntimeException(ex);
// ignored
}
}