[HUDI-2548] Flink streaming reader misses the rolling over file handles (#3787)

2021-10-14 10:36:18 +08:00
parent cff384d23f
commit abf3e3fe71
16 changed files with 225 additions and 126 deletions
--- a/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java
+++ b/hudi-flink/src/test/java/org/apache/hudi/sink/TestWriteCopyOnWrite.java
@@ -420,9 +420,9 @@ public class TestWriteCopyOnWrite {

    Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
    assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
-    assertThat("3 records expect to flush out as a mini-batch",
+    assertThat("2 records expect to flush out as a mini-batch",
        dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
-        is(3));
+        is(2));

    // this triggers the data write and event send
    funcWrapper.checkpointFunction(1);
@@ -483,9 +483,9 @@ public class TestWriteCopyOnWrite {

    Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
    assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
-    assertThat("3 records expect to flush out as a mini-batch",
+    assertThat("2 records expect to flush out as a mini-batch",
        dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
-        is(3));
+        is(2));

    // this triggers the data write and event send
    funcWrapper.checkpointFunction(1);
@@ -615,9 +615,9 @@ public class TestWriteCopyOnWrite {

    Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
    assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
-    assertThat("3 records expect to flush out as a mini-batch",
+    assertThat("2 records expect to flush out as a mini-batch",
        dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
-        is(3));
+        is(2));

    // this triggers the data write and event send
    funcWrapper.checkpointFunction(1);
@@ -665,6 +665,7 @@ public class TestWriteCopyOnWrite {
    Map<String, String> expected = new HashMap<>();
    // the last 2 lines are merged
    expected.put("par1", "["
+        + "id1,par1,id1,Danny,23,1,par1, "
        + "id1,par1,id1,Danny,23,1,par1, "
        + "id1,par1,id1,Danny,23,1,par1" + "]");
    return expected;
--- a/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java
+++ b/hudi-flink/src/test/java/org/apache/hudi/table/HoodieDataSourceITCase.java
@@ -79,8 +79,11 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
    streamTableEnv = TableEnvironmentImpl.create(settings);
    streamTableEnv.getConfig().getConfiguration()
        .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1);
-    streamTableEnv.getConfig().getConfiguration()
-        .setString("execution.checkpointing.interval", "2s");
+    Configuration execConf = streamTableEnv.getConfig().getConfiguration();
+    execConf.setString("execution.checkpointing.interval", "2s");
+    // configure not to retry after failure
+    execConf.setString("restart-strategy", "fixed-delay");
+    execConf.setString("restart-strategy.fixed-delay.attempts", "0");

    settings = EnvironmentSettings.newInstance().inBatchMode().build();
    batchTableEnv = TableEnvironmentImpl.create(settings);
@@ -529,12 +532,37 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
  }

  @ParameterizedTest
-  @EnumSource(value = ExecMode.class)
-  void testUpsertWithMiniBatches(ExecMode execMode) {
+  @EnumSource(value = HoodieTableType.class)
+  void testStreamWriteAndReadWithMiniBatches(HoodieTableType tableType) throws Exception {
+    // create filesystem table named source
+    String createSource = TestConfigurations.getFileSourceDDL("source", 4);
+    streamTableEnv.executeSql(createSource);
+
+    String hoodieTableDDL = sql("t1")
+        .option(FlinkOptions.PATH, tempFile.getAbsolutePath())
+        .option(FlinkOptions.READ_AS_STREAMING, true)
+        .option(FlinkOptions.TABLE_TYPE, tableType)
+        .option(FlinkOptions.READ_START_COMMIT, "earliest")
+        .option(FlinkOptions.WRITE_BATCH_SIZE, 0.00001)
+        .noPartition()
+        .end();
+    streamTableEnv.executeSql(hoodieTableDDL);
+    String insertInto = "insert into t1 select * from source";
+    execInsertSql(streamTableEnv, insertInto);
+
+    // reading from the earliest commit instance.
+    List<Row> rows = execSelectSql(streamTableEnv, "select * from t1", 20);
+    assertRowsEquals(rows, TestData.DATA_SET_SOURCE_INSERT);
+  }
+
+  @ParameterizedTest
+  @MethodSource("executionModeAndTableTypeParams")
+  void testBatchUpsertWithMiniBatches(ExecMode execMode, HoodieTableType tableType) {
    TableEnvironment tableEnv = execMode == ExecMode.BATCH ? batchTableEnv : streamTableEnv;
    String hoodieTableDDL = sql("t1")
        .option(FlinkOptions.PATH, tempFile.getAbsolutePath())
        .option(FlinkOptions.WRITE_BATCH_SIZE, "0.001")
+        .option(FlinkOptions.TABLE_TYPE, tableType)
        .end();
    tableEnv.executeSql(hoodieTableDDL);

@@ -958,7 +986,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
    try {
      tableResult.getJobClient().get().getJobExecutionResult().get();
    } catch (InterruptedException | ExecutionException ex) {
-      throw new RuntimeException(ex);
+      // ignored
    }
  }