1
0

[HUDI-1902] Global index for flink writer (#2958)

Supports deduplication of records that share the same record key but have different partition paths.
This commit is contained in:
Danny Chan
2021-05-18 13:55:38 +08:00
committed by GitHub
parent fcedbfcb58
commit 46a2399a45
10 changed files with 345 additions and 89 deletions

View File

@@ -392,6 +392,56 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
assertRowsEquals(result, "[id1,Sophia,18,1970-01-01T00:00:05,par5]");
}
/**
 * Writes the records of {@code test_source_4.data} into table {@code t1} with
 * {@code INSERT_DROP_DUPS} enabled and asserts that exactly one row is read back.
 *
 * <p>NOTE(review): this test does not set {@code INDEX_GLOBAL_ENABLED}, so it relies
 * on that option's default value — presumably {@code true}; confirm in FlinkOptions.
 */
@Test
void testWriteGlobalIndex() {
  // the source generates 4 commits
  streamTableEnv.executeSql(
      TestConfigurations.getFileSourceDDL("source", "test_source_4.data", 4));

  // Sink table options: table location plus duplicate dropping on insert.
  Map<String, String> tableOptions = new HashMap<>();
  tableOptions.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
  tableOptions.put(FlinkOptions.INSERT_DROP_DUPS.key(), "true");
  streamTableEnv.executeSql(TestConfigurations.getCreateHoodieTableDDL("t1", tableOptions));

  execInsertSql(streamTableEnv, "insert into t1 select * from source");

  List<Row> actualRows = CollectionUtil.iterableToList(
      () -> streamTableEnv.sqlQuery("select * from t1").execute().collect());
  // Only a single row for key id1 is expected to remain.
  assertRowsEquals(actualRows, "[id1,Phoebe,52,1970-01-01T00:00:08,par4]");
}
/**
 * Writes the records of {@code test_source_4.data} into table {@code t1} with
 * {@code INDEX_GLOBAL_ENABLED} set to {@code false} and {@code INSERT_DROP_DUPS}
 * enabled, then asserts that one row per partition path ({@code par1} to
 * {@code par4}) is read back for key {@code id1}.
 */
@Test
void testWriteLocalIndex() {
  // the source generates 4 commits
  streamTableEnv.executeSql(
      TestConfigurations.getFileSourceDDL("source", "test_source_4.data", 4));

  // Sink table options: table location, global index disabled, duplicate dropping on insert.
  Map<String, String> tableOptions = new HashMap<>();
  tableOptions.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
  tableOptions.put(FlinkOptions.INDEX_GLOBAL_ENABLED.key(), "false");
  tableOptions.put(FlinkOptions.INSERT_DROP_DUPS.key(), "true");
  streamTableEnv.executeSql(TestConfigurations.getCreateHoodieTableDDL("t1", tableOptions));

  execInsertSql(streamTableEnv, "insert into t1 select * from source");

  List<Row> actualRows = CollectionUtil.iterableToList(
      () -> streamTableEnv.sqlQuery("select * from t1").execute().collect());

  // Rows are rendered like List#toString: comma-plus-space separated inside brackets.
  final String expectedRows = "[" + String.join(", ",
      "id1,Stephen,34,1970-01-01T00:00:02,par1",
      "id1,Fabian,32,1970-01-01T00:00:04,par2",
      "id1,Jane,19,1970-01-01T00:00:06,par3",
      "id1,Phoebe,52,1970-01-01T00:00:08,par4") + "]";

  // Every row has the same key in field 0, so order by the timestamp field (index 3).
  assertRowsEquals(actualRows, expectedRows, 3);
}
@Test
void testStreamReadEmptyTablePath() throws Exception {
// create an empty table

View File

@@ -256,8 +256,20 @@ public class TestData {
* @param expected Expected string of the sorted rows
*/
public static void assertRowsEquals(List<Row> rows, String expected) {
  // Delegate to the ordering-aware overload, ordering by the first field.
  final int defaultOrderingPos = 0;
  assertRowsEquals(rows, expected, defaultOrderingPos);
}
/**
 * Sorts the {@code rows} by the string form of the field at index {@code orderingPos}
 * and asserts that the sorted list's string representation equals {@code expected}.
 *
 * <p>Rows whose ordering field compares equal are further ordered by the string form
 * of field 0, so the outcome matches two chained stable sorts (field 0 first, then
 * {@code orderingPos}) while sorting only once.
 *
 * @param rows        Actual result rows
 * @param expected    Expected string of the sorted rows
 * @param orderingPos Field position for ordering
 */
public static void assertRowsEquals(List<Row> rows, String expected, int orderingPos) {
  // Primary key: the requested field; secondary key: field 0 as a tie-breaker.
  Comparator<Row> byOrderingField =
      Comparator.comparing(o -> toStringSafely(o.getField(orderingPos)));
  Comparator<Row> rowOrder =
      byOrderingField.thenComparing(o -> toStringSafely(o.getField(0)));

  String rowsString = rows.stream()
      .sorted(rowOrder)
      .collect(Collectors.toList())
      .toString();
  assertThat(rowsString, is(expected));
}

View File

@@ -0,0 +1,8 @@
{"uuid": "id1", "name": "Danny", "age": 24, "ts": "1970-01-01T00:00:01", "partition": "par1"}
{"uuid": "id1", "name": "Stephen", "age": 34, "ts": "1970-01-01T00:00:02", "partition": "par1"}
{"uuid": "id1", "name": "Julian", "age": 54, "ts": "1970-01-01T00:00:03", "partition": "par2"}
{"uuid": "id1", "name": "Fabian", "age": 32, "ts": "1970-01-01T00:00:04", "partition": "par2"}
{"uuid": "id1", "name": "Sophia", "age": 18, "ts": "1970-01-01T00:00:05", "partition": "par3"}
{"uuid": "id1", "name": "Jane", "age": 19, "ts": "1970-01-01T00:00:06", "partition": "par3"}
{"uuid": "id1", "name": "Ella", "age": 38, "ts": "1970-01-01T00:00:07", "partition": "par4"}
{"uuid": "id1", "name": "Phoebe", "age": 52, "ts": "1970-01-01T00:00:08", "partition": "par4"}