[HUDI-796] Add deduping logic for upserts case (#1558)

2020-09-18 17:07:52 +05:30
parent bf65269f66
commit 73e5b4c7bb
5 changed files with 195 additions and 40 deletions
--- a/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java
+++ b/hudi-cli/src/test/java/org/apache/hudi/cli/integ/ITTestRepairsCommand.java
@@ -59,12 +59,16 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 public class ITTestRepairsCommand extends AbstractShellIntegrationTest {

  private String duplicatedPartitionPath;
+  private String duplicatedPartitionPathWithUpdates;
+  private String duplicatedPartitionPathWithUpserts;
  private String repairedOutputPath;

  @BeforeEach
  public void init() throws Exception {
    final String tablePath = Paths.get(basePath, "test_table").toString();
    duplicatedPartitionPath = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).toString();
+    duplicatedPartitionPathWithUpdates = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).toString();
+    duplicatedPartitionPathWithUpserts = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH).toString();
    repairedOutputPath = Paths.get(basePath, "tmp").toString();

    HoodieCLI.conf = jsc.hadoopConfiguration();
@@ -85,10 +89,16 @@ public class ITTestRepairsCommand extends AbstractShellIntegrationTest {
        .withInserts(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "2", hoodieRecords2)
        .withLogFile(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH);

+    testTable.withInserts(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "4", hoodieRecords1)
+            .withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "6", hoodieRecords1);
+
    // read records and get 10 to generate duplicates
    HoodieRecord[] dupRecords = Arrays.copyOf(hoodieRecords1, 10);
+    testTable.withInserts(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "5", dupRecords);
    testTable.addCommit("20160401010202")
        .withInserts(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "3", dupRecords);
+    testTable.withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "7", dupRecords)
+            .withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "8", dupRecords);

    metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient());
  }
@@ -97,7 +107,7 @@ public class ITTestRepairsCommand extends AbstractShellIntegrationTest {
   * Test case for dry run deduplicate.
   */
  @Test
-  public void testDeduplicate() throws IOException {
+  public void testDeduplicateWithInserts() throws IOException {
    // get fs and check number of latest files
    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
        metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
@@ -124,6 +134,60 @@ public class ITTestRepairsCommand extends AbstractShellIntegrationTest {
    assertEquals(200, result.count());
  }

+  @Test
+  public void testDeduplicateWithUpdates() throws IOException {
+    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
+        metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
+        fs.listStatus(new Path(duplicatedPartitionPathWithUpdates)));
+    List<String> filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
+    assertEquals(2, filteredStatuses.size(), "There should be 2 files.");
+
+    // Before deduplicate, all files contain 110 records
+    String[] files = filteredStatuses.toArray(new String[0]);
+    Dataset df = sqlContext.read().parquet(files);
+    assertEquals(110, df.count());
+
+    String partitionPath = HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH;
+    String cmdStr = String.format("repair deduplicate --duplicatedPartitionPath %s --repairedOutputPath %s --sparkMaster %s --dedupeType %s",
+        partitionPath, repairedOutputPath, "local", "update_type");
+    CommandResult cr = getShell().executeCommand(cmdStr);
+    assertTrue(cr.isSuccess());
+    assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, cr.getResult().toString());
+
+    // After deduplicate, there are 100 records
+    FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath));
+    files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new);
+    Dataset result = sqlContext.read().parquet(files);
+    assertEquals(100, result.count());
+  }
+
+  @Test
+  public void testDeduplicateWithUpserts() throws IOException {
+    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
+        metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
+        fs.listStatus(new Path(duplicatedPartitionPathWithUpserts)));
+    List<String> filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
+    assertEquals(3, filteredStatuses.size(), "There should be 3 files.");
+
+    // Before deduplicate, all files contain 120 records
+    String[] files = filteredStatuses.toArray(new String[0]);
+    Dataset df = sqlContext.read().parquet(files);
+    assertEquals(120, df.count());
+
+    String partitionPath = HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH;
+    String cmdStr = String.format("repair deduplicate --duplicatedPartitionPath %s --repairedOutputPath %s --sparkMaster %s --dedupeType %s",
+        partitionPath, repairedOutputPath, "local", "upsert_type");
+    CommandResult cr = getShell().executeCommand(cmdStr);
+    assertTrue(cr.isSuccess());
+    assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, cr.getResult().toString());
+
+    // After deduplicate, there are 100 records
+    FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath));
+    files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new);
+    Dataset result = sqlContext.read().parquet(files);
+    assertEquals(100, result.count());
+  }
+
  /**
   * Test case for real run deduplicate.
   */