[HUDI-796] Add deduping logic for upserts case (#1558)
This commit is contained in:
@@ -59,12 +59,16 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
public class ITTestRepairsCommand extends AbstractShellIntegrationTest {
|
||||
|
||||
private String duplicatedPartitionPath;
|
||||
private String duplicatedPartitionPathWithUpdates;
|
||||
private String duplicatedPartitionPathWithUpserts;
|
||||
private String repairedOutputPath;
|
||||
|
||||
@BeforeEach
|
||||
public void init() throws Exception {
|
||||
final String tablePath = Paths.get(basePath, "test_table").toString();
|
||||
duplicatedPartitionPath = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).toString();
|
||||
duplicatedPartitionPathWithUpdates = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH).toString();
|
||||
duplicatedPartitionPathWithUpserts = Paths.get(tablePath, HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH).toString();
|
||||
repairedOutputPath = Paths.get(basePath, "tmp").toString();
|
||||
|
||||
HoodieCLI.conf = jsc.hadoopConfiguration();
|
||||
@@ -85,10 +89,16 @@ public class ITTestRepairsCommand extends AbstractShellIntegrationTest {
|
||||
.withInserts(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "2", hoodieRecords2)
|
||||
.withLogFile(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH);
|
||||
|
||||
testTable.withInserts(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "4", hoodieRecords1)
|
||||
.withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "6", hoodieRecords1);
|
||||
|
||||
// read records and get 10 to generate duplicates
|
||||
HoodieRecord[] dupRecords = Arrays.copyOf(hoodieRecords1, 10);
|
||||
testTable.withInserts(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH, "5", dupRecords);
|
||||
testTable.addCommit("20160401010202")
|
||||
.withInserts(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, "3", dupRecords);
|
||||
testTable.withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "7", dupRecords)
|
||||
.withInserts(HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH, "8", dupRecords);
|
||||
|
||||
metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient());
|
||||
}
|
||||
@@ -97,7 +107,7 @@ public class ITTestRepairsCommand extends AbstractShellIntegrationTest {
|
||||
* Test case for dry run deduplicate.
|
||||
*/
|
||||
@Test
|
||||
public void testDeduplicate() throws IOException {
|
||||
public void testDeduplicateWithInserts() throws IOException {
|
||||
// get fs and check number of latest files
|
||||
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
|
||||
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
|
||||
@@ -124,6 +134,60 @@ public class ITTestRepairsCommand extends AbstractShellIntegrationTest {
|
||||
assertEquals(200, result.count());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDeduplicateWithUpdates() throws IOException {
|
||||
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
|
||||
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
|
||||
fs.listStatus(new Path(duplicatedPartitionPathWithUpdates)));
|
||||
List<String> filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
|
||||
assertEquals(2, filteredStatuses.size(), "There should be 2 files.");
|
||||
|
||||
// Before deduplicate, all files contain 110 records
|
||||
String[] files = filteredStatuses.toArray(new String[0]);
|
||||
Dataset df = sqlContext.read().parquet(files);
|
||||
assertEquals(110, df.count());
|
||||
|
||||
String partitionPath = HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH;
|
||||
String cmdStr = String.format("repair deduplicate --duplicatedPartitionPath %s --repairedOutputPath %s --sparkMaster %s --dedupeType %s",
|
||||
partitionPath, repairedOutputPath, "local", "update_type");
|
||||
CommandResult cr = getShell().executeCommand(cmdStr);
|
||||
assertTrue(cr.isSuccess());
|
||||
assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, cr.getResult().toString());
|
||||
|
||||
// After deduplicate, there are 100 records
|
||||
FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath));
|
||||
files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new);
|
||||
Dataset result = sqlContext.read().parquet(files);
|
||||
assertEquals(100, result.count());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDeduplicateWithUpserts() throws IOException {
|
||||
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
|
||||
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
|
||||
fs.listStatus(new Path(duplicatedPartitionPathWithUpserts)));
|
||||
List<String> filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
|
||||
assertEquals(3, filteredStatuses.size(), "There should be 3 files.");
|
||||
|
||||
// Before deduplicate, all files contain 120 records
|
||||
String[] files = filteredStatuses.toArray(new String[0]);
|
||||
Dataset df = sqlContext.read().parquet(files);
|
||||
assertEquals(120, df.count());
|
||||
|
||||
String partitionPath = HoodieTestDataGenerator.DEFAULT_THIRD_PARTITION_PATH;
|
||||
String cmdStr = String.format("repair deduplicate --duplicatedPartitionPath %s --repairedOutputPath %s --sparkMaster %s --dedupeType %s",
|
||||
partitionPath, repairedOutputPath, "local", "upsert_type");
|
||||
CommandResult cr = getShell().executeCommand(cmdStr);
|
||||
assertTrue(cr.isSuccess());
|
||||
assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, cr.getResult().toString());
|
||||
|
||||
// After deduplicate, there are 100 records
|
||||
FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath));
|
||||
files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new);
|
||||
Dataset result = sqlContext.read().parquet(files);
|
||||
assertEquals(100, result.count());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test case for real run deduplicate.
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user