1
0

[HUDI-808] Support cleaning bootstrap source data (#1870)

Co-authored-by: Wenning Ding <wenningd@amazon.com>
Co-authored-by: Balaji Varadarajan <vbalaji@apache.org>
This commit is contained in:
wenningd
2020-08-11 01:43:46 -07:00
committed by GitHub
parent 626f78f6f6
commit 8b928e9bca
23 changed files with 772 additions and 173 deletions

View File

@@ -26,6 +26,7 @@ import org.apache.hudi.common.bootstrap.index.BootstrapIndex.IndexWriter;
import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex;
import org.apache.hudi.common.model.BootstrapFileMapping;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
import org.apache.hudi.common.util.collection.Pair;
@@ -86,7 +87,7 @@ public class TestBootstrapIndex extends HoodieCommonTestHarness {
@Test
public void testBootstrapIndexConcurrent() throws Exception {
Map<String, List<BootstrapFileMapping>> bootstrapMapping = generateBootstrapIndex(100);
Map<String, List<BootstrapFileMapping>> bootstrapMapping = generateBootstrapIndex(metaClient, BOOTSTRAP_BASE_PATH, PARTITIONS, 100);
final int numThreads = 20;
final int numRequestsPerThread = 50;
ExecutorService service = Executors.newFixedThreadPool(numThreads);
@@ -111,15 +112,15 @@ public class TestBootstrapIndex extends HoodieCommonTestHarness {
}
/**
 * Runs one generate-then-validate pass over the bootstrap index: builds an index via
 * {@code generateBootstrapIndex} and hands the returned partition-to-mappings map to
 * {@code validateBootstrapIndex}.
 *
 * @param numEntriesPerPartition number of entries requested for each partition; forwarded
 *                               unchanged to {@code generateBootstrapIndex}
 * @throws IOException declared by the signature; presumably raised by the validation step — confirm
 */
private void testBootstrapIndexOneRound(int numEntriesPerPartition) throws IOException {
// NOTE(review): the two declarations below are the before/after sides of this diff hunk
// (old one-argument call, then the new call passing metaClient, base path and partitions
// explicitly). Only one of them can exist in the real file.
Map<String, List<BootstrapFileMapping>> bootstrapMapping = generateBootstrapIndex(numEntriesPerPartition);
Map<String, List<BootstrapFileMapping>> bootstrapMapping = generateBootstrapIndex(metaClient, BOOTSTRAP_BASE_PATH, PARTITIONS, numEntriesPerPartition);
validateBootstrapIndex(bootstrapMapping);
}
private Map<String, List<BootstrapFileMapping>> generateBootstrapIndex(int numEntriesPerPartition)
throws IOException {
Map<String, List<BootstrapFileMapping>> bootstrapMapping = generateBootstrapMapping(numEntriesPerPartition);
public static Map<String, List<BootstrapFileMapping>> generateBootstrapIndex(HoodieTableMetaClient metaClient,
String sourceBasePath, String[] partitions, int numEntriesPerPartition) {
Map<String, List<BootstrapFileMapping>> bootstrapMapping = generateBootstrapMapping(sourceBasePath, partitions, numEntriesPerPartition);
BootstrapIndex index = new HFileBootstrapIndex(metaClient);
try (IndexWriter writer = index.createWriter(BOOTSTRAP_BASE_PATH)) {
try (IndexWriter writer = index.createWriter(sourceBasePath)) {
writer.begin();
bootstrapMapping.entrySet().stream().forEach(e -> writer.appendNextPartition(e.getKey(), e.getValue()));
writer.finish();
@@ -162,13 +163,14 @@ public class TestBootstrapIndex extends HoodieCommonTestHarness {
}
}
private Map<String, List<BootstrapFileMapping>> generateBootstrapMapping(int numEntriesPerPartition) {
return Arrays.stream(PARTITIONS).map(partition -> {
private static Map<String, List<BootstrapFileMapping>> generateBootstrapMapping(String sourceBasePath,
String[] partitions, int numEntriesPerPartition) {
return Arrays.stream(partitions).map(partition -> {
return Pair.of(partition, IntStream.range(0, numEntriesPerPartition).mapToObj(idx -> {
String hudiFileId = UUID.randomUUID().toString();
String sourceFileName = idx + ".parquet";
HoodieFileStatus sourceFileStatus = HoodieFileStatus.newBuilder()
.setPath(HoodiePath.newBuilder().setUri(BOOTSTRAP_BASE_PATH + "/" + partition + "/" + sourceFileName).build())
.setPath(HoodiePath.newBuilder().setUri(sourceBasePath + "/" + partition + "/" + sourceFileName).build())
.setLength(256 * 1024 * 1024L)
.setAccessTime(new Date().getTime())
.setModificationTime(new Date().getTime() + 99999)
@@ -179,7 +181,7 @@ public class TestBootstrapIndex extends HoodieCommonTestHarness {
.setPermission(HoodieFSPermission.newBuilder().setUserAction(FsAction.ALL.name())
.setGroupAction(FsAction.READ.name()).setOtherAction(FsAction.NONE.name()).setStickyBit(true).build())
.build();
return new BootstrapFileMapping(BOOTSTRAP_BASE_PATH, partition, partition, sourceFileStatus, hudiFileId);
return new BootstrapFileMapping(sourceBasePath, partition, partition, sourceFileStatus, hudiFileId);
}).collect(Collectors.toList()));
}).collect(Collectors.toMap(Pair::getKey, Pair::getValue));
}

View File

@@ -45,6 +45,7 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV2MigrationHandler;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.common.util.CompactionUtils;
import org.apache.hudi.common.util.Option;
@@ -219,7 +220,8 @@ public class HoodieTestUtils {
os = metaClient.getFs().create(commitFile, true);
// Write empty clean metadata
os.write(TimelineMetadataUtils.serializeCleanerPlan(
new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "", new HashMap<>(), 1)).get());
new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "", new HashMap<>(),
CleanPlanV2MigrationHandler.VERSION, new HashMap<>())).get());
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
} finally {