[HUDI-427] [HUDI-971] Implement CLI support for performing bootstrap (#1869)
* [HUDI-971] Clean partitions & fileIds returned by HFileBootstrapIndex
* [HUDI-427] Implement CLI support for performing bootstrap

Co-authored-by: Wenning Ding <wenningd@amazon.com>
Co-authored-by: Balaji Varadarajan <vbalaji@apache.org>
This commit is contained in:
@@ -71,14 +71,15 @@ public abstract class BootstrapIndex implements Serializable {
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if bootstrap Index is present and ensures readable.
|
||||
* Check if bootstrap Index is physically present. It does not guarantee the validity of the index.
|
||||
* To ensure an index is valid, use useIndex() API.
|
||||
*/
|
||||
protected abstract boolean isPresent();
|
||||
|
||||
/**
|
||||
* Bootstrap Index Reader Interface.
|
||||
*/
|
||||
public abstract static class IndexReader implements Serializable, AutoCloseable {
|
||||
public abstract static class IndexReader implements Serializable, AutoCloseable {
|
||||
|
||||
protected final HoodieTableMetaClient metaClient;
|
||||
|
||||
@@ -102,7 +103,7 @@ public abstract class BootstrapIndex implements Serializable {
|
||||
* Return the list of file-ids indexed.
|
||||
* @return
|
||||
*/
|
||||
public abstract List<String> getIndexedFileIds();
|
||||
public abstract List<HoodieFileGroupId> getIndexedFileGroupIds();
|
||||
|
||||
/**
|
||||
* Lookup bootstrap index by partition.
|
||||
|
||||
@@ -29,6 +29,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
@@ -57,6 +58,7 @@ import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
@@ -79,6 +81,13 @@ public class HFileBootstrapIndex extends BootstrapIndex {
|
||||
|
||||
public static final String BOOTSTRAP_INDEX_FILE_ID = "00000000-0000-0000-0000-000000000000-0";
|
||||
|
||||
private static final String PARTITION_KEY_PREFIX = "part";
|
||||
private static final String FILE_ID_KEY_PREFIX = "fileid";
|
||||
private static final String KEY_VALUE_SEPARATOR = "=";
|
||||
private static final String KEY_PARTS_SEPARATOR = ";";
|
||||
// This is part of the suffix that HFile appends to every key
|
||||
private static final String HFILE_CELL_KEY_SUFFIX_PART = "//LATEST_TIMESTAMP/Put/vlen";
|
||||
|
||||
// Additional Metadata written to HFiles.
|
||||
public static final byte[] INDEX_INFO_KEY = Bytes.toBytes("INDEX_INFO");
|
||||
|
||||
@@ -96,12 +105,44 @@ public class HFileBootstrapIndex extends BootstrapIndex {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns partition-key to be used in HFile.
|
||||
* @param partition Partition-Path
|
||||
* @return
|
||||
*/
|
||||
private static String getPartitionKey(String partition) {
|
||||
return "part=" + partition;
|
||||
return getKeyValueString(PARTITION_KEY_PREFIX, partition);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns file group key to be used in HFile.
|
||||
* @param fileGroupId File Group Id.
|
||||
* @return
|
||||
*/
|
||||
private static String getFileGroupKey(HoodieFileGroupId fileGroupId) {
|
||||
return "part=" + fileGroupId.getPartitionPath() + ";fileid=" + fileGroupId.getFileId();
|
||||
return getPartitionKey(fileGroupId.getPartitionPath()) + KEY_PARTS_SEPARATOR
|
||||
+ getKeyValueString(FILE_ID_KEY_PREFIX, fileGroupId.getFileId());
|
||||
}
|
||||
|
||||
private static String getPartitionFromKey(String key) {
|
||||
String[] parts = key.split("=", 2);
|
||||
ValidationUtils.checkArgument(parts[0].equals(PARTITION_KEY_PREFIX));
|
||||
return parts[1];
|
||||
}
|
||||
|
||||
private static String getFileIdFromKey(String key) {
|
||||
String[] parts = key.split("=", 2);
|
||||
ValidationUtils.checkArgument(parts[0].equals(FILE_ID_KEY_PREFIX));
|
||||
return parts[1];
|
||||
}
|
||||
|
||||
private static HoodieFileGroupId getFileGroupFromKey(String key) {
|
||||
String[] parts = key.split(KEY_PARTS_SEPARATOR, 2);
|
||||
return new HoodieFileGroupId(getPartitionFromKey(parts[0]), getFileIdFromKey(parts[1]));
|
||||
}
|
||||
|
||||
private static String getKeyValueString(String key, String value) {
|
||||
return key + KEY_VALUE_SEPARATOR + value;
|
||||
}
|
||||
|
||||
private static Path partitionIndexPath(HoodieTableMetaClient metaClient) {
|
||||
@@ -116,6 +157,17 @@ public class HFileBootstrapIndex extends BootstrapIndex {
|
||||
HoodieFileFormat.HFILE.getFileExtension()));
|
||||
}
|
||||
|
||||
/**
|
||||
* HFile stores cell key in the format example : "2020/03/18//LATEST_TIMESTAMP/Put/vlen=3692/seqid=0".
|
||||
* This API returns only the user key part from it.
|
||||
* @param cellKey HFIle Cell Key
|
||||
* @return
|
||||
*/
|
||||
private static String getUserKeyFromCellKey(String cellKey) {
|
||||
int hfileSuffixBeginIndex = cellKey.lastIndexOf(HFILE_CELL_KEY_SUFFIX_PART);
|
||||
return cellKey.substring(0, hfileSuffixBeginIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper method to create HFile Reader.
|
||||
*
|
||||
@@ -160,7 +212,7 @@ public class HFileBootstrapIndex extends BootstrapIndex {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean isPresent() {
|
||||
public boolean isPresent() {
|
||||
return isPresent;
|
||||
}
|
||||
|
||||
@@ -240,21 +292,21 @@ public class HFileBootstrapIndex extends BootstrapIndex {
|
||||
@Override
|
||||
public List<String> getIndexedPartitionPaths() {
|
||||
HFileScanner scanner = partitionIndexReader().getScanner(true, true);
|
||||
return getAllKeys(scanner);
|
||||
return getAllKeys(scanner, HFileBootstrapIndex::getPartitionFromKey);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getIndexedFileIds() {
|
||||
public List<HoodieFileGroupId> getIndexedFileGroupIds() {
|
||||
HFileScanner scanner = fileIdIndexReader().getScanner(true, true);
|
||||
return getAllKeys(scanner);
|
||||
return getAllKeys(scanner, HFileBootstrapIndex::getFileGroupFromKey);
|
||||
}
|
||||
|
||||
private List<String> getAllKeys(HFileScanner scanner) {
|
||||
List<String> keys = new ArrayList<>();
|
||||
private <T> List<T> getAllKeys(HFileScanner scanner, Function<String, T> converter) {
|
||||
List<T> keys = new ArrayList<>();
|
||||
try {
|
||||
boolean available = scanner.seekTo();
|
||||
while (available) {
|
||||
keys.add(CellUtil.getCellKeyAsString(scanner.getKeyValue()));
|
||||
keys.add(converter.apply(getUserKeyFromCellKey(CellUtil.getCellKeyAsString(scanner.getKeyValue()))));
|
||||
available = scanner.next();
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
|
||||
@@ -34,10 +34,12 @@ import org.apache.hadoop.fs.permission.FsAction;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
@@ -50,17 +52,18 @@ import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
/**
|
||||
* Unit Tests for Bootstrap Index.
|
||||
*/
|
||||
public class TestBootstrapIndex extends HoodieCommonTestHarness {
|
||||
|
||||
private static String[] PARTITIONS = {"2020/03/18", "2020/03/19", "2020/03/20", "2020/03/21"};
|
||||
private static String BOOTSTRAP_BASE_PATH = "/tmp/source/parquet_tables/table1";
|
||||
private static final String[] PARTITIONS = {"2020/03/18", "2020/03/19", "2020/03/20", "2020/03/21"};
|
||||
private static final Set<String> PARTITION_SET = Arrays.stream(PARTITIONS).collect(Collectors.toSet());
|
||||
private static final String BOOTSTRAP_BASE_PATH = "/tmp/source/parquet_tables/table1";
|
||||
|
||||
/**
 * Per-test setup: initializes the meta client before each test method runs.
 */
@BeforeEach
|
||||
|
||||
public void init() throws IOException {
|
||||
initMetaClient();
|
||||
}
|
||||
@@ -127,11 +130,14 @@ public class TestBootstrapIndex extends HoodieCommonTestHarness {
|
||||
private void validateBootstrapIndex(Map<String, List<BootstrapFileMapping>> bootstrapMapping) {
|
||||
BootstrapIndex index = new HFileBootstrapIndex(metaClient);
|
||||
try (BootstrapIndex.IndexReader reader = index.createReader()) {
|
||||
List<String> partitions = reader.getIndexedPartitionPaths();
|
||||
assertEquals(bootstrapMapping.size(), partitions.size());
|
||||
long expNumFileGroupKeys = bootstrapMapping.values().stream().flatMap(x -> x.stream()).count();
|
||||
long gotNumFileGroupKeys = reader.getIndexedFileIds().size();
|
||||
List<String> indexedPartitions = reader.getIndexedPartitionPaths();
|
||||
assertEquals(bootstrapMapping.size(), indexedPartitions.size());
|
||||
indexedPartitions.forEach(partition -> assertTrue(PARTITION_SET.contains(partition)));
|
||||
long expNumFileGroupKeys = bootstrapMapping.values().stream().flatMap(Collection::stream).count();
|
||||
List<HoodieFileGroupId> fileGroupIds = reader.getIndexedFileGroupIds();
|
||||
long gotNumFileGroupKeys = fileGroupIds.size();
|
||||
assertEquals(expNumFileGroupKeys, gotNumFileGroupKeys);
|
||||
fileGroupIds.forEach(fgId -> assertTrue(PARTITION_SET.contains(fgId.getPartitionPath())));
|
||||
|
||||
bootstrapMapping.entrySet().stream().forEach(e -> {
|
||||
List<BootstrapFileMapping> gotMapping = reader.getSourceFileMappingForPartition(e.getKey());
|
||||
|
||||
Reference in New Issue
Block a user