1
0

[HUDI-427] [HUDI-971] Implement CLI support for performing bootstrap (#1869)

* [HUDI-971] Clean partitions & fileIds returned by HFileBootstrapIndex
* [HUDI-427] Implement CLI support for performing bootstrap

Co-authored-by: Wenning Ding <wenningd@amazon.com>
Co-authored-by: Balaji Varadarajan <vbalaji@apache.org>
This commit is contained in:
wenningd
2020-08-08 12:37:29 -07:00
committed by GitHub
parent 5ee676e34f
commit 9fe2d2b14a
11 changed files with 448 additions and 28 deletions

View File

@@ -71,14 +71,15 @@ public abstract class BootstrapIndex implements Serializable {
}
/**
* Check if bootstrap Index is present and ensures readable.
* Check if bootstrap Index is physically present. It does not guarantee the validity of the index.
* To ensure an index is valid, use useIndex() API.
*/
protected abstract boolean isPresent();
/**
* Bootstrap Index Reader Interface.
*/
public abstract static class IndexReader implements Serializable, AutoCloseable {
public abstract static class IndexReader implements Serializable, AutoCloseable {
protected final HoodieTableMetaClient metaClient;
@@ -102,7 +103,7 @@ public abstract class BootstrapIndex implements Serializable {
* Return list file-ids indexed.
* @return
*/
public abstract List<String> getIndexedFileIds();
public abstract List<HoodieFileGroupId> getIndexedFileGroupIds();
/**
* Lookup bootstrap index by partition.

View File

@@ -29,6 +29,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
@@ -57,6 +58,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
@@ -79,6 +81,13 @@ public class HFileBootstrapIndex extends BootstrapIndex {
public static final String BOOTSTRAP_INDEX_FILE_ID = "00000000-0000-0000-0000-000000000000-0";
private static final String PARTITION_KEY_PREFIX = "part";
private static final String FILE_ID_KEY_PREFIX = "fileid";
private static final String KEY_VALUE_SEPARATOR = "=";
private static final String KEY_PARTS_SEPARATOR = ";";
// This is part of the suffix that HFile appends to every key
private static final String HFILE_CELL_KEY_SUFFIX_PART = "//LATEST_TIMESTAMP/Put/vlen";
// Additional Metadata written to HFiles.
public static final byte[] INDEX_INFO_KEY = Bytes.toBytes("INDEX_INFO");
@@ -96,12 +105,44 @@ public class HFileBootstrapIndex extends BootstrapIndex {
}
}
/**
* Returns partition-key to be used in HFile.
* The key has the form {@code part=<partition>}.
* @param partition Partition-Path
* @return Partition key string for the given partition path
*/
private static String getPartitionKey(String partition) {
return "part=" + partition;
return getKeyValueString(PARTITION_KEY_PREFIX, partition);
}
/**
* Returns file group key to be used in HFile.
* The key has the form {@code part=<partitionPath>;fileid=<fileId>}.
* @param fileGroupId File Group Id.
* @return File group key string combining the partition path and the file id of the file group
*/
private static String getFileGroupKey(HoodieFileGroupId fileGroupId) {
return "part=" + fileGroupId.getPartitionPath() + ";fileid=" + fileGroupId.getFileId();
return getPartitionKey(fileGroupId.getPartitionPath()) + KEY_PARTS_SEPARATOR
+ getKeyValueString(FILE_ID_KEY_PREFIX, fileGroupId.getFileId());
}
/**
 * Extracts the partition path from a partition key of the form {@code <prefix><separator><partition>}.
 * Only the first separator splits the key, so the partition path itself may contain the separator.
 * @param key Partition key
 * @return Partition path encoded in the key
 * @throws IllegalArgumentException if the key contains no key-value separator
 */
private static String getPartitionFromKey(String key) {
  // Use the shared separator constant rather than a literal so parsing stays in sync with key creation.
  int separatorIndex = key.indexOf(KEY_VALUE_SEPARATOR);
  if (separatorIndex < 0) {
    // Fail with a descriptive error instead of an ArrayIndexOutOfBoundsException on a missing value part.
    throw new IllegalArgumentException(
        "Malformed partition key, missing '" + KEY_VALUE_SEPARATOR + "' : " + key);
  }
  ValidationUtils.checkArgument(key.substring(0, separatorIndex).equals(PARTITION_KEY_PREFIX));
  return key.substring(separatorIndex + KEY_VALUE_SEPARATOR.length());
}
/**
 * Extracts the file id from a file-id key of the form {@code <prefix><separator><fileId>}.
 * Only the first separator splits the key, so the file id itself may contain the separator.
 * @param key File-id key
 * @return File id encoded in the key
 * @throws IllegalArgumentException if the key contains no key-value separator
 */
private static String getFileIdFromKey(String key) {
  // Use the shared separator constant rather than a literal so parsing stays in sync with key creation.
  int separatorIndex = key.indexOf(KEY_VALUE_SEPARATOR);
  if (separatorIndex < 0) {
    // Fail with a descriptive error instead of an ArrayIndexOutOfBoundsException on a missing value part.
    throw new IllegalArgumentException(
        "Malformed file-id key, missing '" + KEY_VALUE_SEPARATOR + "' : " + key);
  }
  ValidationUtils.checkArgument(key.substring(0, separatorIndex).equals(FILE_ID_KEY_PREFIX));
  return key.substring(separatorIndex + KEY_VALUE_SEPARATOR.length());
}
/**
 * Parses a file group key back into a file group id.
 * The key is split at the first parts separator: the left part is parsed as the partition key and
 * the right part as the file-id key.
 * @param key File group key
 * @return File group id built from the partition path and file id encoded in the key
 * @throws IllegalArgumentException if the key contains no parts separator
 */
private static HoodieFileGroupId getFileGroupFromKey(String key) {
  int separatorIndex = key.indexOf(KEY_PARTS_SEPARATOR);
  if (separatorIndex < 0) {
    // Fail with a descriptive error instead of an ArrayIndexOutOfBoundsException on a missing file-id part.
    throw new IllegalArgumentException(
        "Malformed file group key, missing '" + KEY_PARTS_SEPARATOR + "' : " + key);
  }
  String partitionPart = key.substring(0, separatorIndex);
  String fileIdPart = key.substring(separatorIndex + KEY_PARTS_SEPARATOR.length());
  return new HoodieFileGroupId(getPartitionFromKey(partitionPart), getFileIdFromKey(fileIdPart));
}
/**
 * Joins a key and a value with the key-value separator.
 * @param key Key part
 * @param value Value part
 * @return String of the form {@code <key><separator><value>}
 */
private static String getKeyValueString(String key, String value) {
  return String.join(KEY_VALUE_SEPARATOR, key, value);
}
private static Path partitionIndexPath(HoodieTableMetaClient metaClient) {
@@ -116,6 +157,17 @@ public class HFileBootstrapIndex extends BootstrapIndex {
HoodieFileFormat.HFILE.getFileExtension()));
}
/**
 * HFile stores cell key in the format example : "2020/03/18//LATEST_TIMESTAMP/Put/vlen=3692/seqid=0".
 * This API returns only the user key part from it.
 * @param cellKey HFile Cell Key
 * @return User key, i.e. the part of the cell key preceding the last occurrence of the HFile suffix
 * @throws IllegalArgumentException if the cell key does not contain the HFile suffix
 */
private static String getUserKeyFromCellKey(String cellKey) {
  int hfileSuffixBeginIndex = cellKey.lastIndexOf(HFILE_CELL_KEY_SUFFIX_PART);
  if (hfileSuffixBeginIndex < 0) {
    // lastIndexOf returns -1 when the suffix is absent; substring(0, -1) would otherwise fail
    // with a StringIndexOutOfBoundsException that does not mention the offending key.
    throw new IllegalArgumentException(
        "Cell key does not contain expected HFile suffix '" + HFILE_CELL_KEY_SUFFIX_PART + "' : " + cellKey);
  }
  return cellKey.substring(0, hfileSuffixBeginIndex);
}
/**
* Helper method to create HFile Reader.
*
@@ -160,7 +212,7 @@ public class HFileBootstrapIndex extends BootstrapIndex {
}
/**
* Reports whether the bootstrap index is present, as recorded in the {@code isPresent} field.
* @return value of the {@code isPresent} field
*/
@Override
protected boolean isPresent() {
public boolean isPresent() {
return isPresent;
}
@@ -240,21 +292,21 @@ public class HFileBootstrapIndex extends BootstrapIndex {
/**
* Scans the partition index and returns the entries found in it.
* @return list of indexed partitions, one entry per key returned by the scanner
*/
@Override
public List<String> getIndexedPartitionPaths() {
// NOTE(review): the two boolean scanner arguments are presumably cacheBlocks/pread - confirm against the HFile reader API.
HFileScanner scanner = partitionIndexReader().getScanner(true, true);
return getAllKeys(scanner);
return getAllKeys(scanner, HFileBootstrapIndex::getPartitionFromKey);
}
/**
* Scans the file-id index and returns the entries found in it.
* @return list of indexed file groups, one entry per key returned by the scanner
*/
@Override
public List<String> getIndexedFileIds() {
public List<HoodieFileGroupId> getIndexedFileGroupIds() {
// NOTE(review): the two boolean scanner arguments are presumably cacheBlocks/pread - confirm against the HFile reader API.
HFileScanner scanner = fileIdIndexReader().getScanner(true, true);
return getAllKeys(scanner);
return getAllKeys(scanner, HFileBootstrapIndex::getFileGroupFromKey);
}
private List<String> getAllKeys(HFileScanner scanner) {
List<String> keys = new ArrayList<>();
private <T> List<T> getAllKeys(HFileScanner scanner, Function<String, T> converter) {
List<T> keys = new ArrayList<>();
try {
boolean available = scanner.seekTo();
while (available) {
keys.add(CellUtil.getCellKeyAsString(scanner.getKeyValue()));
keys.add(converter.apply(getUserKeyFromCellKey(CellUtil.getCellKeyAsString(scanner.getKeyValue()))));
available = scanner.next();
}
} catch (IOException ioe) {

View File

@@ -34,10 +34,12 @@ import org.apache.hadoop.fs.permission.FsAction;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
@@ -50,17 +52,18 @@ import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
* Unit Tests for Bootstrap Index.
*/
public class TestBootstrapIndex extends HoodieCommonTestHarness {
private static String[] PARTITIONS = {"2020/03/18", "2020/03/19", "2020/03/20", "2020/03/21"};
private static String BOOTSTRAP_BASE_PATH = "/tmp/source/parquet_tables/table1";
private static final String[] PARTITIONS = {"2020/03/18", "2020/03/19", "2020/03/20", "2020/03/21"};
private static final Set<String> PARTITION_SET = Arrays.stream(PARTITIONS).collect(Collectors.toSet());
private static final String BOOTSTRAP_BASE_PATH = "/tmp/source/parquet_tables/table1";
/**
* Per-test setup: calls {@code initMetaClient()} before each test.
* @throws IOException declared by this method; presumably raised by the meta client setup
*/
@BeforeEach
public void init() throws IOException {
initMetaClient();
}
@@ -127,11 +130,14 @@ public class TestBootstrapIndex extends HoodieCommonTestHarness {
private void validateBootstrapIndex(Map<String, List<BootstrapFileMapping>> bootstrapMapping) {
BootstrapIndex index = new HFileBootstrapIndex(metaClient);
try (BootstrapIndex.IndexReader reader = index.createReader()) {
List<String> partitions = reader.getIndexedPartitionPaths();
assertEquals(bootstrapMapping.size(), partitions.size());
long expNumFileGroupKeys = bootstrapMapping.values().stream().flatMap(x -> x.stream()).count();
long gotNumFileGroupKeys = reader.getIndexedFileIds().size();
List<String> indexedPartitions = reader.getIndexedPartitionPaths();
assertEquals(bootstrapMapping.size(), indexedPartitions.size());
indexedPartitions.forEach(partition -> assertTrue(PARTITION_SET.contains(partition)));
long expNumFileGroupKeys = bootstrapMapping.values().stream().flatMap(Collection::stream).count();
List<HoodieFileGroupId> fileGroupIds = reader.getIndexedFileGroupIds();
long gotNumFileGroupKeys = fileGroupIds.size();
assertEquals(expNumFileGroupKeys, gotNumFileGroupKeys);
fileGroupIds.forEach(fgId -> assertTrue(PARTITION_SET.contains(fgId.getPartitionPath())));
bootstrapMapping.entrySet().stream().forEach(e -> {
List<BootstrapFileMapping> gotMapping = reader.getSourceFileMappingForPartition(e.getKey());