1
0

[HUDI-427] [HUDI-971] Implement CLI support for performing bootstrap (#1869)

* [HUDI-971] Clean partitions & fileIds returned by HFileBootstrapIndex
* [HUDI-427] Implement CLI support for performing bootstrap

Co-authored-by: Wenning Ding <wenningd@amazon.com>
Co-authored-by: Balaji Varadarajan <vbalaji@apache.org>
This commit is contained in:
wenningd
2020-08-08 12:37:29 -07:00
committed by GitHub
parent 5ee676e34f
commit 9fe2d2b14a
11 changed files with 448 additions and 28 deletions

View File

@@ -153,6 +153,13 @@
<scope>test</scope> <scope>test</scope>
<type>test-jar</type> <type>test-jar</type>
</dependency> </dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>test</scope>
<type>test-jar</type>
</dependency>
<dependency> <dependency>
<groupId>org.apache.hudi</groupId> <groupId>org.apache.hudi</groupId>
<artifactId>hudi-utilities_${scala.binary.version}</artifactId> <artifactId>hudi-utilities_${scala.binary.version}</artifactId>

View File

@@ -0,0 +1,186 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.cli.commands;
import org.apache.hudi.cli.HoodieCLI;
import org.apache.hudi.cli.HoodiePrintHelper;
import org.apache.hudi.cli.TableHeader;
import org.apache.hudi.cli.commands.SparkMain.SparkCommand;
import org.apache.hudi.cli.utils.InputStreamConsumer;
import org.apache.hudi.cli.utils.SparkUtil;
import org.apache.hudi.common.bootstrap.index.BootstrapIndex;
import org.apache.hudi.common.model.BootstrapFileMapping;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.spark.launcher.SparkLauncher;
import org.apache.spark.util.Utils;
import org.springframework.shell.core.CommandMarker;
import org.springframework.shell.core.annotation.CliCommand;
import org.springframework.shell.core.annotation.CliOption;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.stream.Collectors;
import scala.collection.JavaConverters;
/**
 * CLI command to perform bootstrap action & display bootstrap index.
 */
@Component
public class BootstrapCommand implements CommandMarker {

  /**
   * Launches a Spark job (via {@link SparkLauncher}) that bootstraps an existing source dataset
   * at {@code srcPath} into a Hudi table at {@code targetPath}.
   *
   * @return a human-readable status message; a non-zero Spark exit code maps to the failure message.
   * @throws IOException          if the Spark process cannot be launched
   * @throws InterruptedException if interrupted while waiting for the Spark process
   * @throws URISyntaxException   if launcher configuration contains an invalid URI
   */
  @CliCommand(value = "bootstrap run", help = "Run a bootstrap action for current Hudi table")
  public String bootstrap(
      @CliOption(key = {"srcPath"}, mandatory = true, help = "Bootstrap source data path of the table") final String srcPath,
      @CliOption(key = {"targetPath"}, mandatory = true,
          help = "Base path for the target hoodie table") final String targetPath,
      @CliOption(key = {"tableName"}, mandatory = true, help = "Hoodie table name") final String tableName,
      @CliOption(key = {"tableType"}, mandatory = true, help = "Hoodie table type") final String tableType,
      @CliOption(key = {"rowKeyField"}, mandatory = true, help = "Record key columns for bootstrap data") final String rowKeyField,
      @CliOption(key = {"partitionPathField"}, unspecifiedDefaultValue = "",
          help = "Partition fields for bootstrap source data") final String partitionPathField,
      @CliOption(key = {"bootstrapIndexClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex",
          help = "Bootstrap Index Class") final String bootstrapIndexClass,
      @CliOption(key = {"selectorClass"}, unspecifiedDefaultValue = "org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector",
          help = "Selector class for bootstrap") final String selectorClass,
      @CliOption(key = {"keyGeneratorClass"}, unspecifiedDefaultValue = "org.apache.hudi.keygen.SimpleKeyGenerator",
          help = "Key generator class for bootstrap") final String keyGeneratorClass,
      @CliOption(key = {"fullBootstrapInputProvider"}, unspecifiedDefaultValue = "org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider",
          help = "Class for Full bootstrap input provider") final String fullBootstrapInputProvider,
      @CliOption(key = {"schemaProviderClass"}, unspecifiedDefaultValue = "",
          help = "SchemaProvider to attach schemas to bootstrap source data") final String schemaProviderClass,
      @CliOption(key = {"payloadClass"}, unspecifiedDefaultValue = "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload",
          help = "Payload Class") final String payloadClass,
      @CliOption(key = {"parallelism"}, unspecifiedDefaultValue = "1500", help = "Bootstrap writer parallelism") final int parallelism,
      @CliOption(key = {"sparkMaster"}, unspecifiedDefaultValue = "", help = "Spark Master") String master,
      @CliOption(key = {"sparkMemory"}, unspecifiedDefaultValue = "4G", help = "Spark executor memory") final String sparkMemory,
      @CliOption(key = {"enableHiveSync"}, unspecifiedDefaultValue = "false", help = "Enable Hive sync") final Boolean enableHiveSync,
      @CliOption(key = {"propsFilePath"}, help = "path to properties file on localfs or dfs with configurations for hoodie client for importing",
          unspecifiedDefaultValue = "") final String propsFilePath,
      @CliOption(key = {"hoodieConfigs"}, help = "Any configuration that can be set in the properties file can be passed here in the form of an array",
          unspecifiedDefaultValue = "") final String[] configs)
      throws IOException, InterruptedException, URISyntaxException {

    // Resolve spark-defaults from the environment so the launched job inherits cluster config.
    String sparkPropertiesPath =
        Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    String cmd = SparkCommand.BOOTSTRAP.toString();
    // NOTE: this positional argument order must stay in sync with the BOOTSTRAP case in
    // SparkMain (args[3]..args[17]); a reorder here silently corrupts the bootstrap config.
    sparkLauncher.addAppArgs(cmd, master, sparkMemory, tableName, tableType, targetPath, srcPath, rowKeyField,
        partitionPathField, String.valueOf(parallelism), schemaProviderClass, bootstrapIndexClass, selectorClass,
        keyGeneratorClass, fullBootstrapInputProvider, payloadClass, String.valueOf(enableHiveSync), propsFilePath);
    UtilHelpers.validateAndAddProperties(configs, sparkLauncher);
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    if (exitCode != 0) {
      return "Failed to bootstrap source data to Hudi dataset";
    }
    return "Bootstrapped source data as Hudi dataset";
  }

  /**
   * Displays bootstrap index mappings (Hudi file group to source file), optionally filtered
   * by partition path and/or a comma-separated list of fileIds.
   *
   * @throws IllegalStateException if fileIds are supplied without a partition path.
   */
  @CliCommand(value = "bootstrap index showmapping", help = "Show bootstrap index mapping")
  public String showBootstrapIndexMapping(
      @CliOption(key = {"partitionPath"}, unspecifiedDefaultValue = "", help = "A valid partition path") String partitionPath,
      @CliOption(key = {"fileIds"}, unspecifiedDefaultValue = "", help = "Valid fileIds split by comma") String fileIds,
      @CliOption(key = {"limit"}, unspecifiedDefaultValue = "-1", help = "Limit rows to be displayed") Integer limit,
      @CliOption(key = {"sortBy"}, unspecifiedDefaultValue = "", help = "Sorting Field") final String sortByField,
      @CliOption(key = {"desc"}, unspecifiedDefaultValue = "false", help = "Ordering") final boolean descending,
      @CliOption(key = {"headeronly"}, unspecifiedDefaultValue = "false", help = "Print Header Only")
          final boolean headerOnly) {

    // fileIds are scoped to a partition, so a partition path is required alongside them.
    if (partitionPath.isEmpty() && !fileIds.isEmpty()) {
      throw new IllegalStateException("PartitionPath is mandatory when passing fileIds.");
    }

    // NOTE(review): IndexReader is AutoCloseable but is never closed here or in the other
    // commands — confirm whether readers leak resources across CLI invocations.
    BootstrapIndex.IndexReader indexReader = createBootstrapIndexReader();
    List<String> indexedPartitions = indexReader.getIndexedPartitionPaths();

    if (!partitionPath.isEmpty() && !indexedPartitions.contains(partitionPath)) {
      return partitionPath + " is not a valid indexed partition";
    }

    // Resolve mappings with the narrowest available filter: fileIds > partition > all partitions.
    List<BootstrapFileMapping> mappingList = new ArrayList<>();
    if (!fileIds.isEmpty()) {
      List<HoodieFileGroupId> fileGroupIds = Arrays.stream(fileIds.split(","))
          .map(fileId -> new HoodieFileGroupId(partitionPath, fileId)).collect(Collectors.toList());
      mappingList.addAll(indexReader.getSourceFileMappingForFileIds(fileGroupIds).values());
    } else if (!partitionPath.isEmpty()) {
      mappingList.addAll(indexReader.getSourceFileMappingForPartition(partitionPath));
    } else {
      for (String part : indexedPartitions) {
        mappingList.addAll(indexReader.getSourceFileMappingForPartition(part));
      }
    }

    final List<Comparable[]> rows = convertBootstrapSourceFileMapping(mappingList);
    final TableHeader header = new TableHeader()
        .addTableHeaderField("Hudi Partition")
        .addTableHeaderField("FileId")
        .addTableHeaderField("Source File Base Path")
        .addTableHeaderField("Source File Partition")
        .addTableHeaderField("Source File Path");
    return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending,
        limit, headerOnly, rows);
  }

  /**
   * Lists all partition paths present in the bootstrap index, one per row.
   */
  @CliCommand(value = "bootstrap index showpartitions", help = "Show bootstrap indexed partitions")
  public String showBootstrapIndexPartitions() {

    BootstrapIndex.IndexReader indexReader = createBootstrapIndexReader();
    List<String> indexedPartitions = indexReader.getIndexedPartitionPaths();

    String[] header = new String[] {"Indexed partitions"};
    String[][] rows = new String[indexedPartitions.size()][1];
    for (int i = 0; i < indexedPartitions.size(); i++) {
      rows[i][0] = indexedPartitions.get(i);
    }
    return HoodiePrintHelper.print(header, rows);
  }

  /**
   * Creates a reader over the bootstrap index of the currently connected table.
   *
   * @throws HoodieException if the table has no usable bootstrap index.
   */
  private BootstrapIndex.IndexReader createBootstrapIndexReader() {
    HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();
    BootstrapIndex index = BootstrapIndex.getBootstrapIndex(metaClient);
    if (!index.useIndex()) {
      throw new HoodieException("This is not a bootstrapped Hudi table. Don't have any index info");
    }
    return index.createReader();
  }

  /**
   * Converts bootstrap file mappings into printable rows:
   * [hudi partition, fileId, source base path, source partition, source file URI].
   */
  private List<Comparable[]> convertBootstrapSourceFileMapping(List<BootstrapFileMapping> mappingList) {
    final List<Comparable[]> rows = new ArrayList<>();
    for (BootstrapFileMapping mapping : mappingList) {
      rows.add(new Comparable[] {mapping.getPartitionPath(), mapping.getFileId(),
          mapping.getBootstrapBasePath(), mapping.getBootstrapPartitionPath(), mapping.getBoostrapFileStatus().getPath().getUri()});
    }
    return rows;
  }
}

View File

@@ -18,12 +18,15 @@
package org.apache.hudi.cli.commands; package org.apache.hudi.cli.commands;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.cli.DedupeSparkJob; import org.apache.hudi.cli.DedupeSparkJob;
import org.apache.hudi.cli.utils.SparkUtil; import org.apache.hudi.cli.utils.SparkUtil;
import org.apache.hudi.client.HoodieWriteClient; import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils; import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieBootstrapConfig;
import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieSavepointException; import org.apache.hudi.exception.HoodieSavepointException;
@@ -35,11 +38,16 @@ import org.apache.hudi.utilities.HoodieCleaner;
import org.apache.hudi.utilities.HoodieCompactionAdminTool; import org.apache.hudi.utilities.HoodieCompactionAdminTool;
import org.apache.hudi.utilities.HoodieCompactionAdminTool.Operation; import org.apache.hudi.utilities.HoodieCompactionAdminTool.Operation;
import org.apache.hudi.utilities.HoodieCompactor; import org.apache.hudi.utilities.HoodieCompactor;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.deltastreamer.BootstrapExecutor;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext; import org.apache.spark.sql.SQLContext;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
@@ -55,7 +63,7 @@ public class SparkMain {
* Commands. * Commands.
*/ */
enum SparkCommand { enum SparkCommand {
ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN, BOOTSTRAP, ROLLBACK, DEDUPLICATE, ROLLBACK_TO_SAVEPOINT, SAVEPOINT, IMPORT, UPSERT, COMPACT_SCHEDULE, COMPACT_RUN,
COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR, CLEAN, DELETE_SAVEPOINT COMPACT_UNSCHEDULE_PLAN, COMPACT_UNSCHEDULE_FILE, COMPACT_VALIDATE, COMPACT_REPAIR, CLEAN, DELETE_SAVEPOINT
} }
@@ -164,6 +172,19 @@ public class SparkMain {
assert (args.length == 5); assert (args.length == 5);
returnCode = deleteSavepoint(jsc, args[3], args[4]); returnCode = deleteSavepoint(jsc, args[3], args[4]);
break; break;
case BOOTSTRAP:
assert (args.length >= 18);
propsFilePath = null;
if (!StringUtils.isNullOrEmpty(args[17])) {
propsFilePath = args[17];
}
configs = new ArrayList<>();
if (args.length > 18) {
configs.addAll(Arrays.asList(args).subList(18, args.length));
}
returnCode = doBootstrap(jsc, args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10],
args[11], args[12], args[13], args[14], args[15], args[16], propsFilePath, configs);
break;
default: default:
break; break;
} }
@@ -174,7 +195,7 @@ public class SparkMain {
List<SparkCommand> masterContained = Arrays.asList(SparkCommand.COMPACT_VALIDATE, SparkCommand.COMPACT_REPAIR, List<SparkCommand> masterContained = Arrays.asList(SparkCommand.COMPACT_VALIDATE, SparkCommand.COMPACT_REPAIR,
SparkCommand.COMPACT_UNSCHEDULE_PLAN, SparkCommand.COMPACT_UNSCHEDULE_FILE, SparkCommand.CLEAN, SparkCommand.COMPACT_UNSCHEDULE_PLAN, SparkCommand.COMPACT_UNSCHEDULE_FILE, SparkCommand.CLEAN,
SparkCommand.IMPORT, SparkCommand.UPSERT, SparkCommand.DEDUPLICATE, SparkCommand.SAVEPOINT, SparkCommand.IMPORT, SparkCommand.UPSERT, SparkCommand.DEDUPLICATE, SparkCommand.SAVEPOINT,
SparkCommand.DELETE_SAVEPOINT, SparkCommand.ROLLBACK_TO_SAVEPOINT, SparkCommand.ROLLBACK); SparkCommand.DELETE_SAVEPOINT, SparkCommand.ROLLBACK_TO_SAVEPOINT, SparkCommand.ROLLBACK, SparkCommand.BOOTSTRAP);
return masterContained.contains(command); return masterContained.contains(command);
} }
@@ -281,6 +302,36 @@ public class SparkMain {
return 0; return 0;
} }
private static int doBootstrap(JavaSparkContext jsc, String tableName, String tableType, String basePath,
String sourcePath, String recordKeyCols, String partitionFields, String parallelism, String schemaProviderClass,
String bootstrapIndexClass, String selectorClass, String keyGeneratorClass, String fullBootstrapInputProvider,
String payloadClassName, String enableHiveSync, String propsFilePath, List<String> configs) throws IOException {
TypedProperties properties = propsFilePath == null ? UtilHelpers.buildProperties(configs)
: UtilHelpers.readConfig(FSUtils.getFs(propsFilePath, jsc.hadoopConfiguration()), new Path(propsFilePath), configs).getConfig();
properties.setProperty(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP, sourcePath);
properties.setProperty(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS, keyGeneratorClass);
properties.setProperty(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER, fullBootstrapInputProvider);
properties.setProperty(HoodieBootstrapConfig.BOOTSTRAP_PARALLELISM, parallelism);
properties.setProperty(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR, selectorClass);
properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), recordKeyCols);
properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), partitionFields);
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
cfg.targetTableName = tableName;
cfg.targetBasePath = basePath;
cfg.tableType = tableType;
cfg.schemaProviderClassName = schemaProviderClass;
cfg.bootstrapIndexClass = bootstrapIndexClass;
cfg.payloadClassName = payloadClassName;
cfg.enableHiveSync = Boolean.valueOf(enableHiveSync);
new BootstrapExecutor(cfg, jsc, FSUtils.getFs(basePath, jsc.hadoopConfiguration()),
jsc.hadoopConfiguration(), properties).execute();
return 0;
}
private static int rollback(JavaSparkContext jsc, String instantTime, String basePath) throws Exception { private static int rollback(JavaSparkContext jsc, String instantTime, String basePath) throws Exception {
HoodieWriteClient client = createHoodieClient(jsc, basePath); HoodieWriteClient client = createHoodieClient(jsc, basePath);
if (client.rollback(instantTime)) { if (client.rollback(instantTime)) {

View File

@@ -0,0 +1,114 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.cli.integ;
import org.apache.hudi.cli.HoodieCLI;
import org.apache.hudi.cli.HoodiePrintHelper;
import org.apache.hudi.cli.commands.TableCommand;
import org.apache.hudi.cli.testutils.AbstractShellIntegrationTest;
import org.apache.hudi.client.TestBootstrap;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.springframework.shell.core.CommandResult;
import java.io.File;
import java.io.IOException;
import java.time.Instant;
import java.util.Arrays;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Test class of {@link org.apache.hudi.cli.commands.BootstrapCommand}.
 */
public class ITTestBootstrapCommand extends AbstractShellIntegrationTest {

  private static final int NUM_OF_RECORDS = 100;
  private static final String PARTITION_FIELD = "datestr";
  private static final String RECORD_KEY_FIELD = "_row_key";

  // Populated per-test in init().
  private String tableName;
  private String sourcePath;
  private String tablePath;
  private List<String> partitions;

  @BeforeEach
  public void init() {
    String srcName = "source";
    tableName = "test-table";
    sourcePath = basePath + File.separator + srcName;
    tablePath = basePath + File.separator + tableName;

    // generate test data: one parquet partition directory per year
    partitions = Arrays.asList("2018", "2019", "2020");
    // Epoch millis widened directly to double; replaces the deprecated
    // `new Double(...).longValue()` round-trip, which produced the same value.
    double timestamp = Instant.now().toEpochMilli();
    for (int i = 0; i < partitions.size(); i++) {
      Dataset<Row> df = TestBootstrap.generateTestRawTripDataset(timestamp,
          i * NUM_OF_RECORDS, i * NUM_OF_RECORDS + NUM_OF_RECORDS, null, jsc, sqlContext);
      df.write().parquet(sourcePath + File.separator + PARTITION_FIELD + "=" + partitions.get(i));
    }
  }

  /**
   * Test case for command 'bootstrap': runs the bootstrap, then verifies the resulting table
   * via 'bootstrap index showpartitions' and 'bootstrap index showmapping'.
   */
  @Test
  public void testBootstrapRunCommand() throws IOException {
    // test bootstrap run command
    String cmdStr = String.format(
        "bootstrap run --targetPath %s --tableName %s --tableType %s --srcPath %s --rowKeyField %s --partitionPathField %s --sparkMaster %s",
        tablePath, tableName, HoodieTableType.COPY_ON_WRITE.name(), sourcePath, RECORD_KEY_FIELD, PARTITION_FIELD, "local");
    CommandResult cr = getShell().executeCommand(cmdStr);
    assertTrue(cr.isSuccess());

    // Connect & check Hudi table exist
    new TableCommand().connect(tablePath, TimelineLayoutVersion.VERSION_1, false, 2000, 300000, 7);
    metaClient = HoodieCLI.getTableMetaClient();
    assertEquals(1, metaClient.getActiveTimeline().getCommitsTimeline().countInstants(), "Should have 1 commit.");

    // test "bootstrap index showpartitions": output must list exactly the generated partitions
    CommandResult crForIndexedPartitions = getShell().executeCommand("bootstrap index showpartitions");
    assertTrue(crForIndexedPartitions.isSuccess());

    String[] header = new String[] {"Indexed partitions"};
    String[][] rows = new String[partitions.size()][1];
    for (int i = 0; i < partitions.size(); i++) {
      rows[i][0] = PARTITION_FIELD + "=" + partitions.get(i);
    }
    String expect = HoodiePrintHelper.print(header, rows);
    // Normalize whitespace/formatting before comparing rendered tables.
    expect = removeNonWordAndStripSpace(expect);
    String got = removeNonWordAndStripSpace(crForIndexedPartitions.getResult().toString());
    assertEquals(expect, got);

    // test "bootstrap index showmapping" with and without a partition filter
    CommandResult crForIndexedMapping = getShell().executeCommand("bootstrap index showmapping");
    assertTrue(crForIndexedMapping.isSuccess());

    CommandResult crForIndexedMappingWithPartition = getShell().executeCommand(String.format(
        "bootstrap index showmapping --partitionPath %s=%s", PARTITION_FIELD, partitions.get(0)));
    assertTrue(crForIndexedMappingWithPartition.isSuccess());
  }
}

View File

@@ -193,7 +193,7 @@ public class BootstrapCommitActionExecutor<T extends HoodieRecordPayload<T>>
} }
/** /**
* Perform Metadata Bootstrap. * Perform Full Bootstrap.
* @param partitionFilesList List of partitions and files within that partitions * @param partitionFilesList List of partitions and files within that partitions
*/ */
protected Option<HoodieWriteMetadata> fullBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitionFilesList) { protected Option<HoodieWriteMetadata> fullBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitionFilesList) {

View File

@@ -522,8 +522,10 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
secondClient.rollback(commitTime1); secondClient.rollback(commitTime1);
allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath()); allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
// After rollback, there should be no base file with the failed commit time // After rollback, there should be no base file with the failed commit time
assertEquals(0, Arrays.stream(allFiles) List<String> remainingFiles = Arrays.stream(allFiles).filter(file -> file.getPath().getName()
.filter(file -> file.getPath().getName().contains(commitTime1)).count()); .contains(commitTime1)).map(fileStatus -> fileStatus.getPath().toString()).collect(Collectors.toList());
assertEquals(0, remainingFiles.size(), "There files should have been rolled-back "
+ "when rolling back commit " + commitTime1 + " but are still remaining. Files: " + remainingFiles);
dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList()); dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, basePath); recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, basePath);
assertEquals(200, recordsRead.size()); assertEquals(200, recordsRead.size());

View File

@@ -71,14 +71,15 @@ public abstract class BootstrapIndex implements Serializable {
} }
/** /**
* Check if bootstrap Index is present and ensures readable. * Check if bootstrap Index is physically present. It does not guarantee the validity of the index.
* To ensure an index is valid, use useIndex() API.
*/ */
protected abstract boolean isPresent(); protected abstract boolean isPresent();
/** /**
* Bootstrap Index Reader Interface. * Bootstrap Index Reader Interface.
*/ */
public abstract static class IndexReader implements Serializable, AutoCloseable { public abstract static class IndexReader implements Serializable, AutoCloseable {
protected final HoodieTableMetaClient metaClient; protected final HoodieTableMetaClient metaClient;
@@ -102,7 +103,7 @@ public abstract class BootstrapIndex implements Serializable {
* Return list file-ids indexed. * Return list file-ids indexed.
* @return * @return
*/ */
public abstract List<String> getIndexedFileIds(); public abstract List<HoodieFileGroupId> getIndexedFileGroupIds();
/** /**
* Lookup bootstrap index by partition. * Lookup bootstrap index by partition.

View File

@@ -29,6 +29,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieIOException;
@@ -57,6 +58,7 @@ import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
/** /**
@@ -79,6 +81,13 @@ public class HFileBootstrapIndex extends BootstrapIndex {
public static final String BOOTSTRAP_INDEX_FILE_ID = "00000000-0000-0000-0000-000000000000-0"; public static final String BOOTSTRAP_INDEX_FILE_ID = "00000000-0000-0000-0000-000000000000-0";
private static final String PARTITION_KEY_PREFIX = "part";
private static final String FILE_ID_KEY_PREFIX = "fileid";
private static final String KEY_VALUE_SEPARATOR = "=";
private static final String KEY_PARTS_SEPARATOR = ";";
// This is part of the suffix that HFile appends to every key
private static final String HFILE_CELL_KEY_SUFFIX_PART = "//LATEST_TIMESTAMP/Put/vlen";
// Additional Metadata written to HFiles. // Additional Metadata written to HFiles.
public static final byte[] INDEX_INFO_KEY = Bytes.toBytes("INDEX_INFO"); public static final byte[] INDEX_INFO_KEY = Bytes.toBytes("INDEX_INFO");
@@ -96,12 +105,44 @@ public class HFileBootstrapIndex extends BootstrapIndex {
} }
} }
/**
* Returns partition-key to be used in HFile.
* @param partition Partition-Path
* @return
*/
private static String getPartitionKey(String partition) { private static String getPartitionKey(String partition) {
return "part=" + partition; return getKeyValueString(PARTITION_KEY_PREFIX, partition);
} }
/**
* Returns file group key to be used in HFile.
* @param fileGroupId File Group Id.
* @return
*/
private static String getFileGroupKey(HoodieFileGroupId fileGroupId) { private static String getFileGroupKey(HoodieFileGroupId fileGroupId) {
return "part=" + fileGroupId.getPartitionPath() + ";fileid=" + fileGroupId.getFileId(); return getPartitionKey(fileGroupId.getPartitionPath()) + KEY_PARTS_SEPARATOR
+ getKeyValueString(FILE_ID_KEY_PREFIX, fileGroupId.getFileId());
}
private static String getPartitionFromKey(String key) {
String[] parts = key.split("=", 2);
ValidationUtils.checkArgument(parts[0].equals(PARTITION_KEY_PREFIX));
return parts[1];
}
private static String getFileIdFromKey(String key) {
String[] parts = key.split("=", 2);
ValidationUtils.checkArgument(parts[0].equals(FILE_ID_KEY_PREFIX));
return parts[1];
}
private static HoodieFileGroupId getFileGroupFromKey(String key) {
String[] parts = key.split(KEY_PARTS_SEPARATOR, 2);
return new HoodieFileGroupId(getPartitionFromKey(parts[0]), getFileIdFromKey(parts[1]));
}
private static String getKeyValueString(String key, String value) {
return key + KEY_VALUE_SEPARATOR + value;
} }
private static Path partitionIndexPath(HoodieTableMetaClient metaClient) { private static Path partitionIndexPath(HoodieTableMetaClient metaClient) {
@@ -116,6 +157,17 @@ public class HFileBootstrapIndex extends BootstrapIndex {
HoodieFileFormat.HFILE.getFileExtension())); HoodieFileFormat.HFILE.getFileExtension()));
} }
/**
* HFile stores cell key in the format example : "2020/03/18//LATEST_TIMESTAMP/Put/vlen=3692/seqid=0".
* This API returns only the user key part from it.
* @param cellKey HFile Cell Key
* @return
*/
private static String getUserKeyFromCellKey(String cellKey) {
int hfileSuffixBeginIndex = cellKey.lastIndexOf(HFILE_CELL_KEY_SUFFIX_PART);
return cellKey.substring(0, hfileSuffixBeginIndex);
}
/** /**
* Helper method to create HFile Reader. * Helper method to create HFile Reader.
* *
@@ -160,7 +212,7 @@ public class HFileBootstrapIndex extends BootstrapIndex {
} }
@Override @Override
protected boolean isPresent() { public boolean isPresent() {
return isPresent; return isPresent;
} }
@@ -240,21 +292,21 @@ public class HFileBootstrapIndex extends BootstrapIndex {
@Override @Override
public List<String> getIndexedPartitionPaths() { public List<String> getIndexedPartitionPaths() {
HFileScanner scanner = partitionIndexReader().getScanner(true, true); HFileScanner scanner = partitionIndexReader().getScanner(true, true);
return getAllKeys(scanner); return getAllKeys(scanner, HFileBootstrapIndex::getPartitionFromKey);
} }
@Override @Override
public List<String> getIndexedFileIds() { public List<HoodieFileGroupId> getIndexedFileGroupIds() {
HFileScanner scanner = fileIdIndexReader().getScanner(true, true); HFileScanner scanner = fileIdIndexReader().getScanner(true, true);
return getAllKeys(scanner); return getAllKeys(scanner, HFileBootstrapIndex::getFileGroupFromKey);
} }
private List<String> getAllKeys(HFileScanner scanner) { private <T> List<T> getAllKeys(HFileScanner scanner, Function<String, T> converter) {
List<String> keys = new ArrayList<>(); List<T> keys = new ArrayList<>();
try { try {
boolean available = scanner.seekTo(); boolean available = scanner.seekTo();
while (available) { while (available) {
keys.add(CellUtil.getCellKeyAsString(scanner.getKeyValue())); keys.add(converter.apply(getUserKeyFromCellKey(CellUtil.getCellKeyAsString(scanner.getKeyValue()))));
available = scanner.next(); available = scanner.next();
} }
} catch (IOException ioe) { } catch (IOException ioe) {

View File

@@ -34,10 +34,12 @@ import org.apache.hadoop.fs.permission.FsAction;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.UUID; import java.util.UUID;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
@@ -50,17 +52,18 @@ import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
/** /**
* Unit Tests for Bootstrap Index. * Unit Tests for Bootstrap Index.
*/ */
public class TestBootstrapIndex extends HoodieCommonTestHarness { public class TestBootstrapIndex extends HoodieCommonTestHarness {
private static String[] PARTITIONS = {"2020/03/18", "2020/03/19", "2020/03/20", "2020/03/21"}; private static final String[] PARTITIONS = {"2020/03/18", "2020/03/19", "2020/03/20", "2020/03/21"};
private static String BOOTSTRAP_BASE_PATH = "/tmp/source/parquet_tables/table1"; private static final Set<String> PARTITION_SET = Arrays.stream(PARTITIONS).collect(Collectors.toSet());
private static final String BOOTSTRAP_BASE_PATH = "/tmp/source/parquet_tables/table1";
@BeforeEach @BeforeEach
public void init() throws IOException { public void init() throws IOException {
initMetaClient(); initMetaClient();
} }
@@ -127,11 +130,14 @@ public class TestBootstrapIndex extends HoodieCommonTestHarness {
private void validateBootstrapIndex(Map<String, List<BootstrapFileMapping>> bootstrapMapping) { private void validateBootstrapIndex(Map<String, List<BootstrapFileMapping>> bootstrapMapping) {
BootstrapIndex index = new HFileBootstrapIndex(metaClient); BootstrapIndex index = new HFileBootstrapIndex(metaClient);
try (BootstrapIndex.IndexReader reader = index.createReader()) { try (BootstrapIndex.IndexReader reader = index.createReader()) {
List<String> partitions = reader.getIndexedPartitionPaths(); List<String> indexedPartitions = reader.getIndexedPartitionPaths();
assertEquals(bootstrapMapping.size(), partitions.size()); assertEquals(bootstrapMapping.size(), indexedPartitions.size());
long expNumFileGroupKeys = bootstrapMapping.values().stream().flatMap(x -> x.stream()).count(); indexedPartitions.forEach(partition -> assertTrue(PARTITION_SET.contains(partition)));
long gotNumFileGroupKeys = reader.getIndexedFileIds().size(); long expNumFileGroupKeys = bootstrapMapping.values().stream().flatMap(Collection::stream).count();
List<HoodieFileGroupId> fileGroupIds = reader.getIndexedFileGroupIds();
long gotNumFileGroupKeys = fileGroupIds.size();
assertEquals(expNumFileGroupKeys, gotNumFileGroupKeys); assertEquals(expNumFileGroupKeys, gotNumFileGroupKeys);
fileGroupIds.forEach(fgId -> assertTrue(PARTITION_SET.contains(fgId.getPartitionPath())));
bootstrapMapping.entrySet().stream().forEach(e -> { bootstrapMapping.entrySet().stream().forEach(e -> {
List<BootstrapFileMapping> gotMapping = reader.getSourceFileMappingForPartition(e.getKey()); List<BootstrapFileMapping> gotMapping = reader.getSourceFileMappingForPartition(e.getKey());

View File

@@ -158,7 +158,7 @@ public class TestBootstrap extends HoodieClientTestBase {
public Schema generateNewDataSetAndReturnSchema(double timestamp, int numRecords, List<String> partitionPaths, public Schema generateNewDataSetAndReturnSchema(double timestamp, int numRecords, List<String> partitionPaths,
String srcPath) throws Exception { String srcPath) throws Exception {
boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty(); boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty();
Dataset<Row> df = generateTestRawTripDataset(timestamp, numRecords, partitionPaths, jsc, sqlContext); Dataset<Row> df = generateTestRawTripDataset(timestamp, 0, numRecords, partitionPaths, jsc, sqlContext);
df.printSchema(); df.printSchema();
if (isPartitioned) { if (isPartitioned) {
df.write().partitionBy("datestr").format("parquet").mode(SaveMode.Overwrite).save(srcPath); df.write().partitionBy("datestr").format("parquet").mode(SaveMode.Overwrite).save(srcPath);
@@ -550,11 +550,11 @@ public class TestBootstrap extends HoodieClientTestBase {
return builder; return builder;
} }
private static Dataset<Row> generateTestRawTripDataset(double timestamp, int numRecords, List<String> partitionPaths, public static Dataset<Row> generateTestRawTripDataset(double timestamp, int from, int to, List<String> partitionPaths,
JavaSparkContext jsc, SQLContext sqlContext) { JavaSparkContext jsc, SQLContext sqlContext) {
boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty(); boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty();
final List<String> records = new ArrayList<>(); final List<String> records = new ArrayList<>();
IntStream.range(0, numRecords).forEach(i -> { IntStream.range(from, to).forEach(i -> {
String id = "" + i; String id = "" + i;
records.add(generateGenericRecord("trip_" + id, "rider_" + id, "driver_" + id, records.add(generateGenericRecord("trip_" + id, "rider_" + id, "driver_" + id,
timestamp, false, false).toString()); timestamp, false, false).toString());

View File

@@ -26,6 +26,7 @@ import org.apache.hudi.common.config.DFSPropertiesConfiguration;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig; import org.apache.hudi.config.HoodieIndexConfig;
@@ -100,7 +101,7 @@ public class UtilHelpers {
public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg, public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg,
JavaSparkContext jssc) throws IOException { JavaSparkContext jssc) throws IOException {
try { try {
return schemaProviderClass == null ? null return StringUtils.isNullOrEmpty(schemaProviderClass) ? null
: (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc); : (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc);
} catch (Throwable e) { } catch (Throwable e) {
throw new IOException("Could not load schema provider class " + schemaProviderClass, e); throw new IOException("Could not load schema provider class " + schemaProviderClass, e);