1
0

[HUDI-242] Support for RFC-12/Bootstrapping of external datasets to hudi (#1876)

- [HUDI-418] Bootstrap Index Implementation using HFile with unit-test
 - [HUDI-421] FileSystem View Changes to support Bootstrap with unit-tests
 - [HUDI-424] Implement Query Side Integration for querying tables containing bootstrap file slices
 - [HUDI-423] Implement upsert functionality for handling updates to these bootstrap file slices
 - [HUDI-421] Bootstrap Write Client with tests
 - [HUDI-425] Added HoodieDeltaStreamer support
 - [HUDI-899] Add a knob to change partition-path style while performing metadata bootstrap
 - [HUDI-900] Metadata Bootstrap Key Generator needs to handle complex keys correctly
 - [HUDI-424] Simplify Record reader implementation
 - [HUDI-423] Implement upsert functionality for handling updates to these bootstrap file slices
 - [HUDI-420] Hoodie Demo working with hive and sparkSQL. Also, Hoodie CLI working with bootstrap tables

Co-authored-by: Udit Mehrotra <uditme@amazon.com>
Co-authored-by: Vinoth Chandar <vinoth@apache.org>
Co-authored-by: Balaji Varadarajan <varadarb@uber.com>
This commit is contained in:
vinoth chandar
2020-08-03 20:19:21 -07:00
committed by GitHub
parent 266bce12b3
commit 539621bd33
175 changed files with 7540 additions and 779 deletions

View File

@@ -162,7 +162,9 @@ public abstract class ITTestBase {
.awaitCompletion();
int exitCode = dockerClient.inspectExecCmd(createCmdResponse.getId()).exec().getExitCode();
LOG.info("Exit code for command : " + exitCode);
LOG.error("\n\n ###### Stdout #######\n" + callback.getStdout().toString());
if (exitCode != 0) {
LOG.error("\n\n ###### Stdout #######\n" + callback.getStdout().toString());
}
LOG.error("\n\n ###### Stderr #######\n" + callback.getStderr().toString());
if (expectedToSucceed) {
@@ -240,6 +242,9 @@ public abstract class ITTestBase {
String filePath = System.getProperty("java.io.tmpdir") + "/" + System.currentTimeMillis() + "-hive.log";
FileIOUtils.writeStringToFile(hiveLogStr, filePath);
LOG.info("Hive log saved up at : " + filePath);
LOG.info("<=========== Full hive log ===============>\n"
+ "\n" + hiveLogStr
+ "\n <==========================================>");
} catch (Exception e) {
LOG.error("Unable to save up logs..", e);
}
@@ -268,7 +273,7 @@ public abstract class ITTestBase {
saveUpLogs();
}
assertEquals(times, count, "Did not find output the expected number of times");
assertEquals(times, count, "Did not find output the expected number of times.");
}
public class TestExecStartResultCallback extends ExecStartResultCallback {

View File

@@ -22,6 +22,7 @@ import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.junit.jupiter.api.Test;
import java.util.Collections;
@@ -53,11 +54,18 @@ public class ITTestHoodieDemo extends ITTestBase {
private static final String COW_TABLE_NAME = "stock_ticks_cow";
private static final String MOR_TABLE_NAME = "stock_ticks_mor";
private static final String BOOTSTRAPPED_SRC_PATH = "/user/hive/warehouse/stock_ticks_cow_bs_src";
private static final String COW_BOOTSTRAPPED_BASE_PATH = "/user/hive/warehouse/stock_ticks_cow_bs";
private static final String MOR_BOOTSTRAPPED_BASE_PATH = "/user/hive/warehouse/stock_ticks_mor_bs";
private static final String COW_BOOTSTRAPPED_TABLE_NAME = "stock_ticks_cow_bs";
private static final String MOR_BOOTSTRAPPED_TABLE_NAME = "stock_ticks_mor_bs";
private static final String DEMO_CONTAINER_SCRIPT = HOODIE_WS_ROOT + "/docker/demo/setup_demo_container.sh";
private static final String MIN_COMMIT_TIME_COW_SCRIPT = HOODIE_WS_ROOT + "/docker/demo/get_min_commit_time_cow.sh";
private static final String MIN_COMMIT_TIME_MOR_SCRIPT = HOODIE_WS_ROOT + "/docker/demo/get_min_commit_time_mor.sh";
private static final String HUDI_CLI_TOOL = HOODIE_WS_ROOT + "/hudi-cli/hudi-cli.sh";
private static final String COMPACTION_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/compaction.commands";
private static final String SPARKSQL_BS_PREP_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-bootstrap-prep-source.commands";
private static final String SPARKSQL_BATCH1_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-batch1.commands";
private static final String SPARKSQL_BATCH2_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-batch2.commands";
private static final String SPARKSQL_INCREMENTAL_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/sparksql-incremental.commands";
@@ -100,6 +108,7 @@ public class ITTestHoodieDemo extends ITTestBase {
// compaction
scheduleAndRunCompaction();
testHiveAfterSecondBatchAfterCompaction();
testPrestoAfterSecondBatchAfterCompaction();
testIncrementalHiveQueryAfterCompaction();
@@ -155,40 +164,80 @@ public class ITTestHoodieDemo extends ITTestBase {
+ " --disable-compaction " + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME)));
executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds);
executeSparkSQLCommand(SPARKSQL_BS_PREP_COMMANDS, true);
List<String> bootstrapCmds = CollectionUtils.createImmutableList(
"spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE
+ " --table-type COPY_ON_WRITE "
+ " --run-bootstrap "
+ " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts "
+ " --target-base-path " + COW_BOOTSTRAPPED_BASE_PATH + " --target-table " + COW_BOOTSTRAPPED_TABLE_NAME
+ " --props /var/demo/config/dfs-source.properties"
+ " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider "
+ " --initial-checkpoint-provider"
+ " org.apache.hudi.utilities.checkpointing.InitialCheckpointFromAnotherHoodieTimelineProvider"
+ " --hoodie-conf hoodie.bootstrap.base.path=" + BOOTSTRAPPED_SRC_PATH
+ " --hoodie-conf hoodie.bootstrap.recordkey.columns=key"
+ " --hoodie-conf hoodie.deltastreamer.checkpoint.provider.path=" + COW_BASE_PATH
+ " --hoodie-conf hoodie.bootstrap.parallelism=2 "
+ " --hoodie-conf hoodie.bootstrap.keygen.class=" + SimpleKeyGenerator.class.getName()
+ String.format(HIVE_SYNC_CMD_FMT, "dt", COW_BOOTSTRAPPED_TABLE_NAME),
"spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE
+ " --table-type MERGE_ON_READ "
+ " --run-bootstrap "
+ " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts "
+ " --target-base-path " + MOR_BOOTSTRAPPED_BASE_PATH + " --target-table " + MOR_BOOTSTRAPPED_TABLE_NAME
+ " --props /var/demo/config/dfs-source.properties"
+ " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider "
+ " --initial-checkpoint-provider"
+ " org.apache.hudi.utilities.checkpointing.InitialCheckpointFromAnotherHoodieTimelineProvider"
+ " --hoodie-conf hoodie.bootstrap.base.path=" + BOOTSTRAPPED_SRC_PATH
+ " --hoodie-conf hoodie.bootstrap.recordkey.columns=key"
+ " --hoodie-conf hoodie.deltastreamer.checkpoint.provider.path=" + COW_BASE_PATH
+ " --hoodie-conf hoodie.bootstrap.parallelism=2 "
+ " --hoodie-conf hoodie.bootstrap.keygen.class=" + SimpleKeyGenerator.class.getName()
+ String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_BOOTSTRAPPED_TABLE_NAME));
executeCommandStringsInDocker(ADHOC_1_CONTAINER, bootstrapCmds);
}
private void testHiveAfterFirstBatch() throws Exception {
Pair<String, String> stdOutErrPair = executeHiveCommandFile(HIVE_TBLCHECK_COMMANDS);
assertStdOutContains(stdOutErrPair, "| stock_ticks_cow |");
assertStdOutContains(stdOutErrPair, "| stock_ticks_cow_bs |");
assertStdOutContains(stdOutErrPair, "| stock_ticks_mor_ro |");
assertStdOutContains(stdOutErrPair, "| stock_ticks_mor_rt |");
assertStdOutContains(stdOutErrPair, "| stock_ticks_mor_bs_ro |");
assertStdOutContains(stdOutErrPair, "| stock_ticks_mor_bs_rt |");
assertStdOutContains(stdOutErrPair,
"| partition |\n+----------------+\n| dt=2018-08-31 |\n+----------------+\n", 3);
stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS);
assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n+---------+----------------------+\n"
+ "| GOOG | 2018-08-31 10:29:00 |\n", 3);
+ "| GOOG | 2018-08-31 10:29:00 |\n", 6);
assertStdOutContains(stdOutErrPair,
"| symbol | ts | volume | open | close |\n"
+ "+---------+----------------------+---------+------------+-----------+\n"
+ "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n"
+ "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n",
3);
6);
}
private void testSparkSQLAfterFirstBatch() throws Exception {
Pair<String, String> stdOutErrPair = executeSparkSQLCommand(SPARKSQL_BATCH1_COMMANDS, true);
assertStdOutContains(stdOutErrPair, "|default |stock_ticks_cow |false |\n"
+ "|default |stock_ticks_cow_bs |false |\n"
+ "|default |stock_ticks_mor_bs_ro |false |\n"
+ "|default |stock_ticks_mor_bs_rt |false |"
+ "|default |stock_ticks_mor_ro |false |\n"
+ "|default |stock_ticks_mor_rt |false |");
+ "|default |stock_ticks_mor_rt |false |");
assertStdOutContains(stdOutErrPair,
"+------+-------------------+\n|GOOG |2018-08-31 10:29:00|\n+------+-------------------+", 3);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 3);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|", 3);
"+------+-------------------+\n|GOOG |2018-08-31 10:29:00|\n+------+-------------------+", 6);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 6);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|", 6);
}
private void ingestSecondBatchAndHiveSync() throws Exception {
// Note : Unlike normal tables, bootstrapped tables do not have checkpoint. So, they
// begin with null checkpoint and read all states.
List<String> cmds = CollectionUtils.createImmutableList(
("hdfs dfs -copyFromLocal -f " + INPUT_BATCH_PATH2 + " " + HDFS_BATCH_PATH2),
("spark-submit"
@@ -208,14 +257,28 @@ public class ITTestHoodieDemo extends ITTestBase {
+ " --target-base-path " + MOR_BASE_PATH + " --target-table " + MOR_TABLE_NAME
+ " --props /var/demo/config/dfs-source.properties"
+ " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider "
+ " --disable-compaction " + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME)));
+ " --disable-compaction " + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_TABLE_NAME)),
("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE
+ " --table-type COPY_ON_WRITE "
+ " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts "
+ " --target-base-path " + COW_BOOTSTRAPPED_BASE_PATH + " --target-table " + COW_BOOTSTRAPPED_TABLE_NAME
+ " --props /var/demo/config/dfs-source.properties"
+ " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider "
+ String.format(HIVE_SYNC_CMD_FMT, "dt", COW_BOOTSTRAPPED_TABLE_NAME)),
("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE
+ " --table-type MERGE_ON_READ "
+ " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts "
+ " --target-base-path " + MOR_BOOTSTRAPPED_BASE_PATH + " --target-table " + MOR_BOOTSTRAPPED_TABLE_NAME
+ " --props /var/demo/config/dfs-source.properties"
+ " --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider "
+ " --disable-compaction " + String.format(HIVE_SYNC_CMD_FMT, "dt", MOR_BOOTSTRAPPED_TABLE_NAME)));
executeCommandStringsInDocker(ADHOC_1_CONTAINER, cmds);
}
private void testPrestoAfterFirstBatch() throws Exception {
Pair<String, String> stdOutErrPair = executePrestoCommandFile(HDFS_PRESTO_INPUT_TABLE_CHECK_PATH);
assertStdOutContains(stdOutErrPair, "stock_ticks_cow");
assertStdOutContains(stdOutErrPair, "stock_ticks_mor",2);
assertStdOutContains(stdOutErrPair, "stock_ticks_cow", 2);
assertStdOutContains(stdOutErrPair, "stock_ticks_mor",4);
stdOutErrPair = executePrestoCommandFile(HDFS_PRESTO_INPUT_BATCH1_PATH);
assertStdOutContains(stdOutErrPair,
@@ -229,20 +292,20 @@ public class ITTestHoodieDemo extends ITTestBase {
private void testHiveAfterSecondBatch() throws Exception {
Pair<String, String> stdOutErrPair = executeHiveCommandFile(HIVE_BATCH1_COMMANDS);
assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n+---------+----------------------+\n"
+ "| GOOG | 2018-08-31 10:29:00 |\n");
+ "| GOOG | 2018-08-31 10:29:00 |\n", 2);
assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n+---------+----------------------+\n"
+ "| GOOG | 2018-08-31 10:59:00 |\n", 2);
+ "| GOOG | 2018-08-31 10:59:00 |\n", 4);
assertStdOutContains(stdOutErrPair,
"| symbol | ts | volume | open | close |\n"
+ "+---------+----------------------+---------+------------+-----------+\n"
+ "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n"
+ "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n");
+ "| GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |\n", 2);
assertStdOutContains(stdOutErrPair,
"| symbol | ts | volume | open | close |\n"
+ "+---------+----------------------+---------+------------+-----------+\n"
+ "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n"
+ "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |\n",
2);
4);
}
private void testPrestoAfterSecondBatch() throws Exception {
@@ -262,13 +325,13 @@ public class ITTestHoodieDemo extends ITTestBase {
private void testHiveAfterSecondBatchAfterCompaction() throws Exception {
Pair<String, String> stdOutErrPair = executeHiveCommandFile(HIVE_BATCH2_COMMANDS);
assertStdOutContains(stdOutErrPair, "| symbol | _c1 |\n+---------+----------------------+\n"
+ "| GOOG | 2018-08-31 10:59:00 |", 2);
+ "| GOOG | 2018-08-31 10:59:00 |", 4);
assertStdOutContains(stdOutErrPair,
"| symbol | ts | volume | open | close |\n"
+ "+---------+----------------------+---------+------------+-----------+\n"
+ "| GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |\n"
+ "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |",
2);
4);
}
private void testPrestoAfterSecondBatchAfterCompaction() throws Exception {
@@ -284,13 +347,13 @@ public class ITTestHoodieDemo extends ITTestBase {
private void testSparkSQLAfterSecondBatch() throws Exception {
Pair<String, String> stdOutErrPair = executeSparkSQLCommand(SPARKSQL_BATCH2_COMMANDS, true);
assertStdOutContains(stdOutErrPair,
"+------+-------------------+\n|GOOG |2018-08-31 10:59:00|\n+------+-------------------+", 2);
"+------+-------------------+\n|GOOG |2018-08-31 10:59:00|\n+------+-------------------+", 4);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 3);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|", 2);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 09:59:00|6330 |1230.5 |1230.02 |", 6);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|", 4);
assertStdOutContains(stdOutErrPair,
"+------+-------------------+\n|GOOG |2018-08-31 10:29:00|\n+------+-------------------+");
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|");
"+------+-------------------+\n|GOOG |2018-08-31 10:29:00|\n+------+-------------------+", 2);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:29:00|3391 |1230.1899|1230.085|", 2);
}
private void testIncrementalHiveQuery(String minCommitTimeScript, String incrementalCommandsFile,
@@ -306,36 +369,40 @@ public class ITTestHoodieDemo extends ITTestBase {
String expectedOutput = "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |";
// verify that 10:59 is present in COW table because there is no compaction process for COW
testIncrementalHiveQuery(MIN_COMMIT_TIME_COW_SCRIPT, HIVE_INCREMENTAL_COW_COMMANDS, expectedOutput, 1);
testIncrementalHiveQuery(MIN_COMMIT_TIME_COW_SCRIPT, HIVE_INCREMENTAL_COW_COMMANDS, expectedOutput, 2);
// verify that 10:59 is NOT present in RO table because of pending compaction
testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RO_COMMANDS, expectedOutput, 0);
// verify that 10:59 is present in RT table even with pending compaction
testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RT_COMMANDS, expectedOutput, 1);
testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RT_COMMANDS, expectedOutput, 2);
}
private void testIncrementalHiveQueryAfterCompaction() throws Exception {
String expectedOutput = "| symbol | ts | volume | open | close |\n"
+ "+---------+----------------------+---------+------------+-----------+\n"
+ "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |";
+ "+---------+----------------------+---------+------------+-----------+\n"
+ "| GOOG | 2018-08-31 10:59:00 | 9021 | 1227.1993 | 1227.215 |";
// verify that 10:59 is present for all views because compaction is complete
testIncrementalHiveQuery(MIN_COMMIT_TIME_COW_SCRIPT, HIVE_INCREMENTAL_COW_COMMANDS, expectedOutput, 1);
testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RO_COMMANDS, expectedOutput, 1);
testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RT_COMMANDS, expectedOutput, 1);
testIncrementalHiveQuery(MIN_COMMIT_TIME_COW_SCRIPT, HIVE_INCREMENTAL_COW_COMMANDS, expectedOutput, 2);
testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RO_COMMANDS, expectedOutput, 2);
testIncrementalHiveQuery(MIN_COMMIT_TIME_MOR_SCRIPT, HIVE_INCREMENTAL_MOR_RT_COMMANDS, expectedOutput, 2);
}
private void testIncrementalSparkSQLQuery() throws Exception {
Pair<String, String> stdOutErrPair = executeSparkSQLCommand(SPARKSQL_INCREMENTAL_COMMANDS, true);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|");
assertStdOutContains(stdOutErrPair, "|default |stock_ticks_cow |false |\n"
+ "|default |stock_ticks_derived_mor_ro|false |\n"
+ "|default |stock_ticks_derived_mor_rt|false |\n"
+ "|default |stock_ticks_mor_ro |false |\n"
+ "|default |stock_ticks_mor_rt |false |\n"
+ "| |stock_ticks_cow_incr |true |");
assertStdOutContains(stdOutErrPair, "|count(1)|\n+--------+\n|99 |", 2);
assertStdOutContains(stdOutErrPair, "|GOOG |2018-08-31 10:59:00|9021 |1227.1993|1227.215|", 2);
assertStdOutContains(stdOutErrPair, "|default |stock_ticks_cow |false |\n"
+ "|default |stock_ticks_cow_bs |false |\n"
+ "|default |stock_ticks_derived_mor_bs_ro|false |\n"
+ "|default |stock_ticks_derived_mor_bs_rt|false |\n"
+ "|default |stock_ticks_derived_mor_ro |false |\n"
+ "|default |stock_ticks_derived_mor_rt |false |\n"
+ "|default |stock_ticks_mor_bs_ro |false |\n"
+ "|default |stock_ticks_mor_bs_rt |false |"
+ "|default |stock_ticks_mor_ro |false |\n"
+ "|default |stock_ticks_mor_rt |false |");
assertStdOutContains(stdOutErrPair, "|count(1)|\n+--------+\n|99 |", 4);
}
private void scheduleAndRunCompaction() throws Exception {