For the Delete API, "hoodie.delete.shuffle.parallelism" is not used, whereas the upsert path honors "hoodie.upsert.shuffle.parallelism". In certain cases this creates a performance difference between deleting via the upsert API with "EmptyHoodieRecordPayload" and using the Delete API directly. This patch makes the following fixes in this regard (see the sketch below):
- Let the deduplicateKeys method use "hoodie.delete.shuffle.parallelism"
- Repartition inputRDD to "hoodie.delete.shuffle.parallelism" when "hoodie.combine.before.delete=false"
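A minimal sketch of the intended behavior, not the actual patch: HoodieKey and HoodieWriteConfig are real Hudi types, but the accessors getDeleteShuffleParallelism() and shouldCombineBeforeDelete() are assumed config helpers here.

import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.spark.api.java.JavaRDD;

class DeleteParallelismSketch {
  // Prepare the keys to delete so that both the dedupe and the write stages
  // run at hoodie.delete.shuffle.parallelism, mirroring what the upsert path
  // already does with hoodie.upsert.shuffle.parallelism.
  static JavaRDD<HoodieKey> prepareKeys(JavaRDD<HoodieKey> keys, HoodieWriteConfig config) {
    int parallelism = config.getDeleteShuffleParallelism(); // assumed accessor
    if (config.shouldCombineBeforeDelete()) {               // assumed accessor
      // deduplicateKeys: distinct() shuffles at the configured delete parallelism
      return keys.distinct(parallelism);
    }
    // hoodie.combine.before.delete=false: repartition inputRDD instead, so the
    // delete no longer inherits the source RDD's partitioning
    return keys.repartition(parallelism);
  }
}

The streaming app below exercises this path by setting "hoodie.delete.shuffle.parallelism" alongside the insert and upsert parallelism options.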
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.stream.Collectors;

import org.apache.hudi.DataSourceReadOptions;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.HoodieDataSourceHelpers;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.TableNotFoundException;
import org.apache.hudi.hive.MultiPartKeysValueExtractor;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.DataStreamWriter;
import org.apache.spark.sql.streaming.OutputMode;
import org.apache.spark.sql.streaming.ProcessingTime;

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.spark.sql.streaming.StreamingQuery;

import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings;
/**
 * Sample program that writes & reads hoodie tables via the Spark datasource streaming.
 */
public class HoodieJavaStreamingApp {

  @Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
  private String tablePath = "/tmp/hoodie/streaming/sample-table";

  @Parameter(names = {"--streaming-source-path", "-ssp"}, description = "path for streaming source file folder")
  private String streamingSourcePath = "/tmp/hoodie/streaming/source";

  @Parameter(names = {"--streaming-checkpointing-path", "-scp"},
      description = "path for streaming checkpointing folder")
  private String streamingCheckpointingPath = "/tmp/hoodie/streaming/checkpoint";

  @Parameter(names = {"--streaming-duration-in-ms", "-sdm"},
      description = "time in milliseconds for the streaming duration")
  private Long streamingDurationInMs = 15000L;

  @Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
  private String tableName = "hoodie_test";

  @Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
  private String tableType = HoodieTableType.MERGE_ON_READ.name();

  @Parameter(names = {"--hive-sync", "-hv"}, description = "Enable syncing to hive")
  private Boolean enableHiveSync = false;

  @Parameter(names = {"--hive-db", "-hd"}, description = "hive database")
  private String hiveDB = "default";

  @Parameter(names = {"--hive-table", "-ht"}, description = "hive table")
  private String hiveTable = "hoodie_sample_test";

  @Parameter(names = {"--hive-user", "-hu"}, description = "hive username")
  private String hiveUser = "hive";

  @Parameter(names = {"--hive-password", "-hp"}, description = "hive password")
  private String hivePass = "hive";

  @Parameter(names = {"--hive-url", "-hl"}, description = "hive JDBC URL")
  private String hiveJdbcUrl = "jdbc:hive2://localhost:10000";

  @Parameter(names = {"--use-multi-partition-keys", "-mp"}, description = "Use Multiple Partition Keys")
  private Boolean useMultiPartitionKeys = false;

  @Parameter(names = {"--help", "-h"}, help = true)
  public Boolean help = false;

  private static final Logger LOG = LogManager.getLogger(HoodieJavaStreamingApp.class);

  public static void main(String[] args) throws Exception {
    HoodieJavaStreamingApp cli = new HoodieJavaStreamingApp();
    JCommander cmd = new JCommander(cli, null, args);

    if (cli.help) {
      cmd.usage();
      System.exit(1);
    }
    int errStatus = 0;
    try {
      cli.run();
    } catch (Exception ex) {
      LOG.error("Got error running app ", ex);
      errStatus = -1;
    } finally {
      System.exit(errStatus);
    }
  }

  /**
   * Run the full demo: ingest two upsert batches via structured streaming,
   * validate the results, then run a second streaming query issuing deletes.
   *
   * @throws Exception on ingestion or validation failure
   */
  public void run() throws Exception {
    // Spark session setup..
    SparkSession spark = SparkSession.builder().appName("Hoodie Spark Streaming APP")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
    JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());

    // folder path clean up and creation, preparing the environment
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
    fs.delete(new Path(streamingSourcePath), true);
    fs.delete(new Path(streamingCheckpointingPath), true);
    fs.delete(new Path(tablePath), true);
    fs.mkdirs(new Path(streamingSourcePath));

    // Generator of some records to be loaded in.
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();

    List<String> records1 = recordsToStrings(dataGen.generateInserts("001", 100));
    Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

    List<String> records2 = recordsToStrings(dataGen.generateUpdatesForAllRecords("002"));
    Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));

    String ckptPath = streamingCheckpointingPath + "/stream1";
    String srcPath = streamingSourcePath + "/stream1";
    fs.mkdirs(new Path(ckptPath));
    fs.mkdirs(new Path(srcPath));

    // setup the input for streaming
    Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(srcPath + "/*");

    // start streaming and showing
    ExecutorService executor = Executors.newFixedThreadPool(2);
    int numInitialCommits = 0;

    // thread for spark structured streaming
    try {
      Future<Void> streamFuture = executor.submit(() -> {
        LOG.info("===== Streaming Starting =====");
        stream(streamingInput, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL(), ckptPath);
        LOG.info("===== Streaming Ends =====");
        return null;
      });

      // thread for adding data to the streaming source and showing results over time
      Future<Integer> showFuture = executor.submit(() -> {
        LOG.info("===== Showing Starting =====");
        int numCommits = addInputAndValidateIngestion(spark, fs, srcPath, 0, 100, inputDF1, inputDF2, true);
        LOG.info("===== Showing Ends =====");
        return numCommits;
      });

      // let the threads run
      streamFuture.get();
      numInitialCommits = showFuture.get();
    } finally {
      executor.shutdownNow();
    }

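    // On a MERGE_ON_READ table, "commit" instants on the commit timeline come
    // from compaction, so this validates that the async compaction completed.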
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(), tablePath);
    if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) {
      // Ensure we have successfully completed one compaction commit
      ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() == 1);
    } else {
      ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() >= 1);
    }

    // Deletes Stream
    // Need to restart application to ensure spark does not assume there are multiple streams active.
    spark.close();
    SparkSession newSpark = SparkSession.builder().appName("Hoodie Spark Streaming APP")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
    jssc = new JavaSparkContext(newSpark.sparkContext());
    String ckptPath2 = streamingCheckpointingPath + "/stream2";
    String srcPath2 = srcPath + "/stream2";
    fs.mkdirs(new Path(ckptPath2));
    fs.mkdirs(new Path(srcPath2));
    Dataset<Row> delStreamingInput = newSpark.readStream().schema(inputDF1.schema()).json(srcPath2 + "/*");
    List<String> deletes = recordsToStrings(dataGen.generateUniqueUpdates("002", 20));
    Dataset<Row> inputDF3 = newSpark.read().json(jssc.parallelize(deletes, 2));
    executor = Executors.newFixedThreadPool(2);

    // thread for spark structured streaming
    try {
      Future<Void> streamFuture = executor.submit(() -> {
        LOG.info("===== Streaming Starting =====");
        stream(delStreamingInput, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL(), ckptPath2);
        LOG.info("===== Streaming Ends =====");
        return null;
      });

      final int numCommits = numInitialCommits;
      // thread for adding data to the streaming source and showing results over time
      Future<Void> showFuture = executor.submit(() -> {
        LOG.info("===== Showing Starting =====");
        addInputAndValidateIngestion(newSpark, fs, srcPath2, numCommits, 80, inputDF3, null, false);
        LOG.info("===== Showing Ends =====");
        return null;
      });

      // let the threads run
      streamFuture.get();
      showFuture.get();
    } finally {
      executor.shutdown();
    }
  }

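  /**
   * Poll the table's timeline until it contains at least numCommits completed
   * commits/compactions, failing after timeoutSecs.
   */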
  private void waitTillNCommits(FileSystem fs, int numCommits, int timeoutSecs, int sleepSecsAfterEachRun)
      throws InterruptedException {
    long beginTime = System.currentTimeMillis();
    long currTime = beginTime;
    long timeoutMsecs = timeoutSecs * 1000L;

    while ((currTime - beginTime) < timeoutMsecs) {
      try {
        HoodieTimeline timeline = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, tablePath);
        LOG.info("Timeline :" + timeline.getInstants().collect(Collectors.toList()));
        if (timeline.countInstants() >= numCommits) {
          return;
        }
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath, true);
        System.out.println("Instants :" + metaClient.getActiveTimeline().getInstants().collect(Collectors.toList()));
      } catch (TableNotFoundException te) {
        LOG.info("Got table not found exception. Retrying");
      } finally {
        Thread.sleep(sleepSecsAfterEachRun * 1000L);
        currTime = System.currentTimeMillis();
      }
    }
    throw new IllegalStateException("Timed out waiting for " + numCommits + " commits to appear in " + tablePath);
  }

  /**
   * Adding data to the streaming source and showing results over time.
   *
   * @param spark spark session
   * @param fs file system
   * @param srcPath streaming source folder to append batches to
   * @param initialCommits number of commits already present on the timeline
   * @param expRecords expected number of records after ingestion
   * @param inputDF1 first batch to write
   * @param inputDF2 second batch to write (may be null)
   * @param instantTimeValidation whether to validate the record count at the latest instant
   * @throws Exception on ingestion or validation failure
   */
  public int addInputAndValidateIngestion(SparkSession spark, FileSystem fs, String srcPath,
      int initialCommits, int expRecords,
      Dataset<Row> inputDF1, Dataset<Row> inputDF2, boolean instantTimeValidation) throws Exception {
    // Ensure, we always write only one file. This is very important to ensure a single batch is reliably read
    // atomically by one iteration of spark streaming.
    inputDF1.coalesce(1).write().mode(SaveMode.Append).json(srcPath);

    int numExpCommits = initialCommits + 1;
    // wait for spark streaming to process one microbatch
    waitTillNCommits(fs, numExpCommits, 180, 3);
    String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    LOG.info("First commit at instant time :" + commitInstantTime1);

    String commitInstantTime2 = commitInstantTime1;
    if (null != inputDF2) {
      numExpCommits += 1;
      inputDF2.write().mode(SaveMode.Append).json(srcPath);
      // wait for spark streaming to process one microbatch
      Thread.sleep(3000);
      waitTillNCommits(fs, numExpCommits, 180, 3);
      commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
      LOG.info("Second commit at instant time :" + commitInstantTime2);
    }

    if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) {
      numExpCommits += 1;
      // Wait for compaction to also finish and track latest timestamp as commit timestamp
      waitTillNCommits(fs, numExpCommits, 180, 3);
      commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
      LOG.info("Compaction commit at instant time :" + commitInstantTime2);
    }

    /**
     * Read & do some queries
     */
    Dataset<Row> hoodieROViewDF = spark.read().format("hudi")
        // pass any path glob, can include hoodie & non-hoodie
        // datasets
        .load(tablePath + "/*/*/*/*");
    hoodieROViewDF.registerTempTable("hoodie_ro");
    spark.sql("describe hoodie_ro").show();
    // all trips whose fare amount was greater than 2.
    spark.sql("select fare.amount, begin_lon, begin_lat, timestamp from hoodie_ro where fare.amount > 2.0").show();

    if (instantTimeValidation) {
      System.out.println("Showing all records. Latest Instant Time =" + commitInstantTime2);
      spark.sql("select * from hoodie_ro").show(200, false);
      long numRecordsAtInstant2 =
          spark.sql("select * from hoodie_ro where _hoodie_commit_time = " + commitInstantTime2).count();
      ValidationUtils.checkArgument(numRecordsAtInstant2 == expRecords,
          "Expecting " + expRecords + " records, Got " + numRecordsAtInstant2);
    }

    long numRecords = spark.sql("select * from hoodie_ro").count();
    ValidationUtils.checkArgument(numRecords == expRecords,
        "Expecting " + expRecords + " records, Got " + numRecords);

    if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
      /**
       * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
       */
      Dataset<Row> hoodieIncViewDF = spark.read().format("hudi")
          .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
          // Only changes in write 2 above
          .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
          // For incremental view, pass in the root/base path of dataset
          .load(tablePath);

      LOG.info("You will only see records from : " + commitInstantTime2);
      hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
    }
    return numExpCommits;
  }

  /**
   * Hoodie spark streaming job.
   *
   * @param streamingInput streaming source dataset
   * @param operationType write operation to perform (e.g. upsert or delete)
   * @param checkpointLocation streaming checkpoint folder
   * @throws Exception on streaming failure
   */
  public void stream(Dataset<Row> streamingInput, String operationType, String checkpointLocation) throws Exception {

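    // Shuffle parallelism is kept tiny since this app runs ~100 records on local[1];
    // note "hoodie.delete.shuffle.parallelism", which the DELETE operation path uses.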
    DataStreamWriter<Row> writer = streamingInput.writeStream().format("org.apache.hudi")
        .option("hoodie.insert.shuffle.parallelism", "2").option("hoodie.upsert.shuffle.parallelism", "2")
        .option("hoodie.delete.shuffle.parallelism", "2")
        .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), operationType)
        .option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY(), tableType)
        .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
        .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
        .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, "1")
        .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY(), "true")
        .option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", checkpointLocation)
        .outputMode(OutputMode.Append());

    updateHiveSyncConfig(writer);
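    // Trigger a micro-batch every 500 ms and let the query run for the configured duration.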
    StreamingQuery query = writer.trigger(new ProcessingTime(500)).start(tablePath);
    query.awaitTermination(streamingDurationInMs);
  }

  /**
   * Setup configs for syncing to hive.
   *
   * @param writer stream writer to decorate
   * @return the writer with hive sync options applied
   */
  private DataStreamWriter<Row> updateHiveSyncConfig(DataStreamWriter<Row> writer) {
    if (enableHiveSync) {
      LOG.info("Enabling Hive sync to " + hiveJdbcUrl);
      writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), hiveTable)
          .option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), hiveDB)
          .option(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), hiveJdbcUrl)
          .option(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), hiveUser)
          .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
          .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
      if (useMultiPartitionKeys) {
        writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option(
            DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
            MultiPartKeysValueExtractor.class.getCanonicalName());
      } else {
        writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");
      }
    }
    return writer;
  }
}