[HUDI-575] Spark Streaming with async compaction support (#1752)
This commit is contained in:
committed by
GitHub
parent
61e027fadd
commit
7a2429f5ba
@@ -16,12 +16,18 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hudi.DataSourceReadOptions;
|
||||
import org.apache.hudi.DataSourceWriteOptions;
|
||||
import org.apache.hudi.HoodieDataSourceHelpers;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.TableNotFoundException;
|
||||
import org.apache.hudi.hive.MultiPartKeysValueExtractor;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
@@ -43,6 +49,7 @@ import java.util.List;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import org.apache.spark.sql.streaming.StreamingQuery;
|
||||
|
||||
import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings;
|
||||
|
||||
@@ -52,14 +59,14 @@ import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrin
|
||||
public class HoodieJavaStreamingApp {
|
||||
|
||||
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
|
||||
private String tablePath = "file:///tmp/hoodie/streaming/sample-table";
|
||||
private String tablePath = "/tmp/hoodie/streaming/sample-table";
|
||||
|
||||
@Parameter(names = {"--streaming-source-path", "-ssp"}, description = "path for streaming source file folder")
|
||||
private String streamingSourcePath = "file:///tmp/hoodie/streaming/source";
|
||||
private String streamingSourcePath = "/tmp/hoodie/streaming/source";
|
||||
|
||||
@Parameter(names = {"--streaming-checkpointing-path", "-scp"},
|
||||
description = "path for streaming checking pointing folder")
|
||||
private String streamingCheckpointingPath = "file:///tmp/hoodie/streaming/checkpoint";
|
||||
private String streamingCheckpointingPath = "/tmp/hoodie/streaming/checkpoint";
|
||||
|
||||
@Parameter(names = {"--streaming-duration-in-ms", "-sdm"},
|
||||
description = "time in millisecond for the streaming duration")
|
||||
@@ -106,7 +113,15 @@ public class HoodieJavaStreamingApp {
|
||||
cmd.usage();
|
||||
System.exit(1);
|
||||
}
|
||||
cli.run();
|
||||
int errStatus = 0;
|
||||
try {
|
||||
cli.run();
|
||||
} catch (Exception ex) {
|
||||
LOG.error("Got error running app ", ex);
|
||||
errStatus = -1;
|
||||
} finally {
|
||||
System.exit(errStatus);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -132,38 +147,118 @@ public class HoodieJavaStreamingApp {
|
||||
List<String> records1 = recordsToStrings(dataGen.generateInserts("001", 100));
|
||||
Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));
|
||||
|
||||
List<String> records2 = recordsToStrings(dataGen.generateUpdates("002", 100));
|
||||
|
||||
List<String> records2 = recordsToStrings(dataGen.generateUpdatesForAllRecords("002"));
|
||||
Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
|
||||
|
||||
// setup the input for streaming
|
||||
Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(streamingSourcePath);
|
||||
|
||||
String ckptPath = streamingCheckpointingPath + "/stream1";
|
||||
String srcPath = streamingSourcePath + "/stream1";
|
||||
fs.mkdirs(new Path(ckptPath));
|
||||
fs.mkdirs(new Path(srcPath));
|
||||
|
||||
// setup the input for streaming
|
||||
Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(srcPath + "/*");
|
||||
|
||||
// start streaming and showing
|
||||
ExecutorService executor = Executors.newFixedThreadPool(2);
|
||||
int numInitialCommits = 0;
|
||||
|
||||
// thread for spark strucutured streaming
|
||||
Future<Void> streamFuture = executor.submit(() -> {
|
||||
LOG.info("===== Streaming Starting =====");
|
||||
stream(streamingInput);
|
||||
LOG.info("===== Streaming Ends =====");
|
||||
return null;
|
||||
});
|
||||
try {
|
||||
Future<Void> streamFuture = executor.submit(() -> {
|
||||
LOG.info("===== Streaming Starting =====");
|
||||
stream(streamingInput, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL(), ckptPath);
|
||||
LOG.info("===== Streaming Ends =====");
|
||||
return null;
|
||||
});
|
||||
|
||||
// thread for adding data to the streaming source and showing results over time
|
||||
Future<Void> showFuture = executor.submit(() -> {
|
||||
LOG.info("===== Showing Starting =====");
|
||||
show(spark, fs, inputDF1, inputDF2);
|
||||
LOG.info("===== Showing Ends =====");
|
||||
return null;
|
||||
});
|
||||
// thread for adding data to the streaming source and showing results over time
|
||||
Future<Integer> showFuture = executor.submit(() -> {
|
||||
LOG.info("===== Showing Starting =====");
|
||||
int numCommits = addInputAndValidateIngestion(spark, fs, srcPath,0, 100, inputDF1, inputDF2, true);
|
||||
LOG.info("===== Showing Ends =====");
|
||||
return numCommits;
|
||||
});
|
||||
|
||||
// let the threads run
|
||||
streamFuture.get();
|
||||
showFuture.get();
|
||||
// let the threads run
|
||||
streamFuture.get();
|
||||
numInitialCommits = showFuture.get();
|
||||
} finally {
|
||||
executor.shutdownNow();
|
||||
}
|
||||
|
||||
executor.shutdown();
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(), tablePath);
|
||||
if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) {
|
||||
// Ensure we have successfully completed one compaction commit
|
||||
ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() == 1);
|
||||
} else {
|
||||
ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() >= 1);
|
||||
}
|
||||
|
||||
// Deletes Stream
|
||||
// Need to restart application to ensure spark does not assume there are multiple streams active.
|
||||
spark.close();
|
||||
SparkSession newSpark = SparkSession.builder().appName("Hoodie Spark Streaming APP")
|
||||
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
|
||||
jssc = new JavaSparkContext(newSpark.sparkContext());
|
||||
String ckptPath2 = streamingCheckpointingPath + "/stream2";
|
||||
String srcPath2 = srcPath + "/stream2";
|
||||
fs.mkdirs(new Path(ckptPath2));
|
||||
fs.mkdirs(new Path(srcPath2));
|
||||
Dataset<Row> delStreamingInput = newSpark.readStream().schema(inputDF1.schema()).json(srcPath2 + "/*");
|
||||
List<String> deletes = recordsToStrings(dataGen.generateUniqueUpdates("002", 20));
|
||||
Dataset<Row> inputDF3 = newSpark.read().json(jssc.parallelize(deletes, 2));
|
||||
executor = Executors.newFixedThreadPool(2);
|
||||
|
||||
// thread for spark strucutured streaming
|
||||
try {
|
||||
Future<Void> streamFuture = executor.submit(() -> {
|
||||
LOG.info("===== Streaming Starting =====");
|
||||
stream(delStreamingInput, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL(), ckptPath2);
|
||||
LOG.info("===== Streaming Ends =====");
|
||||
return null;
|
||||
});
|
||||
|
||||
final int numCommits = numInitialCommits;
|
||||
// thread for adding data to the streaming source and showing results over time
|
||||
Future<Void> showFuture = executor.submit(() -> {
|
||||
LOG.info("===== Showing Starting =====");
|
||||
addInputAndValidateIngestion(newSpark, fs, srcPath2, numCommits, 80, inputDF3, null, false);
|
||||
LOG.info("===== Showing Ends =====");
|
||||
return null;
|
||||
});
|
||||
|
||||
// let the threads run
|
||||
streamFuture.get();
|
||||
showFuture.get();
|
||||
} finally {
|
||||
executor.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
private void waitTillNCommits(FileSystem fs, int numCommits, int timeoutSecs, int sleepSecsAfterEachRun)
|
||||
throws InterruptedException {
|
||||
long beginTime = System.currentTimeMillis();
|
||||
long currTime = beginTime;
|
||||
long timeoutMsecs = timeoutSecs * 1000;
|
||||
|
||||
while ((currTime - beginTime) < timeoutMsecs) {
|
||||
try {
|
||||
HoodieTimeline timeline = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, tablePath);
|
||||
LOG.info("Timeline :" + timeline.getInstants().collect(Collectors.toList()));
|
||||
if (timeline.countInstants() >= numCommits) {
|
||||
return;
|
||||
}
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath, true);
|
||||
System.out.println("Instants :" + metaClient.getActiveTimeline().getInstants().collect(Collectors.toList()));
|
||||
} catch (TableNotFoundException te) {
|
||||
LOG.info("Got table not found exception. Retrying");
|
||||
} finally {
|
||||
Thread.sleep(sleepSecsAfterEachRun * 1000);
|
||||
currTime = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
throw new IllegalStateException("Timedout waiting for " + numCommits + " commits to appear in " + tablePath);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -175,23 +270,40 @@ public class HoodieJavaStreamingApp {
|
||||
* @param inputDF2
|
||||
* @throws Exception
|
||||
*/
|
||||
public void show(SparkSession spark, FileSystem fs, Dataset<Row> inputDF1, Dataset<Row> inputDF2) throws Exception {
|
||||
inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath);
|
||||
public int addInputAndValidateIngestion(SparkSession spark, FileSystem fs, String srcPath,
|
||||
int initialCommits, int expRecords,
|
||||
Dataset<Row> inputDF1, Dataset<Row> inputDF2, boolean instantTimeValidation) throws Exception {
|
||||
inputDF1.write().mode(SaveMode.Append).json(srcPath);
|
||||
|
||||
int numExpCommits = initialCommits + 1;
|
||||
// wait for spark streaming to process one microbatch
|
||||
Thread.sleep(3000);
|
||||
waitTillNCommits(fs, numExpCommits, 180, 3);
|
||||
String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
|
||||
LOG.info("First commit at instant time :" + commitInstantTime1);
|
||||
|
||||
inputDF2.write().mode(SaveMode.Append).json(streamingSourcePath);
|
||||
// wait for spark streaming to process one microbatch
|
||||
Thread.sleep(3000);
|
||||
String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
|
||||
LOG.info("Second commit at instant time :" + commitInstantTime2);
|
||||
String commitInstantTime2 = commitInstantTime1;
|
||||
if (null != inputDF2) {
|
||||
numExpCommits += 1;
|
||||
inputDF2.write().mode(SaveMode.Append).json(srcPath);
|
||||
// wait for spark streaming to process one microbatch
|
||||
Thread.sleep(3000);
|
||||
waitTillNCommits(fs, numExpCommits, 180, 3);
|
||||
commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
|
||||
LOG.info("Second commit at instant time :" + commitInstantTime2);
|
||||
}
|
||||
|
||||
if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) {
|
||||
numExpCommits += 1;
|
||||
// Wait for compaction to also finish and track latest timestamp as commit timestamp
|
||||
waitTillNCommits(fs, numExpCommits, 180, 3);
|
||||
commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
|
||||
LOG.info("Compaction commit at instant time :" + commitInstantTime2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Read & do some queries
|
||||
*/
|
||||
Dataset<Row> hoodieROViewDF = spark.read().format("org.apache.hudi")
|
||||
Dataset<Row> hoodieROViewDF = spark.read().format("hudi")
|
||||
// pass any path glob, can include hoodie & non-hoodie
|
||||
// datasets
|
||||
.load(tablePath + "/*/*/*/*");
|
||||
@@ -200,11 +312,24 @@ public class HoodieJavaStreamingApp {
|
||||
// all trips whose fare amount was greater than 2.
|
||||
spark.sql("select fare.amount, begin_lon, begin_lat, timestamp from hoodie_ro where fare.amount > 2.0").show();
|
||||
|
||||
if (instantTimeValidation) {
|
||||
System.out.println("Showing all records. Latest Instant Time =" + commitInstantTime2);
|
||||
spark.sql("select * from hoodie_ro").show(200, false);
|
||||
long numRecordsAtInstant2 =
|
||||
spark.sql("select * from hoodie_ro where _hoodie_commit_time = " + commitInstantTime2).count();
|
||||
ValidationUtils.checkArgument(numRecordsAtInstant2 == expRecords,
|
||||
"Expecting " + expRecords + " records, Got " + numRecordsAtInstant2);
|
||||
}
|
||||
|
||||
long numRecords = spark.sql("select * from hoodie_ro").count();
|
||||
ValidationUtils.checkArgument(numRecords == expRecords,
|
||||
"Expecting " + expRecords + " records, Got " + numRecords);
|
||||
|
||||
if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
|
||||
/**
|
||||
* Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
|
||||
*/
|
||||
Dataset<Row> hoodieIncViewDF = spark.read().format("org.apache.hudi")
|
||||
Dataset<Row> hoodieIncViewDF = spark.read().format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
|
||||
// Only changes in write 2 above
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
|
||||
@@ -214,6 +339,7 @@ public class HoodieJavaStreamingApp {
|
||||
LOG.info("You will only see records from : " + commitInstantTime2);
|
||||
hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
|
||||
}
|
||||
return numExpCommits;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -222,19 +348,23 @@ public class HoodieJavaStreamingApp {
|
||||
* @param streamingInput
|
||||
* @throws Exception
|
||||
*/
|
||||
public void stream(Dataset<Row> streamingInput) throws Exception {
|
||||
public void stream(Dataset<Row> streamingInput, String operationType, String checkpointLocation) throws Exception {
|
||||
|
||||
DataStreamWriter<Row> writer = streamingInput.writeStream().format("org.apache.hudi")
|
||||
.option("hoodie.insert.shuffle.parallelism", "2").option("hoodie.upsert.shuffle.parallelism", "2")
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY(), operationType)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY(), tableType)
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
|
||||
.option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", streamingCheckpointingPath)
|
||||
.option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, "1")
|
||||
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_KEY(), "true")
|
||||
.option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", checkpointLocation)
|
||||
.outputMode(OutputMode.Append());
|
||||
|
||||
updateHiveSyncConfig(writer);
|
||||
writer.trigger(new ProcessingTime(500)).start(tablePath).awaitTermination(streamingDurationInMs);
|
||||
StreamingQuery query = writer.trigger(new ProcessingTime(500)).start(tablePath);
|
||||
query.awaitTermination(streamingDurationInMs);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user