1
0

[HUDI-575] Spark Streaming with async compaction support (#1752)

This commit is contained in:
Balaji Varadarajan
2020-08-05 07:50:15 -07:00
committed by GitHub
parent 61e027fadd
commit 7a2429f5ba
22 changed files with 835 additions and 304 deletions

View File

@@ -151,6 +151,7 @@ public class HoodieJavaApp {
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
: SimpleKeyGenerator.class.getCanonicalName())
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_KEY(), "false")
// This will remove any existing data at path below, and create a
.mode(SaveMode.Overwrite);
@@ -177,6 +178,7 @@ public class HoodieJavaApp {
nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
: SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
.option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, "1")
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_KEY(), "false")
.option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append);
updateHiveSyncConfig(writer);
@@ -202,6 +204,7 @@ public class HoodieJavaApp {
nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
: SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
.option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, "1")
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_KEY(), "false")
.option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append);
updateHiveSyncConfig(writer);

View File

@@ -16,12 +16,18 @@
* limitations under the License.
*/
import java.util.stream.Collectors;
import org.apache.hudi.DataSourceReadOptions;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.HoodieDataSourceHelpers;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.TableNotFoundException;
import org.apache.hudi.hive.MultiPartKeysValueExtractor;
import com.beust.jcommander.JCommander;
@@ -43,6 +49,7 @@ import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.apache.spark.sql.streaming.StreamingQuery;
import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings;
@@ -52,14 +59,14 @@ import static org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrin
public class HoodieJavaStreamingApp {
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
private String tablePath = "file:///tmp/hoodie/streaming/sample-table";
private String tablePath = "/tmp/hoodie/streaming/sample-table";
@Parameter(names = {"--streaming-source-path", "-ssp"}, description = "path for streaming source file folder")
private String streamingSourcePath = "file:///tmp/hoodie/streaming/source";
private String streamingSourcePath = "/tmp/hoodie/streaming/source";
@Parameter(names = {"--streaming-checkpointing-path", "-scp"},
description = "path for streaming checking pointing folder")
private String streamingCheckpointingPath = "file:///tmp/hoodie/streaming/checkpoint";
private String streamingCheckpointingPath = "/tmp/hoodie/streaming/checkpoint";
@Parameter(names = {"--streaming-duration-in-ms", "-sdm"},
description = "time in millisecond for the streaming duration")
@@ -106,7 +113,15 @@ public class HoodieJavaStreamingApp {
cmd.usage();
System.exit(1);
}
cli.run();
int errStatus = 0;
try {
cli.run();
} catch (Exception ex) {
LOG.error("Got error running app ", ex);
errStatus = -1;
} finally {
System.exit(errStatus);
}
}
/**
@@ -132,38 +147,118 @@ public class HoodieJavaStreamingApp {
List<String> records1 = recordsToStrings(dataGen.generateInserts("001", 100));
Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));
List<String> records2 = recordsToStrings(dataGen.generateUpdates("002", 100));
List<String> records2 = recordsToStrings(dataGen.generateUpdatesForAllRecords("002"));
Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
// setup the input for streaming
Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(streamingSourcePath);
String ckptPath = streamingCheckpointingPath + "/stream1";
String srcPath = streamingSourcePath + "/stream1";
fs.mkdirs(new Path(ckptPath));
fs.mkdirs(new Path(srcPath));
// setup the input for streaming
Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(srcPath + "/*");
// start streaming and showing
ExecutorService executor = Executors.newFixedThreadPool(2);
int numInitialCommits = 0;
// thread for spark strucutured streaming
Future<Void> streamFuture = executor.submit(() -> {
LOG.info("===== Streaming Starting =====");
stream(streamingInput);
LOG.info("===== Streaming Ends =====");
return null;
});
try {
Future<Void> streamFuture = executor.submit(() -> {
LOG.info("===== Streaming Starting =====");
stream(streamingInput, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL(), ckptPath);
LOG.info("===== Streaming Ends =====");
return null;
});
// thread for adding data to the streaming source and showing results over time
Future<Void> showFuture = executor.submit(() -> {
LOG.info("===== Showing Starting =====");
show(spark, fs, inputDF1, inputDF2);
LOG.info("===== Showing Ends =====");
return null;
});
// thread for adding data to the streaming source and showing results over time
Future<Integer> showFuture = executor.submit(() -> {
LOG.info("===== Showing Starting =====");
int numCommits = addInputAndValidateIngestion(spark, fs, srcPath,0, 100, inputDF1, inputDF2, true);
LOG.info("===== Showing Ends =====");
return numCommits;
});
// let the threads run
streamFuture.get();
showFuture.get();
// let the threads run
streamFuture.get();
numInitialCommits = showFuture.get();
} finally {
executor.shutdownNow();
}
executor.shutdown();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(), tablePath);
if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) {
// Ensure we have successfully completed one compaction commit
ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() == 1);
} else {
ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() >= 1);
}
// Deletes Stream
// Need to restart application to ensure spark does not assume there are multiple streams active.
spark.close();
SparkSession newSpark = SparkSession.builder().appName("Hoodie Spark Streaming APP")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
jssc = new JavaSparkContext(newSpark.sparkContext());
String ckptPath2 = streamingCheckpointingPath + "/stream2";
String srcPath2 = srcPath + "/stream2";
fs.mkdirs(new Path(ckptPath2));
fs.mkdirs(new Path(srcPath2));
Dataset<Row> delStreamingInput = newSpark.readStream().schema(inputDF1.schema()).json(srcPath2 + "/*");
List<String> deletes = recordsToStrings(dataGen.generateUniqueUpdates("002", 20));
Dataset<Row> inputDF3 = newSpark.read().json(jssc.parallelize(deletes, 2));
executor = Executors.newFixedThreadPool(2);
// thread for spark strucutured streaming
try {
Future<Void> streamFuture = executor.submit(() -> {
LOG.info("===== Streaming Starting =====");
stream(delStreamingInput, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL(), ckptPath2);
LOG.info("===== Streaming Ends =====");
return null;
});
final int numCommits = numInitialCommits;
// thread for adding data to the streaming source and showing results over time
Future<Void> showFuture = executor.submit(() -> {
LOG.info("===== Showing Starting =====");
addInputAndValidateIngestion(newSpark, fs, srcPath2, numCommits, 80, inputDF3, null, false);
LOG.info("===== Showing Ends =====");
return null;
});
// let the threads run
streamFuture.get();
showFuture.get();
} finally {
executor.shutdown();
}
}
private void waitTillNCommits(FileSystem fs, int numCommits, int timeoutSecs, int sleepSecsAfterEachRun)
throws InterruptedException {
long beginTime = System.currentTimeMillis();
long currTime = beginTime;
long timeoutMsecs = timeoutSecs * 1000;
while ((currTime - beginTime) < timeoutMsecs) {
try {
HoodieTimeline timeline = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, tablePath);
LOG.info("Timeline :" + timeline.getInstants().collect(Collectors.toList()));
if (timeline.countInstants() >= numCommits) {
return;
}
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath, true);
System.out.println("Instants :" + metaClient.getActiveTimeline().getInstants().collect(Collectors.toList()));
} catch (TableNotFoundException te) {
LOG.info("Got table not found exception. Retrying");
} finally {
Thread.sleep(sleepSecsAfterEachRun * 1000);
currTime = System.currentTimeMillis();
}
}
throw new IllegalStateException("Timedout waiting for " + numCommits + " commits to appear in " + tablePath);
}
/**
@@ -175,23 +270,40 @@ public class HoodieJavaStreamingApp {
* @param inputDF2
* @throws Exception
*/
public void show(SparkSession spark, FileSystem fs, Dataset<Row> inputDF1, Dataset<Row> inputDF2) throws Exception {
inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath);
public int addInputAndValidateIngestion(SparkSession spark, FileSystem fs, String srcPath,
int initialCommits, int expRecords,
Dataset<Row> inputDF1, Dataset<Row> inputDF2, boolean instantTimeValidation) throws Exception {
inputDF1.write().mode(SaveMode.Append).json(srcPath);
int numExpCommits = initialCommits + 1;
// wait for spark streaming to process one microbatch
Thread.sleep(3000);
waitTillNCommits(fs, numExpCommits, 180, 3);
String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
LOG.info("First commit at instant time :" + commitInstantTime1);
inputDF2.write().mode(SaveMode.Append).json(streamingSourcePath);
// wait for spark streaming to process one microbatch
Thread.sleep(3000);
String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
LOG.info("Second commit at instant time :" + commitInstantTime2);
String commitInstantTime2 = commitInstantTime1;
if (null != inputDF2) {
numExpCommits += 1;
inputDF2.write().mode(SaveMode.Append).json(srcPath);
// wait for spark streaming to process one microbatch
Thread.sleep(3000);
waitTillNCommits(fs, numExpCommits, 180, 3);
commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
LOG.info("Second commit at instant time :" + commitInstantTime2);
}
if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) {
numExpCommits += 1;
// Wait for compaction to also finish and track latest timestamp as commit timestamp
waitTillNCommits(fs, numExpCommits, 180, 3);
commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
LOG.info("Compaction commit at instant time :" + commitInstantTime2);
}
/**
* Read & do some queries
*/
Dataset<Row> hoodieROViewDF = spark.read().format("org.apache.hudi")
Dataset<Row> hoodieROViewDF = spark.read().format("hudi")
// pass any path glob, can include hoodie & non-hoodie
// datasets
.load(tablePath + "/*/*/*/*");
@@ -200,11 +312,24 @@ public class HoodieJavaStreamingApp {
// all trips whose fare amount was greater than 2.
spark.sql("select fare.amount, begin_lon, begin_lat, timestamp from hoodie_ro where fare.amount > 2.0").show();
if (instantTimeValidation) {
System.out.println("Showing all records. Latest Instant Time =" + commitInstantTime2);
spark.sql("select * from hoodie_ro").show(200, false);
long numRecordsAtInstant2 =
spark.sql("select * from hoodie_ro where _hoodie_commit_time = " + commitInstantTime2).count();
ValidationUtils.checkArgument(numRecordsAtInstant2 == expRecords,
"Expecting " + expRecords + " records, Got " + numRecordsAtInstant2);
}
long numRecords = spark.sql("select * from hoodie_ro").count();
ValidationUtils.checkArgument(numRecords == expRecords,
"Expecting " + expRecords + " records, Got " + numRecords);
if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
/**
* Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
*/
Dataset<Row> hoodieIncViewDF = spark.read().format("org.apache.hudi")
Dataset<Row> hoodieIncViewDF = spark.read().format("hudi")
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
// Only changes in write 2 above
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
@@ -214,6 +339,7 @@ public class HoodieJavaStreamingApp {
LOG.info("You will only see records from : " + commitInstantTime2);
hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
}
return numExpCommits;
}
/**
@@ -222,19 +348,23 @@ public class HoodieJavaStreamingApp {
* @param streamingInput
* @throws Exception
*/
public void stream(Dataset<Row> streamingInput) throws Exception {
public void stream(Dataset<Row> streamingInput, String operationType, String checkpointLocation) throws Exception {
DataStreamWriter<Row> writer = streamingInput.writeStream().format("org.apache.hudi")
.option("hoodie.insert.shuffle.parallelism", "2").option("hoodie.upsert.shuffle.parallelism", "2")
.option(DataSourceWriteOptions.OPERATION_OPT_KEY(), operationType)
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY(), tableType)
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
.option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", streamingCheckpointingPath)
.option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP, "1")
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_KEY(), "true")
.option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", checkpointLocation)
.outputMode(OutputMode.Append());
updateHiveSyncConfig(writer);
writer.trigger(new ProcessingTime(500)).start(tablePath).awaitTermination(streamingDurationInMs);
StreamingQuery query = writer.trigger(new ProcessingTime(500)).start(tablePath);
query.awaitTermination(streamingDurationInMs);
}
/**

View File

@@ -50,7 +50,8 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
try {
val sqlContext = session.sqlContext
val options = Map("path" -> "hoodie/test/path", HoodieWriteConfig.TABLE_NAME -> "hoodie_test_tbl")
val e = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.ErrorIfExists, options, session.emptyDataFrame))
val e = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.ErrorIfExists, options,
session.emptyDataFrame))
assert(e.getMessage.contains("spark.serializer"))
} finally {
session.stop()

View File

@@ -17,12 +17,16 @@
package org.apache.hudi.functional
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hudi.common.fs.FSUtils
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.exception.TableNotFoundException
import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers}
import org.apache.log4j.LogManager
import org.apache.spark.sql._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime}
@@ -39,6 +43,7 @@ import scala.concurrent.{Await, Future}
* Basic tests on the spark datasource
*/
class TestDataSource {
private val log = LogManager.getLogger(getClass)
var spark: SparkSession = null
var dataGen: HoodieTestDataGenerator = null
@@ -214,7 +219,7 @@ class TestDataSource {
assertEquals(hoodieIncViewDF2.count(), insert2NewKeyCnt)
}
//@Test (TODO: re-enable after fixing noisyness)
@Test
def testStructuredStreaming(): Unit = {
fs.delete(new Path(basePath), true)
val sourcePath = basePath + "/source"
@@ -254,7 +259,7 @@ class TestDataSource {
val f2 = Future {
inputDF1.write.mode(SaveMode.Append).json(sourcePath)
// wait for spark streaming to process one microbatch
Thread.sleep(3000)
val currNumCommits = waitTillAtleastNCommits(fs, destPath, 1, 120, 5);
assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, destPath, "000"))
val commitInstantTime1: String = HoodieDataSourceHelpers.latestCommit(fs, destPath)
// Read RO View
@@ -264,9 +269,8 @@ class TestDataSource {
inputDF2.write.mode(SaveMode.Append).json(sourcePath)
// wait for spark streaming to process one microbatch
Thread.sleep(10000)
waitTillAtleastNCommits(fs, destPath, currNumCommits + 1, 120, 5);
val commitInstantTime2: String = HoodieDataSourceHelpers.latestCommit(fs, destPath)
assertEquals(2, HoodieDataSourceHelpers.listCommitsSince(fs, destPath, "000").size())
// Read RO View
val hoodieROViewDF2 = spark.read.format("org.apache.hudi")
@@ -299,8 +303,35 @@ class TestDataSource {
assertEquals(1, countsPerCommit.length)
assertEquals(commitInstantTime2, countsPerCommit(0).get(0))
}
Await.result(Future.sequence(Seq(f1, f2)), Duration.Inf)
}
@throws[InterruptedException]
private def waitTillAtleastNCommits(fs: FileSystem, tablePath: String,
numCommits: Int, timeoutSecs: Int, sleepSecsAfterEachRun: Int): Int = {
val beginTime = System.currentTimeMillis
var currTime = beginTime
val timeoutMsecs = timeoutSecs * 1000
var numInstants = 0
var success: Boolean = false
while ({!success && (currTime - beginTime) < timeoutMsecs}) try {
val timeline = HoodieDataSourceHelpers.allCompletedCommitsCompactions(fs, tablePath)
log.info("Timeline :" + timeline.getInstants.toArray)
if (timeline.countInstants >= numCommits) {
numInstants = timeline.countInstants
success = true
}
val metaClient = new HoodieTableMetaClient(fs.getConf, tablePath, true)
} catch {
case te: TableNotFoundException =>
log.info("Got table not found exception. Retrying")
} finally {
Thread.sleep(sleepSecsAfterEachRun * 1000)
currTime = System.currentTimeMillis
}
if (!success) {
throw new IllegalStateException("Timed-out waiting for " + numCommits + " commits to appear in " + tablePath)
}
numInstants
}
}