
[HUDI-296] Explore use of spotless to auto fix formatting errors (#945)

- Add spotless format checking and fixing to the project
- One-time reformatting of the codebase for conformity
- The build now fails on formatting violations, and mvn spotless:apply auto-fixes them (a sketch of such a plugin configuration follows below)
leesf authored 2019-10-10 20:19:40 +08:00; committed by vinoth chandar
parent 834c591955
commit b19bed442d
381 changed files with 7350 additions and 9064 deletions

HoodieJavaStreamingApp.java

@@ -113,9 +113,7 @@ public class HoodieJavaStreamingApp {
   public void run() throws Exception {
     // Spark session setup..
     SparkSession spark = SparkSession.builder().appName("Hoodie Spark Streaming APP")
-        .config("spark.serializer",
-            "org.apache.spark.serializer.KryoSerializer").master("local[1]")
-        .getOrCreate();
+        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
     JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());

     // folder path clean up and creation, preparing the environment
@@ -128,18 +126,15 @@ public class HoodieJavaStreamingApp {
     // Generator of some records to be loaded in.
     HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
-    List<String> records1 = DataSourceTestUtils.convertToStringList(
-        dataGen.generateInserts("001", 100));
+    List<String> records1 = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("001", 100));
     Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));
-    List<String> records2 = DataSourceTestUtils.convertToStringList(
-        dataGen.generateUpdates("002", 100));
+    List<String> records2 = DataSourceTestUtils.convertToStringList(dataGen.generateUpdates("002", 100));
     Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));

     // setup the input for streaming
-    Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema())
-        .json(streamingSourcePath);
+    Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(streamingSourcePath);

     // start streaming and showing
@@ -174,16 +169,14 @@ public class HoodieJavaStreamingApp {
   /**
    * Adding data to the streaming source and showing results over time
    *
    * @param spark
    * @param fs
    * @param inputDF1
    * @param inputDF2
    * @throws Exception
    */
-  public void show(SparkSession spark,
-      FileSystem fs,
-      Dataset<Row> inputDF1,
-      Dataset<Row> inputDF2) throws Exception {
+  public void show(SparkSession spark, FileSystem fs, Dataset<Row> inputDF1, Dataset<Row> inputDF2) throws Exception {
     inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath);
     // wait for spark streaming to process one microbatch
     Thread.sleep(3000);
@@ -206,20 +199,18 @@ public class HoodieJavaStreamingApp {
     hoodieROViewDF.registerTempTable("hoodie_ro");
     spark.sql("describe hoodie_ro").show();
     // all trips whose fare was greater than 2.
-    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0")
-        .show();
+    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show();

     if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
       /**
        * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
        */
       Dataset<Row> hoodieIncViewDF = spark.read().format("org.apache.hudi")
-          .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
-              DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
-          .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(),
-              commitInstantTime1) // Only changes in write 2 above
-          .load(
-              tablePath); // For incremental view, pass in the root/base path of dataset
+          .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
+          // Only changes in write 2 above
+          .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
+          // For incremental view, pass in the root/base path of dataset
+          .load(tablePath);

       logger.info("You will only see records from : " + commitInstantTime2);
       hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
@@ -228,33 +219,28 @@ public class HoodieJavaStreamingApp {
   /**
    * Hoodie spark streaming job
    *
    * @param streamingInput
    * @throws Exception
    */
   public void stream(Dataset<Row> streamingInput) throws Exception {
-    DataStreamWriter<Row> writer = streamingInput
-        .writeStream()
-        .format("org.apache.hudi")
-        .option("hoodie.insert.shuffle.parallelism", "2")
-        .option("hoodie.upsert.shuffle.parallelism", "2")
+    DataStreamWriter<Row> writer = streamingInput.writeStream().format("org.apache.hudi")
+        .option("hoodie.insert.shuffle.parallelism", "2").option("hoodie.upsert.shuffle.parallelism", "2")
         .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType)
         .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
         .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
         .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
-        .option(HoodieWriteConfig.TABLE_NAME, tableName)
-        .option("checkpointLocation", streamingCheckpointingPath)
+        .option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", streamingCheckpointingPath)
         .outputMode(OutputMode.Append());

     updateHiveSyncConfig(writer);
-    writer
-        .trigger(new ProcessingTime(500))
-        .start(tablePath)
-        .awaitTermination(streamingDurationInMs);
+    writer.trigger(new ProcessingTime(500)).start(tablePath).awaitTermination(streamingDurationInMs);
   }

   /**
    * Setup configs for syncing to hive
    *
    * @param writer
    * @return
    */
@@ -268,8 +254,8 @@ public class HoodieJavaStreamingApp {
         .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
         .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
     if (useMultiPartitionKeys) {
-      writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day")
-          .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
+      writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option(
+          DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
           MultiPartKeysValueExtractor.class.getCanonicalName());
     } else {
       writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");