[HUDI-296] Explore use of spotless to auto fix formatting errors (#945)
- Add spotless format fixing to project
- One time reformatting for conformity
- Build fails for formatting changes and mvn spotless:apply autofixes them
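For context, this change wires the spotless-maven-plugin into the Maven build. A minimal configuration of that kind is sketched below; the plugin version, formatter settings, and file paths are illustrative assumptions, not taken from this commit:

<!-- In the root pom.xml (sketch; version and formatter config are assumptions) -->
<plugin>
  <groupId>com.diffplug.spotless</groupId>
  <artifactId>spotless-maven-plugin</artifactId>
  <version>1.25.1</version> <!-- illustrative version, not the one pinned by this commit -->
  <configuration>
    <java>
      <eclipse>
        <!-- hypothetical path to the project's Eclipse formatter definition -->
        <file>${project.basedir}/style/eclipse-java-style.xml</file>
      </eclipse>
      <removeUnusedImports/>
    </java>
  </configuration>
  <executions>
    <execution>
      <phase>verify</phase>
      <goals>
        <goal>check</goal> <!-- fails the build on formatting violations -->
      </goals>
    </execution>
  </executions>
</plugin>

With the check goal bound to the build, a formatting violation fails mvn verify, and running mvn spotless:apply rewrites the sources in place; the hunks below are the one-time reformatting that results.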
@@ -32,15 +32,14 @@ public class DataSourceTestUtils {
     try {
       String str = ((TestRawTripPayload) record.getData()).getJsonData();
       str = "{" + str.substring(str.indexOf("\"timestamp\":"));
-      return Option.of(str.replaceAll("}",
-          ", \"partition\": \"" + record.getPartitionPath() + "\"}"));
+      return Option.of(str.replaceAll("}", ", \"partition\": \"" + record.getPartitionPath() + "\"}"));
     } catch (IOException e) {
       return Option.empty();
     }
   }

   public static List<String> convertToStringList(List<HoodieRecord> records) {
-    return records.stream().map(hr -> convertToString(hr)).filter(os -> os.isPresent())
-        .map(os -> os.get()).collect(Collectors.toList());
+    return records.stream().map(hr -> convertToString(hr)).filter(os -> os.isPresent()).map(os -> os.get())
+        .collect(Collectors.toList());
   }
 }
@@ -97,9 +97,7 @@ public class HoodieJavaApp {

     // Spark session setup..
     SparkSession spark = SparkSession.builder().appName("Hoodie Spark APP")
-        .config("spark.serializer",
-            "org.apache.spark.serializer.KryoSerializer").master("local[1]")
-        .getOrCreate();
+        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
     JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
     FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());

@@ -107,7 +105,7 @@ public class HoodieJavaApp {
     HoodieTestDataGenerator dataGen = null;
     if (nonPartitionedTable) {
       // All data goes to base-path
-      dataGen = new HoodieTestDataGenerator(new String[]{""});
+      dataGen = new HoodieTestDataGenerator(new String[] {""});
     } else {
       dataGen = new HoodieTestDataGenerator();
     }
@@ -116,31 +114,34 @@ public class HoodieJavaApp {
      * Commit with only inserts
      */
     // Generate some input..
-    List<String> records1 = DataSourceTestUtils.convertToStringList(
-        dataGen.generateInserts("001"/* ignore */, 100));
+    List<String> records1 = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("001"/* ignore */, 100));
     Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

     // Save as hoodie dataset (copy on write)
-    DataFrameWriter<Row> writer = inputDF1.write().format("org.apache.hudi") // specify the hoodie source
-        .option("hoodie.insert.shuffle.parallelism",
-            "2") // any hoodie client config can be passed like this
-        .option("hoodie.upsert.shuffle.parallelism",
-            "2") // full list in HoodieWriteConfig & its package
-        .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
-        .option(DataSourceWriteOptions.OPERATION_OPT_KEY(),
-            DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert
-        .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(),
-            "_row_key") // This is the record key
-        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(),
-            "partition") // this is the partition to place it into
-        .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(),
-            "timestamp") // use to combine duplicate records in input/with disk val
-        .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries
+    // specify the hoodie source
+    DataFrameWriter<Row> writer = inputDF1.write().format("org.apache.hudi")
+        // any hoodie client config can be passed like this
+        .option("hoodie.insert.shuffle.parallelism", "2")
+        // full list in HoodieWriteConfig & its package
+        .option("hoodie.upsert.shuffle.parallelism", "2")
+        // Hoodie Table Type
+        .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType)
+        // insert
+        .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
+        // This is the record key
+        .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
+        // this is the partition to place it into
+        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
+        // use to combine duplicate records in input/with disk val
+        .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
+        // Used by hive sync and queries
+        .option(HoodieWriteConfig.TABLE_NAME, tableName)
+        // Add Key Extractor
         .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
-            nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() :
-                SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
-        .mode(
-            SaveMode.Overwrite); // This will remove any existing data at path below, and create a
+            nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
+                : SimpleKeyGenerator.class.getCanonicalName())
+        // This will remove any existing data at path below, and create a
+        .mode(SaveMode.Overwrite);

     updateHiveSyncConfig(writer);
     // new dataset if needed
@@ -151,8 +152,7 @@ public class HoodieJavaApp {
     /**
      * Commit that updates records
      */
-    List<String> records2 = DataSourceTestUtils.convertToStringList(
-        dataGen.generateUpdates("002"/* ignore */, 100));
+    List<String> records2 = DataSourceTestUtils.convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100));
     Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
     writer = inputDF2.write().format("org.apache.hudi").option("hoodie.insert.shuffle.parallelism", "2")
         .option("hoodie.upsert.shuffle.parallelism", "2")
@@ -161,8 +161,8 @@ public class HoodieJavaApp {
         .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
         .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
         .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
-            nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() :
-                SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
+            nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
+                : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
         .option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append);

     updateHiveSyncConfig(writer);
@@ -180,20 +180,18 @@ public class HoodieJavaApp {
     hoodieROViewDF.registerTempTable("hoodie_ro");
     spark.sql("describe hoodie_ro").show();
     // all trips whose fare was greater than 2.
-    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0")
-        .show();
+    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show();

     if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
       /**
        * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
        */
       Dataset<Row> hoodieIncViewDF = spark.read().format("org.apache.hudi")
-          .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
-              DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
-          .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(),
-              commitInstantTime1) // Only changes in write 2 above
-          .load(
-              tablePath); // For incremental view, pass in the root/base path of dataset
+          .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
+          // Only changes in write 2 above
+          .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
+          // For incremental view, pass in the root/base path of dataset
+          .load(tablePath);

       logger.info("You will only see records from : " + commitInstantTime2);
       hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
@@ -202,6 +200,7 @@ public class HoodieJavaApp {

   /**
    * Setup configs for syncing to hive
+   *
    * @param writer
    * @return
    */
@@ -215,12 +214,13 @@ public class HoodieJavaApp {
         .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
         .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
     if (nonPartitionedTable) {
-      writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
-          NonPartitionedExtractor.class.getCanonicalName())
+      writer = writer
+          .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
+              NonPartitionedExtractor.class.getCanonicalName())
           .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "");
     } else if (useMultiPartitionKeys) {
-      writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day")
-          .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
+      writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option(
+          DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
           MultiPartKeysValueExtractor.class.getCanonicalName());
     } else {
       writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");

@@ -113,9 +113,7 @@ public class HoodieJavaStreamingApp {
   public void run() throws Exception {
     // Spark session setup..
     SparkSession spark = SparkSession.builder().appName("Hoodie Spark Streaming APP")
-        .config("spark.serializer",
-            "org.apache.spark.serializer.KryoSerializer").master("local[1]")
-        .getOrCreate();
+        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]").getOrCreate();
     JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());

     // folder path clean up and creation, preparing the environment
@@ -128,18 +126,15 @@ public class HoodieJavaStreamingApp {
     // Generator of some records to be loaded in.
     HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();

-    List<String> records1 = DataSourceTestUtils.convertToStringList(
-        dataGen.generateInserts("001", 100));
+    List<String> records1 = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("001", 100));
     Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

-    List<String> records2 = DataSourceTestUtils.convertToStringList(
-        dataGen.generateUpdates("002", 100));
+    List<String> records2 = DataSourceTestUtils.convertToStringList(dataGen.generateUpdates("002", 100));

     Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));

     // setup the input for streaming
-    Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema())
-        .json(streamingSourcePath);
+    Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(streamingSourcePath);


     // start streaming and showing
@@ -174,16 +169,14 @@ public class HoodieJavaStreamingApp {

   /**
    * Adding data to the streaming source and showing results over time
    *
    * @param spark
    * @param fs
    * @param inputDF1
    * @param inputDF2
    * @throws Exception
    */
-  public void show(SparkSession spark,
-      FileSystem fs,
-      Dataset<Row> inputDF1,
-      Dataset<Row> inputDF2) throws Exception {
+  public void show(SparkSession spark, FileSystem fs, Dataset<Row> inputDF1, Dataset<Row> inputDF2) throws Exception {
     inputDF1.write().mode(SaveMode.Append).json(streamingSourcePath);
     // wait for spark streaming to process one microbatch
     Thread.sleep(3000);
@@ -206,20 +199,18 @@ public class HoodieJavaStreamingApp {
     hoodieROViewDF.registerTempTable("hoodie_ro");
     spark.sql("describe hoodie_ro").show();
     // all trips whose fare was greater than 2.
-    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0")
-        .show();
+    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show();

     if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
       /**
        * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
        */
       Dataset<Row> hoodieIncViewDF = spark.read().format("org.apache.hudi")
-          .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
-              DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
-          .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(),
-              commitInstantTime1) // Only changes in write 2 above
-          .load(
-              tablePath); // For incremental view, pass in the root/base path of dataset
+          .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
+          // Only changes in write 2 above
+          .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1)
+          // For incremental view, pass in the root/base path of dataset
+          .load(tablePath);

       logger.info("You will only see records from : " + commitInstantTime2);
       hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
@@ -228,33 +219,28 @@ public class HoodieJavaStreamingApp {

   /**
    * Hoodie spark streaming job
    *
    * @param streamingInput
    * @throws Exception
    */
   public void stream(Dataset<Row> streamingInput) throws Exception {

-    DataStreamWriter<Row> writer = streamingInput
-        .writeStream()
-        .format("org.apache.hudi")
-        .option("hoodie.insert.shuffle.parallelism", "2")
-        .option("hoodie.upsert.shuffle.parallelism", "2")
+    DataStreamWriter<Row> writer = streamingInput.writeStream().format("org.apache.hudi")
+        .option("hoodie.insert.shuffle.parallelism", "2").option("hoodie.upsert.shuffle.parallelism", "2")
         .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType)
         .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
         .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
         .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
-        .option(HoodieWriteConfig.TABLE_NAME, tableName)
-        .option("checkpointLocation", streamingCheckpointingPath)
+        .option(HoodieWriteConfig.TABLE_NAME, tableName).option("checkpointLocation", streamingCheckpointingPath)
         .outputMode(OutputMode.Append());

     updateHiveSyncConfig(writer);
-    writer
-        .trigger(new ProcessingTime(500))
-        .start(tablePath)
-        .awaitTermination(streamingDurationInMs);
+    writer.trigger(new ProcessingTime(500)).start(tablePath).awaitTermination(streamingDurationInMs);
   }

   /**
    * Setup configs for syncing to hive
    *
    * @param writer
    * @return
    */
@@ -268,8 +254,8 @@ public class HoodieJavaStreamingApp {
         .option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY(), hivePass)
         .option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY(), "true");
     if (useMultiPartitionKeys) {
-      writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day")
-          .option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
+      writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "year,month,day").option(
+          DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY(),
           MultiPartKeysValueExtractor.class.getCanonicalName());
     } else {
       writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "dateStr");