
Reformatting code per Google Code Style all over

commit e45679f5e2
parent 5a62480a92
Author: Vinoth Chandar
Date: 2017-11-12 22:54:56 -08:00
Committed by: vinoth chandar

254 changed files with 21580 additions and 21108 deletions
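The changes below are mechanical applications of Google Java Style: spaces around `=` in annotation attributes, imports in ASCII order, and lines over the 100-column limit wrapped with a +4 space continuation indent, splitting before the dot of a chained call. A minimal sketch of the wrapping rule (hypothetical class, not part of this commit):

public class WrapExample {

  // A chained call that would exceed 100 columns is split before the dot,
  // and each continuation line is indented four extra spaces.
  static String describe() {
    return new StringBuilder()
        .append("continuation lines indent +4")
        .append(" and the dot leads the wrapped line")
        .toString();
  }

  public static void main(String[] args) {
    System.out.println(describe());
  }
}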

View File: DataSourceTestUtils.java

@@ -18,9 +18,6 @@
 import com.uber.hoodie.common.TestRawTripPayload;
 import com.uber.hoodie.common.model.HoodieRecord;
 import org.apache.spark.api.java.JavaRDD;
-import java.io.IOException;
-import java.util.List;
-import java.util.Optional;
@@ -31,20 +28,21 @@ import java.util.stream.Collectors;
  */
 public class DataSourceTestUtils {

   public static Optional<String> convertToString(HoodieRecord record) {
     try {
       String str = ((TestRawTripPayload) record.getData()).getJsonData();
       str = "{" + str.substring(str.indexOf("\"timestamp\":"));
-      return Optional.of(str.replaceAll("}", ", \"partition\": \"" + record.getPartitionPath() + "\"}"));
+      return Optional
+          .of(str.replaceAll("}", ", \"partition\": \"" + record.getPartitionPath() + "\"}"));
     } catch (IOException e) {
       return Optional.empty();
     }
   }

   public static List<String> convertToStringList(List<HoodieRecord> records) {
     return records.stream().map(hr -> convertToString(hr))
         .filter(os -> os.isPresent())
         .map(os -> os.get())
         .collect(Collectors.toList());
   }
 }

View File: HoodieJavaApp.java

@@ -25,7 +25,7 @@ import com.uber.hoodie.HoodieDataSourceHelpers;
 import com.uber.hoodie.common.HoodieTestDataGenerator;
 import com.uber.hoodie.common.model.HoodieTableType;
 import com.uber.hoodie.config.HoodieWriteConfig;
+import java.util.List;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
@@ -35,113 +35,123 @@ import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
 import org.apache.spark.sql.SparkSession;
-import java.util.List;

 /**
  * Sample program that writes & reads hoodie datasets via the Spark datasource
  */
 public class HoodieJavaApp {

-  @Parameter(names={"--table-path", "-p"}, description = "path for Hoodie sample table")
+  @Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
   private String tablePath = "file:///tmp/hoodie/sample-table";

-  @Parameter(names={"--table-name", "-n"}, description = "table name for Hoodie sample table")
+  @Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
   private String tableName = "hoodie_test";

-  @Parameter(names={"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
+  @Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
   private String tableType = HoodieTableType.COPY_ON_WRITE.name();

   @Parameter(names = {"--help", "-h"}, help = true)
   public Boolean help = false;

   private static Logger logger = LogManager.getLogger(HoodieJavaApp.class);

   public static void main(String[] args) throws Exception {
     HoodieJavaApp cli = new HoodieJavaApp();
     JCommander cmd = new JCommander(cli, args);
     if (cli.help) {
       cmd.usage();
       System.exit(1);
     }
     cli.run();
   }

   public void run() throws Exception {
     // Spark session setup..
     SparkSession spark = SparkSession.builder()
         .appName("Hoodie Spark APP")
         .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
         .master("local[1]")
         .getOrCreate();
     JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
     FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());

     // Generator of some records to be loaded in.
     HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
     /**
      * Commit with only inserts
      */
     // Generate some input..
-    List<String> records1 = DataSourceTestUtils.convertToStringList(dataGen.generateInserts("001"/* ignore */, 100));
+    List<String> records1 = DataSourceTestUtils
+        .convertToStringList(dataGen.generateInserts("001"/* ignore */, 100));
     Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

     // Save as hoodie dataset (copy on write)
     inputDF1.write()
         .format("com.uber.hoodie") // specify the hoodie source
-        .option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this
-        .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package
-        .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert
-        .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // This is the record key
-        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into
-        .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // use to combine duplicate records in input/with disk val
+        .option("hoodie.insert.shuffle.parallelism",
+            "2") // any hoodie client config can be passed like this
+        .option("hoodie.upsert.shuffle.parallelism",
+            "2") // full list in HoodieWriteConfig & its package
+        .option(DataSourceWriteOptions.OPERATION_OPT_KEY(),
+            DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert
+        .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(),
+            "_row_key") // This is the record key
+        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(),
+            "partition") // this is the partition to place it into
+        .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(),
+            "timestamp") // use to combine duplicate records in input/with disk val
         .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries
-        .mode(SaveMode.Overwrite) // This will remove any existing data at path below, and create a new dataset if needed
+        .mode(
+            SaveMode.Overwrite) // This will remove any existing data at path below, and create a new dataset if needed
         .save(tablePath); // ultimately where the dataset will be placed
     String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
     logger.info("First commit at instant time :" + commitInstantTime1);
     /**
      * Commit that updates records
      */
-    List<String> records2 = DataSourceTestUtils.convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100));
+    List<String> records2 = DataSourceTestUtils
+        .convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100));
     Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
     inputDF2.write()
         .format("com.uber.hoodie")
         .option("hoodie.insert.shuffle.parallelism", "2")
         .option("hoodie.upsert.shuffle.parallelism", "2")
         .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
         .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
         .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
         .option(HoodieWriteConfig.TABLE_NAME, tableName)
         .mode(SaveMode.Append)
         .save(tablePath);
     String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
logger.info("Second commit at instant time :" + commitInstantTime1);
/**
* Commit that updates records
*/
List<String> records2 = DataSourceTestUtils
.convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100));
Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
inputDF2.write()
.format("com.uber.hoodie")
.option("hoodie.insert.shuffle.parallelism", "2")
.option("hoodie.upsert.shuffle.parallelism", "2")
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
.option(HoodieWriteConfig.TABLE_NAME, tableName)
.mode(SaveMode.Append)
.save(tablePath);
String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
logger.info("Second commit at instant time :" + commitInstantTime1);
     /**
      * Read & do some queries
      */
     Dataset<Row> hoodieROViewDF = spark.read()
         .format("com.uber.hoodie")
         // pass any path glob, can include hoodie & non-hoodie datasets
         .load(tablePath + "/*/*/*/*");
     hoodieROViewDF.registerTempTable("hoodie_ro");
     spark.sql("describe hoodie_ro").show();
     // all trips whose fare was greater than 2.
-    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show();
+    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0")
+        .show();

     /**
      * Consume incrementally, only changes in commit 2 above.
      */
     Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie")
-        .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
-        .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) // Only changes in write 2 above
+        .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
+            DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
+        .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(),
+            commitInstantTime1) // Only changes in write 2 above
         .load(tablePath); // For incremental view, pass in the root/base path of dataset

     logger.info("You will only see records from : " + commitInstantTime2);
     hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
   }
 }
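
For reference, the sample app above is driven entirely by the JCommander flags its @Parameter fields declare; a minimal invocation sketch (the flag values are illustrative, and the defaults shown in the diff apply when flags are omitted):

public class HoodieJavaAppLauncher {

  public static void main(String[] args) throws Exception {
    // Flags mirror the @Parameter declarations in HoodieJavaApp above;
    // COPY_ON_WRITE is also the default table type.
    HoodieJavaApp.main(new String[] {
        "--table-path", "file:///tmp/hoodie/sample-table",
        "--table-name", "hoodie_test",
        "--table-type", "COPY_ON_WRITE"
    });
  }
}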

View File

@@ -20,7 +20,6 @@ log4j.category.com.uber.hoodie.io=WARN
 log4j.category.com.uber.hoodie.common=WARN
 log4j.category.com.uber.hoodie.table.log=WARN
 log4j.category.org.apache.parquet.hadoop=WARN
-
 # A1 is set to be a ConsoleAppender.
 log4j.appender.A1=org.apache.log4j.ConsoleAppender
 # A1 uses PatternLayout.

View File: DataSourceDefaultsTest.scala

@@ -16,9 +16,9 @@
  *
  */
-import com.uber.hoodie.{DataSourceWriteOptions, OverwriteWithLatestAvroPayload, SimpleKeyGenerator}
 import com.uber.hoodie.common.util.SchemaTestUtil
 import com.uber.hoodie.exception.HoodieException
+import com.uber.hoodie.{DataSourceWriteOptions, OverwriteWithLatestAvroPayload, SimpleKeyGenerator}
 import org.apache.avro.generic.GenericRecord
 import org.apache.commons.configuration.PropertiesConfiguration
 import org.junit.Assert._
@@ -31,7 +31,7 @@ import org.scalatest.junit.AssertionsForJUnit
 class DataSourceDefaultsTest extends AssertionsForJUnit {

   val schema = SchemaTestUtil.getComplexEvolvedSchema
-  var baseRecord : GenericRecord = null
+  var baseRecord: GenericRecord = null

   @Before def initialize(): Unit = {
     baseRecord = SchemaTestUtil
@@ -39,12 +39,13 @@ class DataSourceDefaultsTest extends AssertionsForJUnit {
   }

   private def getKeyConfig(recordKeyFieldName: String, paritionPathField: String): PropertiesConfiguration = {
     val props = new PropertiesConfiguration()
     props.addProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY, recordKeyFieldName)
     props.addProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY, paritionPathField)
     props
   }

   @Test def testSimpleKeyGenerator() = {
     // top level, valid fields
     val hk1 = new SimpleKeyGenerator(getKeyConfig("field1", "name")).getKey(baseRecord)
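
The test above pins down the key-generator contract: a PropertiesConfiguration naming the record-key and partition-path fields, and getKey(...) resolving them against an Avro GenericRecord. A hedged Java sketch of the same call pattern (field names are copied from the test; the record construction is elided):

import com.uber.hoodie.DataSourceWriteOptions;
import com.uber.hoodie.SimpleKeyGenerator;
import org.apache.commons.configuration.PropertiesConfiguration;

public class KeyGeneratorSketch {

  public static void main(String[] args) {
    PropertiesConfiguration props = new PropertiesConfiguration();
    // Same two properties getKeyConfig(...) sets in the Scala test above.
    props.addProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "field1");
    props.addProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "name");
    SimpleKeyGenerator keyGen = new SimpleKeyGenerator(props);
    // keyGen.getKey(record) would then extract the HoodieKey from a GenericRecord.
  }
}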

View File: DataSourceTest.scala

@@ -20,11 +20,11 @@ import com.uber.hoodie.common.HoodieTestDataGenerator
 import com.uber.hoodie.common.util.FSUtils
 import com.uber.hoodie.config.HoodieWriteConfig
 import com.uber.hoodie.{DataSourceReadOptions, DataSourceWriteOptions, HoodieDataSourceHelpers}
-import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.fs.FileSystem
 import org.apache.spark.sql._
 import org.junit.Assert._
-import org.junit.{Before, Test}
 import org.junit.rules.TemporaryFolder
+import org.junit.{Before, Test}
 import org.scalatest.junit.AssertionsForJUnit

 import scala.collection.JavaConversions._
@@ -44,8 +44,8 @@ class DataSourceTest extends AssertionsForJUnit {
     DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> "timestamp",
     HoodieWriteConfig.TABLE_NAME -> "hoodie_test"
   )
-  var basePath : String = null
-  var fs : FileSystem = null
+  var basePath: String = null
+  var fs: FileSystem = null

   @Before def initialize() {
     spark = SparkSession.builder