[HUDI-2255] Refactor Datasource options (#3373)
Co-authored-by: Wenning Ding <wenningd@amazon.com>
This commit is contained in:
@@ -140,23 +140,23 @@ public class HoodieJavaApp {
|
||||
// full list in HoodieWriteConfig & its package
|
||||
.option("hoodie.upsert.shuffle.parallelism", "2")
|
||||
// Hoodie Table Type
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY().key(), tableType)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType)
|
||||
// insert
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
|
||||
.option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
|
||||
// This is the record key
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY().key(), "_row_key")
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
|
||||
// this is the partition to place it into
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), "partition")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
|
||||
// use to combine duplicate records in input/with disk val
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY().key(), "timestamp")
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
|
||||
// Used by hive sync and queries
|
||||
.option(HoodieWriteConfig.TABLE_NAME.key(), tableName)
|
||||
// Add Key Extractor
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY().key(),
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS().key(),
|
||||
nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
|
||||
: SimpleKeyGenerator.class.getCanonicalName())
|
||||
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY().key(), "false")
|
||||
.option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE_OPT_KEY().key(), "true")
|
||||
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false")
|
||||
.option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
|
||||
// This will remove any existing data at path below, and create a
|
||||
.mode(SaveMode.Overwrite);
|
||||
|
||||
@@ -175,16 +175,16 @@ public class HoodieJavaApp {
|
||||
Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
|
||||
writer = inputDF2.write().format("org.apache.hudi").option("hoodie.insert.shuffle.parallelism", "2")
|
||||
.option("hoodie.upsert.shuffle.parallelism", "2")
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY().key(), tableType) // Hoodie Table Type
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY().key(), "_row_key")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), "partition")
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY().key(), "timestamp")
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY().key(),
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType) // Hoodie Table Type
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS().key(),
|
||||
nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
|
||||
: SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
|
||||
.option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP.key(), "1")
|
||||
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY().key(), "false")
|
||||
.option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE_OPT_KEY().key(), "true")
|
||||
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false")
|
||||
.option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
|
||||
.option(HoodieWriteConfig.TABLE_NAME.key(), tableName).mode(SaveMode.Append);
|
||||
|
||||
updateHiveSyncConfig(writer);
|
||||
@@ -202,17 +202,17 @@ public class HoodieJavaApp {
|
||||
writer = inputDF3.write().format("org.apache.hudi").option("hoodie.insert.shuffle.parallelism", "2")
|
||||
.option("hoodie.upsert.shuffle.parallelism", "2")
|
||||
.option("hoodie.delete.shuffle.parallelism", "2")
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY().key(), tableType) // Hoodie Table Type
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY().key(), "delete")
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY().key(), "_row_key")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), "partition")
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY().key(), "_row_key")
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY().key(),
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType) // Hoodie Table Type
|
||||
.option(DataSourceWriteOptions.OPERATION().key(), "delete")
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "_row_key")
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS().key(),
|
||||
nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
|
||||
: SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
|
||||
.option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP.key(), "1")
|
||||
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY().key(), "false")
|
||||
.option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE_OPT_KEY().key(), "true")
|
||||
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false")
|
||||
.option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
|
||||
.option(HoodieWriteConfig.TABLE_NAME.key(), tableName).mode(SaveMode.Append);
|
||||
|
||||
updateHiveSyncConfig(writer);
|
||||
@@ -237,9 +237,9 @@ public class HoodieJavaApp {
|
||||
* Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
|
||||
*/
|
||||
Dataset<Row> incQueryDF = spark.read().format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
|
||||
.option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
|
||||
// Only changes in write 2 above
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY().key(), commitInstantTime1)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), commitInstantTime1)
|
||||
// For incremental view, pass in the root/base path of dataset
|
||||
.load(tablePath);
|
||||
|
||||
@@ -254,23 +254,23 @@ public class HoodieJavaApp {
|
||||
private DataFrameWriter<Row> updateHiveSyncConfig(DataFrameWriter<Row> writer) {
|
||||
if (enableHiveSync) {
|
||||
LOG.info("Enabling Hive sync to " + hiveJdbcUrl);
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY().key(), hiveTable)
|
||||
.option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY().key(), hiveDB)
|
||||
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY().key(), hiveJdbcUrl)
|
||||
.option(DataSourceWriteOptions.HIVE_USER_OPT_KEY().key(), hiveUser)
|
||||
.option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY().key(), hivePass)
|
||||
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY().key(), "true");
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_TABLE().key(), hiveTable)
|
||||
.option(DataSourceWriteOptions.HIVE_DATABASE().key(), hiveDB)
|
||||
.option(DataSourceWriteOptions.HIVE_URL().key(), hiveJdbcUrl)
|
||||
.option(DataSourceWriteOptions.HIVE_USER().key(), hiveUser)
|
||||
.option(DataSourceWriteOptions.HIVE_PASS().key(), hivePass)
|
||||
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED().key(), "true");
|
||||
if (nonPartitionedTable) {
|
||||
writer = writer
|
||||
.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY().key(),
|
||||
.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(),
|
||||
NonPartitionedExtractor.class.getCanonicalName())
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), "");
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "");
|
||||
} else if (useMultiPartitionKeys) {
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY().key(), "year,month,day").option(
|
||||
DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY().key(),
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "year,month,day").option(
|
||||
DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(),
|
||||
MultiPartKeysValueExtractor.class.getCanonicalName());
|
||||
} else {
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY().key(), "dateStr");
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "dateStr");
|
||||
}
|
||||
}
|
||||
return writer;
|
||||
|
||||
@@ -124,23 +124,23 @@ public class HoodieJavaGenerateApp {
|
||||
private DataFrameWriter<Row> updateHiveSyncConfig(DataFrameWriter<Row> writer) {
|
||||
if (enableHiveSync) {
|
||||
LOG.info("Enabling Hive sync to " + hiveJdbcUrl);
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY().key(), hiveTable)
|
||||
.option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY().key(), hiveDB)
|
||||
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY().key(), hiveJdbcUrl)
|
||||
.option(DataSourceWriteOptions.HIVE_USER_OPT_KEY().key(), hiveUser)
|
||||
.option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY().key(), hivePass)
|
||||
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY().key(), "true");
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_TABLE().key(), hiveTable)
|
||||
.option(DataSourceWriteOptions.HIVE_DATABASE().key(), hiveDB)
|
||||
.option(DataSourceWriteOptions.HIVE_URL().key(), hiveJdbcUrl)
|
||||
.option(DataSourceWriteOptions.HIVE_USER().key(), hiveUser)
|
||||
.option(DataSourceWriteOptions.HIVE_PASS().key(), hivePass)
|
||||
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED().key(), "true");
|
||||
if (nonPartitionedTable) {
|
||||
writer = writer
|
||||
.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY().key(),
|
||||
.option(DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(),
|
||||
NonPartitionedExtractor.class.getCanonicalName())
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), "");
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "");
|
||||
} else if (useMultiPartitionKeys) {
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY().key(), "year,month,day").option(
|
||||
DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY().key(),
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "year,month,day").option(
|
||||
DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(),
|
||||
MultiPartKeysValueExtractor.class.getCanonicalName());
|
||||
} else {
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY().key(), "dateStr");
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "dateStr");
|
||||
}
|
||||
}
|
||||
return writer;
|
||||
@@ -165,19 +165,19 @@ public class HoodieJavaGenerateApp {
|
||||
// full list in HoodieWriteConfig & its package
|
||||
.option("hoodie.upsert.shuffle.parallelism", "2")
|
||||
// Hoodie Table Type
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY().key(), tableType)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType)
|
||||
// insert
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
|
||||
.option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
|
||||
// This is the record key
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY().key(), "_row_key")
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
|
||||
// this is the partition to place it into
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), "partition")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
|
||||
// use to combine duplicate records in input/with disk val
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY().key(), "timestamp")
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
|
||||
// Used by hive sync and queries
|
||||
.option(HoodieWriteConfig.TABLE_NAME.key(), tableName)
|
||||
// Add Key Extractor
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY().key(),
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS().key(),
|
||||
nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
|
||||
: SimpleKeyGenerator.class.getCanonicalName())
|
||||
.mode(commitType);
|
||||
|
||||
@@ -332,9 +332,9 @@ public class HoodieJavaStreamingApp {
|
||||
* Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
|
||||
*/
|
||||
Dataset<Row> hoodieIncViewDF = spark.read().format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
|
||||
.option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
|
||||
// Only changes in write 2 above
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY().key(), commitInstantTime1)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), commitInstantTime1)
|
||||
// For incremental view, pass in the root/base path of dataset
|
||||
.load(tablePath);
|
||||
|
||||
@@ -355,14 +355,14 @@ public class HoodieJavaStreamingApp {
|
||||
DataStreamWriter<Row> writer = streamingInput.writeStream().format("org.apache.hudi")
|
||||
.option("hoodie.insert.shuffle.parallelism", "2").option("hoodie.upsert.shuffle.parallelism", "2")
|
||||
.option("hoodie.delete.shuffle.parallelism", "2")
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY().key(), operationType)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY().key(), tableType)
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY().key(), "_row_key")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), "partition")
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY().key(), "timestamp")
|
||||
.option(DataSourceWriteOptions.OPERATION().key(), operationType)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType)
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
|
||||
.option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP.key(), "1")
|
||||
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY().key(), "true")
|
||||
.option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE_OPT_KEY().key(), "true")
|
||||
.option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "true")
|
||||
.option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
|
||||
.option(HoodieWriteConfig.TABLE_NAME.key(), tableName).option("checkpointLocation", checkpointLocation)
|
||||
.outputMode(OutputMode.Append());
|
||||
|
||||
@@ -380,18 +380,18 @@ public class HoodieJavaStreamingApp {
|
||||
private DataStreamWriter<Row> updateHiveSyncConfig(DataStreamWriter<Row> writer) {
|
||||
if (enableHiveSync) {
|
||||
LOG.info("Enabling Hive sync to " + hiveJdbcUrl);
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY().key(), hiveTable)
|
||||
.option(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY().key(), hiveDB)
|
||||
.option(DataSourceWriteOptions.HIVE_URL_OPT_KEY().key(), hiveJdbcUrl)
|
||||
.option(DataSourceWriteOptions.HIVE_USER_OPT_KEY().key(), hiveUser)
|
||||
.option(DataSourceWriteOptions.HIVE_PASS_OPT_KEY().key(), hivePass)
|
||||
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED_OPT_KEY().key(), "true");
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_TABLE().key(), hiveTable)
|
||||
.option(DataSourceWriteOptions.HIVE_DATABASE().key(), hiveDB)
|
||||
.option(DataSourceWriteOptions.HIVE_URL().key(), hiveJdbcUrl)
|
||||
.option(DataSourceWriteOptions.HIVE_USER().key(), hiveUser)
|
||||
.option(DataSourceWriteOptions.HIVE_PASS().key(), hivePass)
|
||||
.option(DataSourceWriteOptions.HIVE_SYNC_ENABLED().key(), "true");
|
||||
if (useMultiPartitionKeys) {
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY().key(), "year,month,day").option(
|
||||
DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY().key(),
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "year,month,day").option(
|
||||
DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS().key(),
|
||||
MultiPartKeysValueExtractor.class.getCanonicalName());
|
||||
} else {
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY().key(), "dateStr");
|
||||
writer = writer.option(DataSourceWriteOptions.HIVE_PARTITION_FIELDS().key(), "dateStr");
|
||||
}
|
||||
}
|
||||
return writer;
|
||||
|
||||
@@ -209,18 +209,18 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
|
||||
private Map<String, String> getProps(boolean setAll, boolean setKeyGen, boolean setRecordKey, boolean setPartitionPath) {
|
||||
Map<String, String> props = new HashMap<>();
|
||||
if (setAll) {
|
||||
props.put(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY().key(), "org.apache.hudi.keygen.SimpleKeyGenerator");
|
||||
props.put(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY().key(), "_row_key");
|
||||
props.put(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), "partition");
|
||||
props.put(DataSourceWriteOptions.KEYGENERATOR_CLASS().key(), "org.apache.hudi.keygen.SimpleKeyGenerator");
|
||||
props.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key");
|
||||
props.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition");
|
||||
} else {
|
||||
if (setKeyGen) {
|
||||
props.put(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY().key(), "org.apache.hudi.keygen.SimpleKeyGenerator");
|
||||
props.put(DataSourceWriteOptions.KEYGENERATOR_CLASS().key(), "org.apache.hudi.keygen.SimpleKeyGenerator");
|
||||
}
|
||||
if (setRecordKey) {
|
||||
props.put(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY().key(), "_row_key");
|
||||
props.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key");
|
||||
}
|
||||
if (setPartitionPath) {
|
||||
props.put(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), "partition");
|
||||
props.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition");
|
||||
}
|
||||
}
|
||||
return props;
|
||||
|
||||
@@ -550,8 +550,8 @@ public class TestBootstrap extends HoodieClientTestBase {
|
||||
HoodieWriteConfig.Builder builder = getConfigBuilder(schemaStr, IndexType.BLOOM)
|
||||
.withExternalSchemaTrasformation(true);
|
||||
TypedProperties properties = new TypedProperties();
|
||||
properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY().key(), "_row_key");
|
||||
properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), "datestr");
|
||||
properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key");
|
||||
properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "datestr");
|
||||
builder = builder.withProps(properties);
|
||||
return builder;
|
||||
}
|
||||
|
||||
@@ -55,9 +55,9 @@ class TestDataSourceDefaults {
|
||||
|
||||
private def getKeyConfig(recordKeyFieldName: String, partitionPathField: String, hiveStylePartitioning: String): TypedProperties = {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, recordKeyFieldName)
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, partitionPathField)
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_STYLE_PARTITIONING_OPT_KEY.key, hiveStylePartitioning)
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, recordKeyFieldName)
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, partitionPathField)
|
||||
props.setProperty(DataSourceWriteOptions.HIVE_STYLE_PARTITIONING.key, hiveStylePartitioning)
|
||||
props
|
||||
}
|
||||
|
||||
@@ -75,7 +75,7 @@ class TestDataSourceDefaults {
|
||||
// partition path field not specified
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1")
|
||||
new SimpleKeyGenerator(props).getKey(baseRecord)
|
||||
fail("Should have errored out")
|
||||
} catch {
|
||||
@@ -86,7 +86,7 @@ class TestDataSourceDefaults {
|
||||
// partition path field not specified using Row
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1")
|
||||
val keyGen = new SimpleKeyGenerator(props)
|
||||
keyGen.getRecordKey(baseRow)
|
||||
fail("Should have errored out")
|
||||
@@ -98,7 +98,7 @@ class TestDataSourceDefaults {
|
||||
// recordkey field not specified
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key(), "partitionField")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key(), "partitionField")
|
||||
new SimpleKeyGenerator(props).getKey(baseRecord)
|
||||
fail("Should have errored out")
|
||||
} catch {
|
||||
@@ -109,7 +109,7 @@ class TestDataSourceDefaults {
|
||||
// recordkey field not specified using Row
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "partitionField")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "partitionField")
|
||||
val keyGen = new SimpleKeyGenerator(props)
|
||||
keyGen.getPartitionPath(baseRow)
|
||||
fail("Should have errored out")
|
||||
@@ -181,8 +181,8 @@ class TestDataSourceDefaults {
|
||||
try {
|
||||
baseRecord.put("field1", "")
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "name")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "name")
|
||||
new SimpleKeyGenerator(props).getKey(baseRecord)
|
||||
fail("Should have errored out")
|
||||
} catch {
|
||||
@@ -193,8 +193,8 @@ class TestDataSourceDefaults {
|
||||
// if record key is empty, throw error. Using Row
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "name")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "name")
|
||||
keyGen = new SimpleKeyGenerator(props)
|
||||
baseRow = KeyGeneratorTestUtilities.getRow(baseRecord, schema, structType)
|
||||
keyGen.getRecordKey(baseRow)
|
||||
@@ -208,8 +208,8 @@ class TestDataSourceDefaults {
|
||||
try {
|
||||
baseRecord.put("field1", null)
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "name")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "name")
|
||||
new SimpleKeyGenerator(props).getKey(baseRecord)
|
||||
fail("Should have errored out")
|
||||
} catch {
|
||||
@@ -220,8 +220,8 @@ class TestDataSourceDefaults {
|
||||
// if record key is null, throw error. Using Row
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "name")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "name")
|
||||
keyGen = new SimpleKeyGenerator(props)
|
||||
baseRow = KeyGeneratorTestUtilities.getRow(baseRecord, schema, structType)
|
||||
keyGen.getRecordKey(baseRow)
|
||||
@@ -239,8 +239,8 @@ class TestDataSourceDefaults {
|
||||
}
|
||||
|
||||
class UserDefinedKeyGenerator(props: TypedProperties) extends KeyGenerator(props) with SparkKeyGeneratorInterface {
|
||||
val recordKeyProp: String = props.getString(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key)
|
||||
val partitionPathProp: String = props.getString(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key)
|
||||
val recordKeyProp: String = props.getString(DataSourceWriteOptions.RECORDKEY_FIELD.key)
|
||||
val partitionPathProp: String = props.getString(DataSourceWriteOptions.PARTITIONPATH_FIELD.key)
|
||||
val STRUCT_NAME: String = "hoodieRowTopLevelField"
|
||||
val NAMESPACE: String = "hoodieRow"
|
||||
var converterFn: Function1[Any, Any] = _
|
||||
@@ -279,7 +279,7 @@ class TestDataSourceDefaults {
|
||||
// partition path field not specified
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1")
|
||||
new ComplexKeyGenerator(props).getKey(baseRecord)
|
||||
fail("Should have errored out")
|
||||
} catch {
|
||||
@@ -290,7 +290,7 @@ class TestDataSourceDefaults {
|
||||
// partition path field not specified using Row
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1")
|
||||
val keyGen = new ComplexKeyGenerator(props)
|
||||
keyGen.getRecordKey(baseRow)
|
||||
fail("Should have errored out")
|
||||
@@ -302,7 +302,7 @@ class TestDataSourceDefaults {
|
||||
// recordkey field not specified
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "partitionField")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "partitionField")
|
||||
new ComplexKeyGenerator(props).getKey(baseRecord)
|
||||
fail("Should have errored out")
|
||||
} catch {
|
||||
@@ -313,7 +313,7 @@ class TestDataSourceDefaults {
|
||||
// recordkey field not specified
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "partitionField")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "partitionField")
|
||||
val keyGen = new ComplexKeyGenerator(props)
|
||||
keyGen.getPartitionPath(baseRow)
|
||||
fail("Should have errored out")
|
||||
@@ -395,8 +395,8 @@ class TestDataSourceDefaults {
|
||||
baseRecord.put("name", "")
|
||||
baseRecord.put("field1", null)
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1,name")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "field1,name")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1,name")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "field1,name")
|
||||
new ComplexKeyGenerator(props).getKey(baseRecord)
|
||||
fail("Should have errored out")
|
||||
} catch {
|
||||
@@ -409,8 +409,8 @@ class TestDataSourceDefaults {
|
||||
baseRecord.put("name", "")
|
||||
baseRecord.put("field1", null)
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1,name")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "field1,name")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1,name")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "field1,name")
|
||||
keyGen = new ComplexKeyGenerator(props)
|
||||
baseRow = KeyGeneratorTestUtilities.getRow(baseRecord, schema, structType)
|
||||
keyGen.getRecordKey(baseRow)
|
||||
@@ -453,7 +453,7 @@ class TestDataSourceDefaults {
|
||||
|
||||
// top level, partition value not included
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1,name")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1,name")
|
||||
keyGen = new GlobalDeleteKeyGenerator(props)
|
||||
val hk2 = keyGen.getKey(baseRecord)
|
||||
assertEquals("field1:field1,name:name1", hk2.getRecordKey)
|
||||
@@ -487,7 +487,7 @@ class TestDataSourceDefaults {
|
||||
// recordkey field not specified
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "partitionField")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "partitionField")
|
||||
new GlobalDeleteKeyGenerator(props).getKey(baseRecord)
|
||||
fail("Should have errored out")
|
||||
} catch {
|
||||
@@ -498,7 +498,7 @@ class TestDataSourceDefaults {
|
||||
// recordkey field not specified
|
||||
try {
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "partitionField")
|
||||
props.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "partitionField")
|
||||
val keyGen = new GlobalDeleteKeyGenerator(props)
|
||||
keyGen.getRecordKey(baseRow)
|
||||
fail("Should have errored out")
|
||||
@@ -532,7 +532,7 @@ class TestDataSourceDefaults {
|
||||
baseRecord.put("name", "")
|
||||
baseRecord.put("field1", null)
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1,name")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1,name")
|
||||
new GlobalDeleteKeyGenerator(props).getKey(baseRecord)
|
||||
fail("Should have errored out")
|
||||
} catch {
|
||||
@@ -546,7 +546,7 @@ class TestDataSourceDefaults {
|
||||
baseRecord.put("field1", null)
|
||||
baseRow = KeyGeneratorTestUtilities.getRow(baseRecord, schema, structType)
|
||||
val props = new TypedProperties()
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "field1,name")
|
||||
props.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, "field1,name")
|
||||
val keyGen = new GlobalDeleteKeyGenerator(props)
|
||||
keyGen.getRecordKey(baseRow)
|
||||
fail("Should have errored out")
|
||||
|
||||
@@ -47,14 +47,14 @@ class TestHoodieFileIndex extends HoodieClientTestBase {
|
||||
val commonOpts = Map(
|
||||
"hoodie.insert.shuffle.parallelism" -> "4",
|
||||
"hoodie.upsert.shuffle.parallelism" -> "4",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY.key -> "timestamp",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp",
|
||||
HoodieWriteConfig.TABLE_NAME.key -> "hoodie_test"
|
||||
)
|
||||
|
||||
var queryOpts = Map(
|
||||
DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key -> DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL
|
||||
DataSourceReadOptions.QUERY_TYPE.key -> DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL
|
||||
)
|
||||
|
||||
@BeforeEach override def setUp() {
|
||||
@@ -75,8 +75,8 @@ class TestHoodieFileIndex extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2))
|
||||
inputDF1.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key, partitionEncode)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient)
|
||||
@@ -94,8 +94,8 @@ class TestHoodieFileIndex extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2))
|
||||
inputDF1.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key, keyGenerator)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS.key, keyGenerator)
|
||||
.option(Config.TIMESTAMP_TYPE_FIELD_PROP, TimestampType.DATE_STRING.name())
|
||||
.option(Config.TIMESTAMP_INPUT_DATE_FORMAT_PROP, "yyyy/MM/dd")
|
||||
.option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyy-MM-dd")
|
||||
@@ -115,9 +115,9 @@ class TestHoodieFileIndex extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2))
|
||||
inputDF1.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key, keyGenerator)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "partition:simple")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS.key, keyGenerator)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "partition:simple")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient)
|
||||
@@ -134,8 +134,8 @@ class TestHoodieFileIndex extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2))
|
||||
inputDF1.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key, partitionEncode)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient)
|
||||
@@ -172,12 +172,12 @@ class TestHoodieFileIndex extends HoodieClientTestBase {
|
||||
|
||||
inputDF1.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(RECORDKEY_FIELD_OPT_KEY.key, "id")
|
||||
.option(PRECOMBINE_FIELD_OPT_KEY.key, "version")
|
||||
.option(PARTITIONPATH_FIELD_OPT_KEY.key, "dt,hh")
|
||||
.option(KEYGENERATOR_CLASS_OPT_KEY.key, classOf[ComplexKeyGenerator].getName)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key, "false")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(RECORDKEY_FIELD.key, "id")
|
||||
.option(PRECOMBINE_FIELD.key, "version")
|
||||
.option(PARTITIONPATH_FIELD.key, "dt,hh")
|
||||
.option(KEYGENERATOR_CLASS.key, classOf[ComplexKeyGenerator].getName)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, "false")
|
||||
.option(HoodieMetadataConfig.METADATA_ENABLE_PROP.key, useMetaFileList)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
@@ -209,12 +209,12 @@ class TestHoodieFileIndex extends HoodieClientTestBase {
|
||||
s"2021/03/0${i % 2 + 1}", "10")).toDF("id", "name", "price", "version", "dt", "hh")
|
||||
inputDF2.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(RECORDKEY_FIELD_OPT_KEY.key, "id")
|
||||
.option(PRECOMBINE_FIELD_OPT_KEY.key, "version")
|
||||
.option(PARTITIONPATH_FIELD_OPT_KEY.key, "dt,hh")
|
||||
.option(KEYGENERATOR_CLASS_OPT_KEY.key, classOf[ComplexKeyGenerator].getName)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key, "false")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(RECORDKEY_FIELD.key, "id")
|
||||
.option(PRECOMBINE_FIELD.key, "version")
|
||||
.option(PARTITIONPATH_FIELD.key, "dt,hh")
|
||||
.option(KEYGENERATOR_CLASS.key, classOf[ComplexKeyGenerator].getName)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, "false")
|
||||
.option(HoodieMetadataConfig.METADATA_ENABLE_PROP.key(), useMetaFileList)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
|
||||
@@ -57,7 +57,7 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
val originals = HoodieWriterUtils.parametersWithWriteDefaults(Map.empty)
|
||||
val rhsKey = "hoodie.right.hand.side.key"
|
||||
val rhsVal = "hoodie.right.hand.side.val"
|
||||
val modifier = Map(OPERATION_OPT_KEY.key -> INSERT_OPERATION_OPT_VAL, TABLE_TYPE_OPT_KEY.key -> MOR_TABLE_TYPE_OPT_VAL, rhsKey -> rhsVal)
|
||||
val modifier = Map(OPERATION.key -> INSERT_OPERATION_OPT_VAL, TABLE_TYPE.key -> MOR_TABLE_TYPE_OPT_VAL, rhsKey -> rhsVal)
|
||||
val modified = HoodieWriterUtils.parametersWithWriteDefaults(modifier)
|
||||
val matcher = (k: String, v: String) => modified(k) should be(v)
|
||||
|
||||
@@ -111,7 +111,7 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
assert(tableAlreadyExistException.getMessage.contains("hoodie table with name " + hoodieFooTableName + " already exist"))
|
||||
|
||||
//on same path try append with delete operation and different("hoodie_bar_tbl") table name which should throw an exception
|
||||
val deleteTableParams = barTableParams ++ Map(OPERATION_OPT_KEY.key -> "delete")
|
||||
val deleteTableParams = barTableParams ++ Map(OPERATION.key -> "delete")
|
||||
val deleteCmdException = intercept[HoodieException](HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, deleteTableParams, dataFrame2))
|
||||
assert(deleteCmdException.getMessage.contains("hoodie table with name " + hoodieFooTableName + " already exist"))
|
||||
} finally {
|
||||
@@ -154,15 +154,15 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
//create a new table
|
||||
val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
|
||||
HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
|
||||
DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key -> DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL,
|
||||
DataSourceWriteOptions.TABLE_TYPE.key -> DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL,
|
||||
"hoodie.bulkinsert.shuffle.parallelism" -> "4",
|
||||
DataSourceWriteOptions.OPERATION_OPT_KEY.key -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.ENABLE_ROW_WRITER_OPT_KEY.key -> "true",
|
||||
DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.ENABLE_ROW_WRITER.key -> "true",
|
||||
HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key() -> String.valueOf(populateMetaFields),
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
HoodieWriteConfig.BULKINSERT_SORT_MODE.key() -> sortMode.name(),
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
|
||||
|
||||
// generate the inserts
|
||||
@@ -218,14 +218,14 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
//create a new table
|
||||
val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
|
||||
HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
|
||||
DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key -> DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL,
|
||||
DataSourceWriteOptions.TABLE_TYPE.key -> DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL,
|
||||
"hoodie.bulkinsert.shuffle.parallelism" -> "4",
|
||||
DataSourceWriteOptions.OPERATION_OPT_KEY.key -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.ENABLE_ROW_WRITER_OPT_KEY.key -> "true",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.ENABLE_ROW_WRITER.key -> "true",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
HoodieWriteConfig.BULKINSERT_SORT_MODE.key() -> BulkInsertSortMode.NONE.name(),
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
|
||||
|
||||
// generate the inserts
|
||||
@@ -256,14 +256,14 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
//create a new table
|
||||
val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
|
||||
HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
|
||||
DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key -> DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL,
|
||||
DataSourceWriteOptions.TABLE_TYPE.key -> DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL,
|
||||
"hoodie.bulkinsert.shuffle.parallelism" -> "4",
|
||||
DataSourceWriteOptions.OPERATION_OPT_KEY.key -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.ENABLE_ROW_WRITER_OPT_KEY.key -> "true",
|
||||
INSERT_DROP_DUPS_OPT_KEY.key -> "true",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.ENABLE_ROW_WRITER.key -> "true",
|
||||
INSERT_DROP_DUPS.key -> "true",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
|
||||
|
||||
// generate the inserts
|
||||
@@ -296,11 +296,11 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
|
||||
HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
|
||||
"hoodie.bulkinsert.shuffle.parallelism" -> "1",
|
||||
DataSourceWriteOptions.OPERATION_OPT_KEY.key -> DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.INSERT_DROP_DUPS_OPT_KEY.key -> "false",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.INSERT_DROP_DUPS.key -> "false",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
|
||||
|
||||
// generate the inserts
|
||||
@@ -310,7 +310,7 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
val recordsSeq = convertRowListToSeq(records)
|
||||
val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType)
|
||||
// write to Hudi
|
||||
HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams - DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY.key, df)
|
||||
HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams - DataSourceWriteOptions.PRECOMBINE_FIELD.key, df)
|
||||
|
||||
// collect all partition paths to issue read of parquet files
|
||||
val partitions = Seq(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH,
|
||||
@@ -347,11 +347,11 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
|
||||
HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
|
||||
"hoodie.bulkinsert.shuffle.parallelism" -> "4",
|
||||
DataSourceWriteOptions.OPERATION_OPT_KEY.key -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.ENABLE_ROW_WRITER_OPT_KEY.key -> "true",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.ENABLE_ROW_WRITER.key -> "true",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
|
||||
|
||||
val partitions = Seq(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH,
|
||||
@@ -409,13 +409,13 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
|
||||
HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
|
||||
HoodieWriteConfig.BASE_FILE_FORMAT.key -> baseFileFormat,
|
||||
DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key -> tableType,
|
||||
DataSourceWriteOptions.TABLE_TYPE.key -> tableType,
|
||||
HoodieWriteConfig.INSERT_PARALLELISM.key -> "4",
|
||||
DataSourceWriteOptions.OPERATION_OPT_KEY.key -> DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key() -> String.valueOf(populateMetaFields),
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> classOf[SimpleKeyGenerator].getCanonicalName)
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS.key -> classOf[SimpleKeyGenerator].getCanonicalName)
|
||||
val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
|
||||
|
||||
// generate the inserts
|
||||
@@ -494,11 +494,11 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
|
||||
HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP.key -> srcPath.toAbsolutePath.toString,
|
||||
HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
|
||||
DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key -> tableType,
|
||||
DataSourceWriteOptions.TABLE_TYPE.key -> tableType,
|
||||
HoodieBootstrapConfig.BOOTSTRAP_PARALLELISM.key -> "4",
|
||||
DataSourceWriteOptions.OPERATION_OPT_KEY.key -> DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.OPERATION.key -> DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL,
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS.key -> classOf[NonpartitionedKeyGenerator].getCanonicalName)
|
||||
val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
|
||||
|
||||
@@ -538,10 +538,10 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
|
||||
"hoodie.insert.shuffle.parallelism" -> "1",
|
||||
"hoodie.upsert.shuffle.parallelism" -> "1",
|
||||
DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key -> tableType,
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
DataSourceWriteOptions.TABLE_TYPE.key -> tableType,
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
|
||||
val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
|
||||
|
||||
// generate the inserts
|
||||
@@ -618,8 +618,8 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
val basePath = "/tmp/hoodie_test"
|
||||
val params = Map(
|
||||
"path" -> basePath,
|
||||
DataSourceWriteOptions.TABLE_NAME_OPT_KEY.key -> "test_hoodie",
|
||||
DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.TABLE_NAME.key -> "test_hoodie",
|
||||
DataSourceWriteOptions.HIVE_PARTITION_FIELDS.key -> "partition",
|
||||
DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX.key -> "true",
|
||||
DataSourceWriteOptions.HIVE_CREATE_MANAGED_TABLE.key -> "true"
|
||||
)
|
||||
@@ -645,8 +645,8 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
val basePath = "/tmp/hoodie_test"
|
||||
val params = Map(
|
||||
"path" -> basePath,
|
||||
DataSourceWriteOptions.TABLE_NAME_OPT_KEY.key -> "test_hoodie",
|
||||
DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY.key -> "partition"
|
||||
DataSourceWriteOptions.TABLE_NAME.key -> "test_hoodie",
|
||||
DataSourceWriteOptions.HIVE_PARTITION_FIELDS.key -> "partition"
|
||||
)
|
||||
val parameters = HoodieWriterUtils.parametersWithWriteDefaults(params)
|
||||
val hoodieConfig = HoodieWriterUtils.convertMapToHoodieConfig(parameters)
|
||||
@@ -687,11 +687,11 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
val bootStrapPath = java.nio.file.Files.createTempDirectory("hoodie_test_bootstrap")
|
||||
val basePath = path.toAbsolutePath.toString
|
||||
val baseBootStrapPath = bootStrapPath.toAbsolutePath.toString
|
||||
val options = Map(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key -> tableType,
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY.key -> "col3",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "keyid",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> "org.apache.hudi.keygen.NonpartitionedKeyGenerator",
|
||||
val options = Map(DataSourceWriteOptions.TABLE_TYPE.key -> tableType,
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "col3",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "keyid",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "",
|
||||
DataSourceWriteOptions.KEYGENERATOR_CLASS.key -> "org.apache.hudi.keygen.NonpartitionedKeyGenerator",
|
||||
HoodieWriteConfig.TABLE_NAME.key -> "hoodie_test")
|
||||
try {
|
||||
val df = spark.range(0, 1000).toDF("keyid")
|
||||
@@ -701,20 +701,20 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
|
||||
df.write.format("hudi")
|
||||
.options(options)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, "insert")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, "insert")
|
||||
.option("hoodie.insert.shuffle.parallelism", "4")
|
||||
.mode(SaveMode.Overwrite).save(basePath)
|
||||
|
||||
df.write.format("hudi")
|
||||
.options(options)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, "insert_overwrite_table")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, "insert_overwrite_table")
|
||||
.option("hoodie.insert.shuffle.parallelism", "4")
|
||||
.mode(SaveMode.Append).save(basePath)
|
||||
|
||||
val currentCommits = spark.read.format("hudi").load(basePath).select("_hoodie_commit_time").take(1).map(_.getString(0))
|
||||
val incrementalKeyIdNum = spark.read.format("hudi").option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "0000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, currentCommits(0))
|
||||
val incrementalKeyIdNum = spark.read.format("hudi").option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "0000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, currentCommits(0))
|
||||
.load(basePath).select("keyid").orderBy("keyid").count
|
||||
assert(incrementalKeyIdNum == 1000)
|
||||
|
||||
@@ -725,20 +725,20 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
.options(options)
|
||||
.option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP.key, baseBootStrapPath)
|
||||
.option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS.key, classOf[NonpartitionedKeyGenerator].getCanonicalName)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
|
||||
.option(HoodieBootstrapConfig.BOOTSTRAP_PARALLELISM.key, "4")
|
||||
.mode(SaveMode.Overwrite).save(basePath)
|
||||
|
||||
df.write.format("hudi")
|
||||
.options(options)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, "insert_overwrite_table")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, "insert_overwrite_table")
|
||||
.option("hoodie.insert.shuffle.parallelism", "4")
|
||||
.mode(SaveMode.Append).save(basePath)
|
||||
|
||||
val currentCommitsBootstrap = spark.read.format("hudi").load(basePath).select("_hoodie_commit_time").take(1).map(_.getString(0))
|
||||
val incrementalKeyIdNumBootstrap = spark.read.format("hudi").option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "0000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, currentCommitsBootstrap(0))
|
||||
val incrementalKeyIdNumBootstrap = spark.read.format("hudi").option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "0000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, currentCommitsBootstrap(0))
|
||||
.load(basePath).select("keyid").orderBy("keyid").count
|
||||
assert(incrementalKeyIdNumBootstrap == 1000)
|
||||
} finally {
|
||||
@@ -761,12 +761,12 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
.withColumn("age", expr("keyid + 1000"))
|
||||
|
||||
df.write.format("hudi")
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, tableType)
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY.key, "col3")
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "keyid")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "")
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key, "org.apache.hudi.keygen.NonpartitionedKeyGenerator")
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, "insert")
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, tableType)
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD.key, "col3")
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD.key, "keyid")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "")
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS.key, "org.apache.hudi.keygen.NonpartitionedKeyGenerator")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, "insert")
|
||||
.option("hoodie.insert.shuffle.parallelism", "1")
|
||||
.option("hoodie.metadata.enable", "true")
|
||||
.option(HoodieWriteConfig.TABLE_NAME.key, "hoodie_test")
|
||||
@@ -776,12 +776,12 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
||||
.withColumn("col3", expr("keyid"))
|
||||
.withColumn("age", expr("keyid + 2000"))
|
||||
df_update.write.format("hudi")
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, tableType)
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY.key, "col3")
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key, "keyid")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "")
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key, "org.apache.hudi.keygen.NonpartitionedKeyGenerator")
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, "upsert")
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, tableType)
|
||||
.option(DataSourceWriteOptions.PRECOMBINE_FIELD.key, "col3")
|
||||
.option(DataSourceWriteOptions.RECORDKEY_FIELD.key, "keyid")
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "")
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS.key, "org.apache.hudi.keygen.NonpartitionedKeyGenerator")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, "upsert")
|
||||
.option("hoodie.upsert.shuffle.parallelism", "1")
|
||||
.option("hoodie.metadata.enable", "true")
|
||||
.option(HoodieWriteConfig.TABLE_NAME.key, "hoodie_test")
|
||||
|
||||
@@ -55,9 +55,9 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
"hoodie.upsert.shuffle.parallelism" -> "4",
|
||||
"hoodie.bulkinsert.shuffle.parallelism" -> "2",
|
||||
"hoodie.delete.shuffle.parallelism" -> "1",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY.key -> "timestamp",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp",
|
||||
HoodieWriteConfig.TABLE_NAME.key -> "hoodie_test"
|
||||
)
|
||||
|
||||
@@ -84,7 +84,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF = spark.read.json(spark.sparkContext.parallelize(records, 2))
|
||||
inputDF.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
|
||||
@@ -112,7 +112,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
.option("hoodie.keep.max.commits", "2")
|
||||
.option("hoodie.cleaner.commits.retained", "0")
|
||||
.option("hoodie.datasource.write.row.writer.enable", "true")
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL)
|
||||
.mode(if (i == 0) SaveMode.Overwrite else SaveMode.Append)
|
||||
.save(basePath)
|
||||
}
|
||||
@@ -149,7 +149,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
|
||||
inputDF2.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -167,7 +167,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(HoodieMetadataConfig.METADATA_ENABLE_PROP.key, isMetadataEnabled)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
@@ -222,9 +222,9 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
// we have 2 commits, try pulling the first commit (which is not the latest)
|
||||
val firstCommit = HoodieDataSourceHelpers.listCommitsSince(fs, basePath, "000").get(0)
|
||||
val hoodieIncViewDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, firstCommit)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, firstCommit)
|
||||
.load(basePath)
|
||||
assertEquals(100, hoodieIncViewDF1.count()) // 100 initial inserts must be pulled
|
||||
var countsPerCommit = hoodieIncViewDF1.groupBy("_hoodie_commit_time").count().collect()
|
||||
@@ -233,9 +233,9 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
|
||||
// Test incremental query has no instant in range
|
||||
val emptyIncDF = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, "001")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, "001")
|
||||
.load(basePath)
|
||||
assertEquals(0, emptyIncDF.count())
|
||||
|
||||
@@ -250,8 +250,8 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
|
||||
// pull the latest commit
|
||||
val hoodieIncViewDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commitInstantTime2)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commitInstantTime2)
|
||||
.load(basePath)
|
||||
|
||||
assertEquals(uniqueKeyCnt, hoodieIncViewDF2.count()) // 100 records must be pulled
|
||||
@@ -261,16 +261,16 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
|
||||
// pull the latest commit within certain partitions
|
||||
val hoodieIncViewDF3 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commitInstantTime2)
|
||||
.option(DataSourceReadOptions.INCR_PATH_GLOB_OPT_KEY.key, "/2016/*/*/*")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commitInstantTime2)
|
||||
.option(DataSourceReadOptions.INCR_PATH_GLOB.key, "/2016/*/*/*")
|
||||
.load(basePath)
|
||||
assertEquals(hoodieIncViewDF2.filter(col("_hoodie_partition_path").contains("2016")).count(), hoodieIncViewDF3.count())
|
||||
|
||||
val timeTravelDF = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, firstCommit)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, firstCommit)
|
||||
.load(basePath)
|
||||
assertEquals(100, timeTravelDF.count()) // 100 initial inserts must be pulled
|
||||
}
|
||||
@@ -280,7 +280,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -288,7 +288,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
|
||||
inputDF2.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OVERWRITE_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OVERWRITE_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -306,7 +306,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -314,7 +314,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
|
||||
inputDF2.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
|
||||
@@ -333,7 +333,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -342,7 +342,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
|
||||
inputDF2.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -351,7 +351,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF3 = spark.read.json(spark.sparkContext.parallelize(records3, 2))
|
||||
inputDF3.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OVERWRITE_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OVERWRITE_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -389,7 +389,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -398,7 +398,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
|
||||
inputDF2.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
|
||||
@@ -448,7 +448,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
val hoodieROViewDF1 = spark.read.format("org.apache.hudi")
|
||||
@@ -460,7 +460,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
|
||||
inputDF2.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.INSERT_DROP_DUPS_OPT_KEY.key, "true")
|
||||
.option(DataSourceWriteOptions.INSERT_DROP_DUPS.key, "true")
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
val hoodieROViewDF2 = spark.read.format("org.apache.hudi")
|
||||
@@ -468,8 +468,8 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
assertEquals(hoodieROViewDF2.count(), totalUniqueKeyToGenerate)
|
||||
|
||||
val hoodieIncViewDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commitInstantTime1)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commitInstantTime1)
|
||||
.load(basePath)
|
||||
assertEquals(hoodieIncViewDF2.count(), insert2NewKeyCnt)
|
||||
}
|
||||
@@ -511,7 +511,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(HoodieWriteConfig.HOODIE_AUTO_COMMIT_PROP.key, "true")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
@@ -525,7 +525,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
|
||||
inputDF.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key, keyGenerator)
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS.key, keyGenerator)
|
||||
.mode(SaveMode.Overwrite)
|
||||
}
|
||||
|
||||
@@ -670,13 +670,13 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
@CsvSource(Array("true,false", "true,true", "false,true", "false,false"))
|
||||
def testQueryCOWWithBasePathAndFileIndex(partitionEncode: Boolean, isMetadataEnabled: Boolean): Unit = {
|
||||
val N = 20
|
||||
// Test query with partition prune if URL_ENCODE_PARTITIONING_OPT_KEY has enable
|
||||
// Test query with partition pruning when URL_ENCODE_PARTITIONING is enabled
|
||||
val records1 = dataGen.generateInsertsContainsAllPartitions("000", N)
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2))
|
||||
inputDF1.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key, partitionEncode)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode)
|
||||
.option(HoodieMetadataConfig.METADATA_ENABLE_PROP.key, isMetadataEnabled)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
@@ -704,15 +704,15 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
val inputDF2 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records2), 2))
|
||||
inputDF2.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key, partitionEncode)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode)
|
||||
.option(HoodieMetadataConfig.METADATA_ENABLE_PROP.key, isMetadataEnabled)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
// Incremental query without "*" in path
|
||||
val hoodieIncViewDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commitInstantTime1)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commitInstantTime1)
|
||||
.load(basePath)
|
||||
assertEquals(N + 1, hoodieIncViewDF1.count())
|
||||
}
|
||||
|
||||
@@ -47,9 +47,9 @@ class TestDataSourceForBootstrap {
|
||||
HoodieWriteConfig.BULKINSERT_PARALLELISM.key -> "4",
|
||||
HoodieWriteConfig.FINALIZE_WRITE_PARALLELISM.key -> "4",
|
||||
HoodieBootstrapConfig.BOOTSTRAP_PARALLELISM.key -> "4",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY.key -> "timestamp",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp",
|
||||
HoodieWriteConfig.TABLE_NAME.key -> "hoodie_test"
|
||||
)
|
||||
var basePath: String = _
|
||||
@@ -117,9 +117,9 @@ class TestDataSourceForBootstrap {
|
||||
updateDF.write
|
||||
.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key, "org.apache.hudi.keygen.NonpartitionedKeyGenerator")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.KEYGENERATOR_CLASS.key, "org.apache.hudi.keygen.NonpartitionedKeyGenerator")
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -168,11 +168,11 @@ class TestDataSourceForBootstrap {
|
||||
updateDF.write
|
||||
.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "datestr")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "datestr")
|
||||
// Required because source data is hive style partitioned
|
||||
.option(DataSourceWriteOptions.HIVE_STYLE_PARTITIONING_OPT_KEY.key, "true")
|
||||
.option(DataSourceWriteOptions.HIVE_STYLE_PARTITIONING.key, "true")
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -221,9 +221,9 @@ class TestDataSourceForBootstrap {
|
||||
updateDf1.write
|
||||
.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "datestr")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "datestr")
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -241,9 +241,9 @@ class TestDataSourceForBootstrap {
|
||||
updateDF2.write
|
||||
.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "datestr")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "datestr")
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -282,7 +282,7 @@ class TestDataSourceForBootstrap {
|
||||
|
||||
// Read bootstrapped table and verify count
|
||||
val hoodieROViewDF1 = spark.read.format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key,
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key,
|
||||
DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.load(basePath + "/*")
|
||||
assertEquals(numRecords, hoodieROViewDF1.count())
|
||||
@@ -295,9 +295,9 @@ class TestDataSourceForBootstrap {
|
||||
updateDF.write
|
||||
.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "datestr")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "datestr")
|
||||
.option(HoodieCompactionConfig.INLINE_COMPACT_PROP.key, "true")
|
||||
.option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS_PROP.key, "1")
|
||||
.mode(SaveMode.Append)
|
||||
@@ -309,14 +309,14 @@ class TestDataSourceForBootstrap {
|
||||
// Read table after upsert and verify count. Since we have inline compaction enabled the RO view will have
|
||||
// the updated rows.
|
||||
val hoodieROViewDF2 = spark.read.format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key,
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key,
|
||||
DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.load(basePath + "/*")
|
||||
assertEquals(numRecords, hoodieROViewDF2.count())
|
||||
assertEquals(numRecordsUpdate, hoodieROViewDF2.filter(s"timestamp == $updateTimestamp").count())
|
||||
// Test query without "*" for MOR READ_OPTIMIZED
|
||||
val hoodieROViewDFWithBasePath = spark.read.format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key,
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key,
|
||||
DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.load(basePath)
|
||||
assertEquals(numRecords, hoodieROViewDFWithBasePath.count())
|
||||
@@ -347,13 +347,13 @@ class TestDataSourceForBootstrap {
|
||||
|
||||
// Read bootstrapped table and verify count
|
||||
val hoodieROViewDF1 = spark.read.format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key,
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key,
|
||||
DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.load(basePath + "/*")
|
||||
assertEquals(numRecords, hoodieROViewDF1.count())
|
||||
// Read bootstrapped table without "*"
|
||||
val hoodieROViewDFWithBasePath = spark.read.format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key,
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key,
|
||||
DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.load(basePath)
|
||||
assertEquals(numRecords, hoodieROViewDFWithBasePath.count())
|
||||
@@ -363,16 +363,16 @@ class TestDataSourceForBootstrap {
|
||||
updateDf1.write
|
||||
.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "datestr")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "datestr")
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
// Read table after upsert and verify the value
|
||||
assertEquals(1, HoodieDataSourceHelpers.listCommitsSince(fs, basePath, commitInstantTime1).size())
|
||||
val hoodieROViewDF2 = spark.read.format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key,
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key,
|
||||
DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.load(basePath + "/*")
|
||||
hoodieROViewDF2.collect()
|
||||
@@ -386,9 +386,9 @@ class TestDataSourceForBootstrap {
|
||||
updateDF2.write
|
||||
.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "datestr")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "datestr")
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -398,7 +398,7 @@ class TestDataSourceForBootstrap {
|
||||
// Read table after upsert and verify count. Since we have inline compaction off the RO view will have
|
||||
// no updated rows.
|
||||
val hoodieROViewDF3 = spark.read.format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key,
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key,
|
||||
DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.load(basePath + "/*")
|
||||
assertEquals(numRecords, hoodieROViewDF3.count())
|
||||
@@ -428,8 +428,8 @@ class TestDataSourceForBootstrap {
|
||||
bootstrapDF.write
|
||||
.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "datestr")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "datestr")
|
||||
.option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP.key, srcPath)
|
||||
.option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS.key, classOf[SimpleKeyGenerator].getName)
|
||||
.option(HoodieBootstrapConfig.BOOTSTRAP_MODE_SELECTOR.key, classOf[FullRecordBootstrapModeSelector].getName)
|
||||
@@ -455,9 +455,9 @@ class TestDataSourceForBootstrap {
|
||||
updateDF.write
|
||||
.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, "datestr")
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, "datestr")
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
@@ -478,9 +478,9 @@ class TestDataSourceForBootstrap {
|
||||
bootstrapDF.write
|
||||
.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, tableType)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key, partitionColumns.getOrElse(""))
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.BOOTSTRAP_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, tableType)
|
||||
.option(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, partitionColumns.getOrElse(""))
|
||||
.option(HoodieBootstrapConfig.BOOTSTRAP_BASE_PATH_PROP.key, srcPath)
|
||||
.option(HoodieBootstrapConfig.BOOTSTRAP_KEYGEN_CLASS.key, classOf[SimpleKeyGenerator].getName)
|
||||
.mode(SaveMode.Overwrite)
|
||||
@@ -496,9 +496,9 @@ class TestDataSourceForBootstrap {
|
||||
// incrementally pull only changes in the bootstrap commit, which would pull all the initial records written
|
||||
// during bootstrap
|
||||
val hoodieIncViewDF1 = spark.read.format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, bootstrapCommitInstantTime)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, bootstrapCommitInstantTime)
|
||||
.load(basePath)
|
||||
|
||||
assertEquals(numRecords, hoodieIncViewDF1.count())
|
||||
@@ -509,8 +509,8 @@ class TestDataSourceForBootstrap {
|
||||
// incrementally pull only changes after bootstrap commit, which would pull only the updated records in the
|
||||
// later commits
|
||||
val hoodieIncViewDF2 = spark.read.format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, bootstrapCommitInstantTime)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, bootstrapCommitInstantTime)
|
||||
.load(basePath);
|
||||
|
||||
assertEquals(numRecordsUpdate, hoodieIncViewDF2.count())
|
||||
@@ -522,9 +522,9 @@ class TestDataSourceForBootstrap {
|
||||
val relativePartitionPath = if (isHiveStylePartitioned) "/datestr=2020-04-02/*" else "/2020-04-02/*"
|
||||
// pull the update commits within certain partitions
|
||||
val hoodieIncViewDF3 = spark.read.format("hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, bootstrapCommitInstantTime)
|
||||
.option(DataSourceReadOptions.INCR_PATH_GLOB_OPT_KEY.key, relativePartitionPath)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, bootstrapCommitInstantTime)
|
||||
.option(DataSourceReadOptions.INCR_PATH_GLOB.key, relativePartitionPath)
|
||||
.load(basePath)
|
||||
|
||||
assertEquals(hoodieIncViewDF2.filter(col("_hoodie_partition_path").contains("2020-04-02")).count(),
|
||||
|
||||
@@ -31,9 +31,9 @@ class TestEmptyCommit extends HoodieClientTestBase {
|
||||
val commonOpts = Map(
|
||||
"hoodie.insert.shuffle.parallelism" -> "4",
|
||||
"hoodie.upsert.shuffle.parallelism" -> "4",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY.key -> "timestamp",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp",
|
||||
HoodieWriteConfig.TABLE_NAME.key -> "hoodie_test"
|
||||
)
|
||||
|
||||
@@ -57,7 +57,7 @@ class TestEmptyCommit extends HoodieClientTestBase {
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(Seq.empty[String], 1))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(HoodieWriteConfig.ALLOW_EMPTY_COMMIT.key(), allowEmptyCommit.toString)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
|
||||
@@ -20,7 +20,7 @@ package org.apache.hudi.functional
|
||||
import org.apache.hadoop.fs.Path
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import org.apache.hudi.DataSourceWriteOptions.{KEYGENERATOR_CLASS_OPT_KEY, PARTITIONPATH_FIELD_OPT_KEY, PAYLOAD_CLASS_OPT_KEY, PRECOMBINE_FIELD_OPT_KEY, RECORDKEY_FIELD_OPT_KEY}
|
||||
import org.apache.hudi.DataSourceWriteOptions.{KEYGENERATOR_CLASS, PARTITIONPATH_FIELD, PAYLOAD_CLASS, PRECOMBINE_FIELD, RECORDKEY_FIELD}
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig
|
||||
import org.apache.hudi.common.fs.FSUtils
|
||||
import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieTableType}
|
||||
@@ -53,9 +53,9 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
val commonOpts = Map(
|
||||
"hoodie.insert.shuffle.parallelism" -> "4",
|
||||
"hoodie.upsert.shuffle.parallelism" -> "4",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY.key -> "timestamp",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp",
|
||||
HoodieWriteConfig.TABLE_NAME.key -> "hoodie_test"
|
||||
)
|
||||
|
||||
@@ -87,8 +87,8 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(HoodieMetadataConfig.METADATA_ENABLE_PROP.key, isMetadataEnabled)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
@@ -97,7 +97,7 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
|
||||
// Read RO View
|
||||
val hudiRODF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.option(HoodieMetadataConfig.METADATA_ENABLE_PROP.key, isMetadataEnabled)
|
||||
.load(basePath + "/*/*/*")
|
||||
|
||||
@@ -118,7 +118,7 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
// Read Snapshot query
|
||||
val updateCommitTime = HoodieDataSourceHelpers.latestCommit(fs, basePath)
|
||||
val hudiSnapshotDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(HoodieMetadataConfig.METADATA_ENABLE_PROP.key, isMetadataEnabled)
|
||||
.load(basePath + "/*/*/*")
|
||||
|
||||
@@ -151,13 +151,13 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000"))
|
||||
val hudiSnapshotDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(100, hudiSnapshotDF1.count()) // still 100, since we only updated
|
||||
|
||||
@@ -171,7 +171,7 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
val hudiSnapshotDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(100, hudiSnapshotDF2.count()) // still 100, since we only updated
|
||||
val commit1Time = hudiSnapshotDF1.select("_hoodie_commit_time").head().get(0).toString
|
||||
@@ -183,9 +183,9 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
// incremental view
|
||||
// base file only
|
||||
val hudiIncDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, commit1Time)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, commit1Time)
|
||||
.load(basePath)
|
||||
assertEquals(100, hudiIncDF1.count())
|
||||
assertEquals(1, hudiIncDF1.select("_hoodie_commit_time").distinct().count())
|
||||
@@ -193,9 +193,9 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
hudiIncDF1.show(1)
|
||||
// log file only
|
||||
val hudiIncDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commit1Time)
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, commit2Time)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit1Time)
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, commit2Time)
|
||||
.load(basePath)
|
||||
assertEquals(100, hudiIncDF2.count())
|
||||
assertEquals(1, hudiIncDF2.select("_hoodie_commit_time").distinct().count())
|
||||
@@ -204,9 +204,9 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
|
||||
// base file + log file
|
||||
val hudiIncDF3 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, commit2Time)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, commit2Time)
|
||||
.load(basePath)
|
||||
assertEquals(100, hudiIncDF3.count())
|
||||
// log file being loaded
|
||||
@@ -215,16 +215,16 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
|
||||
// Test incremental query has no instant in range
|
||||
val emptyIncDF = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, "001")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, "001")
|
||||
.load(basePath)
|
||||
assertEquals(0, emptyIncDF.count())
|
||||
|
||||
// Unmerge
|
||||
val hudiSnapshotSkipMergeDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.REALTIME_MERGE_OPT_KEY.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(200, hudiSnapshotSkipMergeDF2.count())
|
||||
assertEquals(100, hudiSnapshotSkipMergeDF2.select("_hoodie_record_key").distinct().count())
|
||||
@@ -232,7 +232,7 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
|
||||
// Test Read Optimized Query on MOR table
|
||||
val hudiRODF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(100, hudiRODF2.count())
|
||||
|
||||
@@ -246,7 +246,7 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
val hudiSnapshotDF3 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
// still 100, because we only updated the existing records
|
||||
assertEquals(100, hudiSnapshotDF3.count())
|
||||
@@ -259,17 +259,17 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
|
||||
// incremental query from commit2Time
|
||||
val hudiIncDF4 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commit2Time)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit2Time)
|
||||
.load(basePath)
|
||||
assertEquals(50, hudiIncDF4.count())
|
||||
|
||||
// skip merge incremental view
|
||||
// including commit 2 and commit 3
|
||||
val hudiIncDF4SkipMerge = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.REALTIME_MERGE_OPT_KEY.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
|
||||
.load(basePath)
|
||||
assertEquals(200, hudiIncDF4SkipMerge.count())
|
||||
|
||||
@@ -286,7 +286,7 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
val hudiSnapshotDF4 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
// 200, because we insert 100 records to a new partition
|
||||
assertEquals(200, hudiSnapshotDF4.count())
|
||||
@@ -295,8 +295,8 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
|
||||
// Incremental query, 50 from log file, 100 from base file of the new partition.
|
||||
val hudiIncDF5 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commit2Time)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit2Time)
|
||||
.load(basePath)
|
||||
assertEquals(150, hudiIncDF5.count())
|
||||
|
||||
@@ -312,7 +312,7 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
.save(basePath)
|
||||
val commit5Time = HoodieDataSourceHelpers.latestCommit(fs, basePath)
|
||||
val hudiSnapshotDF5 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(200, hudiSnapshotDF5.count())
|
||||
|
||||
@@ -327,13 +327,13 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
.save(basePath)
|
||||
val commit6Time = HoodieDataSourceHelpers.latestCommit(fs, basePath)
|
||||
val hudiSnapshotDF6 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/2020/01/10/*")
|
||||
assertEquals(102, hudiSnapshotDF6.count())
|
||||
val hudiIncDF6 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commit5Time)
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, commit6Time)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit5Time)
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, commit6Time)
|
||||
.load(basePath)
|
||||
// compaction updated 150 rows + inserted 2 new rows
|
||||
assertEquals(152, hudiIncDF6.count())
|
||||
@@ -349,13 +349,13 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
assertTrue(HoodieDataSourceHelpers.hasNewCommits(fs, basePath, "000"))
|
||||
val hudiSnapshotDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(100, hudiSnapshotDF1.count()) // still 100, since we only updated
|
||||
|
||||
@@ -369,7 +369,7 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
val hudiSnapshotDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(50, hudiSnapshotDF2.count()) // 50 records were deleted
|
||||
assertEquals(hudiSnapshotDF2.select("_hoodie_commit_time").distinct().count(), 1)
|
||||
@@ -380,15 +380,15 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
|
||||
// unmerge query, skip the delete records
|
||||
val hudiSnapshotDF2Unmerge = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.REALTIME_MERGE_OPT_KEY.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(100, hudiSnapshotDF2Unmerge.count())
|
||||
|
||||
// incremental query, read 50 delete records from log file and get 0 count.
|
||||
val hudiIncDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commit2Time)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit2Time)
|
||||
.load(basePath)
|
||||
assertEquals(0, hudiIncDF1.count())
|
||||
|
||||
@@ -402,7 +402,7 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
val hudiSnapshotDF3 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(0, hudiSnapshotDF3.count()) // 100 records were deleted, 0 record to load
|
||||
}
|
||||
@@ -417,12 +417,12 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
val hudiSnapshotDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
val commit1Time = hudiSnapshotDF1.select("_hoodie_commit_time").head().get(0).toString
|
||||
|
||||
@@ -445,20 +445,20 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
val hudiSnapshotDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
val hudiIncDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.load(basePath)
|
||||
val hudiIncDF1Skipmerge = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.REALTIME_MERGE_OPT_KEY.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.REALTIME_MERGE.key, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.load(basePath)
|
||||
val hudiIncDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commit1Time)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commit1Time)
|
||||
.load(basePath)
|
||||
|
||||
// filter first commit and only read log records
|
||||
@@ -498,12 +498,12 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option("hoodie.compact.inline", "false") // else fails due to compaction & deltacommit instant times being same
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
val hudiSnapshotDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(100, hudiSnapshotDF1.count())
|
||||
|
||||
@@ -515,7 +515,7 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
val hudiSnapshotDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL)
|
||||
.load(basePath + "/*/*/*/*")
|
||||
assertEquals(100, hudiSnapshotDF2.count())
|
||||
|
||||
@@ -553,12 +553,12 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
df.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
// use DefaultHoodieRecordPayload here
|
||||
.option(PAYLOAD_CLASS_OPT_KEY.key, classOf[DefaultHoodieRecordPayload].getCanonicalName)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(RECORDKEY_FIELD_OPT_KEY.key, "id")
|
||||
.option(PRECOMBINE_FIELD_OPT_KEY.key, "version")
|
||||
.option(PARTITIONPATH_FIELD_OPT_KEY.key, "")
|
||||
.option(KEYGENERATOR_CLASS_OPT_KEY.key, classOf[NonpartitionedKeyGenerator].getName)
|
||||
.option(PAYLOAD_CLASS.key, classOf[DefaultHoodieRecordPayload].getCanonicalName)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(RECORDKEY_FIELD.key, "id")
|
||||
.option(PRECOMBINE_FIELD.key, "version")
|
||||
.option(PARTITIONPATH_FIELD.key, "")
|
||||
.option(KEYGENERATOR_CLASS.key, classOf[NonpartitionedKeyGenerator].getName)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
}
|
||||
@@ -595,14 +595,14 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
@CsvSource(Array("true,false", "true,true", "false,true", "false,false"))
|
||||
def testQueryMORWithBasePathAndFileIndex(partitionEncode: Boolean, isMetadataEnabled: Boolean): Unit = {
|
||||
val N = 20
|
||||
// Test query with partition prune if URL_ENCODE_PARTITIONING_OPT_KEY has enable
|
||||
// Test query with partition pruning when URL_ENCODE_PARTITIONING is enabled
|
||||
val records1 = dataGen.generateInsertsContainsAllPartitions("000", N)
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2))
|
||||
inputDF1.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key, partitionEncode)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode)
|
||||
.option(HoodieMetadataConfig.METADATA_ENABLE_PROP.key, isMetadataEnabled)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
@@ -630,16 +630,16 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
val inputDF2 = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records2), 2))
|
||||
inputDF2.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key, partitionEncode)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode)
|
||||
.option(HoodieMetadataConfig.METADATA_ENABLE_PROP.key, isMetadataEnabled)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
// Incremental query without "*" in path
|
||||
val hoodieIncViewDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commitInstantTime1)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commitInstantTime1)
|
||||
.load(basePath)
|
||||
assertEquals(N + 1, hoodieIncViewDF1.count())
|
||||
}
|
||||
@@ -656,10 +656,10 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
|
||||
inputDF1.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING_OPT_KEY.key, partitionEncode)
|
||||
.option(DataSourceWriteOptions.HIVE_STYLE_PARTITIONING_OPT_KEY.key, hiveStylePartition)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.URL_ENCODE_PARTITIONING.key, partitionEncode)
|
||||
.option(DataSourceWriteOptions.HIVE_STYLE_PARTITIONING.key, hiveStylePartition)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
|
||||
@@ -714,8 +714,8 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
val inputDF = spark.read.json(spark.sparkContext.parallelize(recordsToStrings(records1), 2))
|
||||
inputDF.write.format("hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key, DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
// Use InMemoryIndex to generate log only mor table.
|
||||
.option(HoodieIndexConfig.INDEX_TYPE_PROP.key, IndexType.INMEMORY.toString)
|
||||
.mode(SaveMode.Overwrite)
|
||||
@@ -732,8 +732,8 @@ class TestMORDataSource extends HoodieClientTestBase {
|
||||
val inputDF1: Dataset[Row] = spark.read.json(spark.sparkContext.parallelize(records1, 2))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY.key(), DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key(), DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL)
|
||||
.option(DataSourceWriteOptions.TABLE_TYPE.key(), DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
|
||||
// option for clustering
|
||||
.option("hoodie.parquet.small.file.limit", "0")
|
||||
.option("hoodie.clustering.inline", "true")
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
package org.apache.hudi.functional
|
||||
|
||||
import org.apache.hudi.DataSourceWriteOptions
|
||||
import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD_OPT_KEY, RECORDKEY_FIELD_OPT_KEY}
|
||||
import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD, RECORDKEY_FIELD}
|
||||
import org.apache.hudi.common.model.HoodieTableType.{COPY_ON_WRITE, MERGE_ON_READ}
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.config.HoodieWriteConfig.{DELETE_PARALLELISM, INSERT_PARALLELISM, TABLE_NAME, UPSERT_PARALLELISM}
|
||||
@@ -29,8 +29,8 @@ class TestStreamingSource extends StreamTest {
|
||||
|
||||
import testImplicits._
|
||||
private val commonOptions = Map(
|
||||
RECORDKEY_FIELD_OPT_KEY.key -> "id",
|
||||
PRECOMBINE_FIELD_OPT_KEY.key -> "ts",
|
||||
RECORDKEY_FIELD.key -> "id",
|
||||
PRECOMBINE_FIELD.key -> "ts",
|
||||
INSERT_PARALLELISM.key -> "4",
|
||||
UPSERT_PARALLELISM.key -> "4",
|
||||
DELETE_PARALLELISM.key -> "4"
|
||||
@@ -47,7 +47,7 @@ class TestStreamingSource extends StreamTest {
|
||||
HoodieTableMetaClient.withPropertyBuilder()
|
||||
.setTableType(COPY_ON_WRITE)
|
||||
.setTableName(getTableName(tablePath))
|
||||
.setPayloadClassName(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY.defaultValue)
|
||||
.setPayloadClassName(DataSourceWriteOptions.PAYLOAD_CLASS.defaultValue)
|
||||
.initTable(spark.sessionState.newHadoopConf(), tablePath)
|
||||
|
||||
addData(tablePath, Seq(("1", "a1", "10", "000")))
|
||||
@@ -97,7 +97,7 @@ class TestStreamingSource extends StreamTest {
|
||||
HoodieTableMetaClient.withPropertyBuilder()
|
||||
.setTableType(MERGE_ON_READ)
|
||||
.setTableName(getTableName(tablePath))
|
||||
.setPayloadClassName(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY.defaultValue)
|
||||
.setPayloadClassName(DataSourceWriteOptions.PAYLOAD_CLASS.defaultValue)
|
||||
.initTable(spark.sessionState.newHadoopConf(), tablePath)
|
||||
|
||||
addData(tablePath, Seq(("1", "a1", "10", "000")))
|
||||
|
||||
@@ -47,9 +47,9 @@ class TestStructuredStreaming extends HoodieClientTestBase {
|
||||
val commonOpts = Map(
|
||||
"hoodie.insert.shuffle.parallelism" -> "4",
|
||||
"hoodie.upsert.shuffle.parallelism" -> "4",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY.key -> "timestamp",
|
||||
DataSourceWriteOptions.RECORDKEY_FIELD.key -> "_row_key",
|
||||
DataSourceWriteOptions.PARTITIONPATH_FIELD.key -> "partition",
|
||||
DataSourceWriteOptions.PRECOMBINE_FIELD.key -> "timestamp",
|
||||
HoodieWriteConfig.TABLE_NAME.key -> "hoodie_test"
|
||||
)
|
||||
|
||||
@@ -137,9 +137,9 @@ class TestStructuredStreaming extends HoodieClientTestBase {
|
||||
// we have 2 commits, try pulling the first commit (which is not the latest)
|
||||
val firstCommit = HoodieDataSourceHelpers.listCommitsSince(fs, destPath, "000").get(0)
|
||||
val hoodieIncViewDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, firstCommit)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, firstCommit)
|
||||
.load(destPath)
|
||||
assertEquals(100, hoodieIncViewDF1.count())
|
||||
// 100 initial inserts must be pulled
|
||||
@@ -149,8 +149,8 @@ class TestStructuredStreaming extends HoodieClientTestBase {
|
||||
|
||||
// pull the latest commit
|
||||
val hoodieIncViewDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, commitInstantTime1)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, commitInstantTime1)
|
||||
.load(destPath)
|
||||
|
||||
assertEquals(uniqueKeyCnt, hoodieIncViewDF2.count()) // 100 records must be pulled
|
||||
@@ -195,8 +195,8 @@ class TestStructuredStreaming extends HoodieClientTestBase {
|
||||
clusteringNumCommit: String, fileMaxRecordNum: Int):Map[String, String] = {
|
||||
commonOpts + (HoodieClusteringConfig.INLINE_CLUSTERING_PROP.key -> isInlineClustering,
|
||||
HoodieClusteringConfig.INLINE_CLUSTERING_MAX_COMMIT_PROP.key -> clusteringNumCommit,
|
||||
DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE_OPT_KEY.key -> isAsyncClustering,
|
||||
DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY.key -> isAsyncCompaction,
|
||||
DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE.key -> isAsyncClustering,
|
||||
DataSourceWriteOptions.ASYNC_COMPACT_ENABLE.key -> isAsyncCompaction,
|
||||
HoodieClusteringConfig.ASYNC_CLUSTERING_MAX_COMMIT_PROP.key -> clusteringNumCommit,
|
||||
HoodieStorageConfig.PARQUET_FILE_MAX_BYTES.key -> dataGen.getEstimatedFileSizeInBytes(fileMaxRecordNum).toString
|
||||
)
|
||||
|
||||
@@ -495,9 +495,9 @@ class TestMergeIntoTable extends TestHoodieSqlBase {
|
||||
)
|
||||
// Test incremental query
|
||||
val hudiIncDF1 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY.key, firstCommitTime)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, "000")
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME.key, firstCommitTime)
|
||||
.load(targetBasePath)
|
||||
hudiIncDF1.createOrReplaceTempView("inc1")
|
||||
checkAnswer(s"select id, name, price, _ts from inc1")(
|
||||
@@ -520,8 +520,8 @@ class TestMergeIntoTable extends TestHoodieSqlBase {
|
||||
)
|
||||
// Test incremental query
|
||||
val hudiIncDF2 = spark.read.format("org.apache.hudi")
|
||||
.option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY.key, secondCommitTime)
|
||||
.option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME.key, secondCommitTime)
|
||||
.load(targetBasePath)
|
||||
hudiIncDF2.createOrReplaceTempView("inc2")
|
||||
checkAnswer(s"select id, name, price, _ts from inc2 order by id")(
|
||||
|
||||
Reference in New Issue
Block a user