[HUDI-1349] spark sql support overwrite use insert_overwrite_table (#2196)
This commit is contained in:
@@ -191,7 +191,7 @@ public class DataSourceUtils {
|
||||
}
|
||||
|
||||
public static String getCommitActionType(WriteOperationType operation, HoodieTableType tableType) {
|
||||
if (operation == WriteOperationType.INSERT_OVERWRITE) {
|
||||
if (operation == WriteOperationType.INSERT_OVERWRITE || operation == WriteOperationType.INSERT_OVERWRITE_TABLE) {
|
||||
return HoodieTimeline.REPLACE_COMMIT_ACTION;
|
||||
} else {
|
||||
return CommitUtils.getCommitActionType(tableType);
|
||||
@@ -211,6 +211,8 @@ public class DataSourceUtils {
|
||||
return new HoodieWriteResult(client.upsert(hoodieRecords, instantTime));
|
||||
case INSERT_OVERWRITE:
|
||||
return client.insertOverwrite(hoodieRecords, instantTime);
|
||||
case INSERT_OVERWRITE_TABLE:
|
||||
return client.insertOverwriteTable(hoodieRecords, instantTime);
|
||||
default:
|
||||
throw new HoodieException("Not a valid operation type for doWriteOperation: " + operation.toString());
|
||||
}
|
||||
|
||||
@@ -74,7 +74,8 @@ public class HoodieDataSourceHelpers {
|
||||
if (metaClient.getTableType().equals(HoodieTableType.MERGE_ON_READ)) {
|
||||
return metaClient.getActiveTimeline().getTimelineOfActions(
|
||||
CollectionUtils.createSet(HoodieActiveTimeline.COMMIT_ACTION,
|
||||
HoodieActiveTimeline.DELTA_COMMIT_ACTION)).filterCompletedInstants();
|
||||
HoodieActiveTimeline.DELTA_COMMIT_ACTION,
|
||||
HoodieActiveTimeline.REPLACE_COMMIT_ACTION)).filterCompletedInstants();
|
||||
} else {
|
||||
return metaClient.getCommitTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
@@ -157,6 +157,7 @@ object DataSourceWriteOptions {
|
||||
val DELETE_OPERATION_OPT_VAL = WriteOperationType.DELETE.value
|
||||
val BOOTSTRAP_OPERATION_OPT_VAL = WriteOperationType.BOOTSTRAP.value
|
||||
val INSERT_OVERWRITE_OPERATION_OPT_VAL = WriteOperationType.INSERT_OVERWRITE.value
|
||||
val INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL = WriteOperationType.INSERT_OVERWRITE_TABLE.value
|
||||
val DEFAULT_OPERATION_OPT_VAL = UPSERT_OPERATION_OPT_VAL
|
||||
|
||||
/**
|
||||
|
||||
@@ -93,6 +93,13 @@ private[hudi] object HoodieSparkSqlWriter {
|
||||
operation = WriteOperationType.INSERT
|
||||
}
|
||||
|
||||
// If the mode is Overwrite, can set operation to INSERT_OVERWRITE_TABLE.
|
||||
// Then in DataSourceUtils.doWriteOperation will use client.insertOverwriteTable to overwrite
|
||||
// the table. This will replace the old fs.delete(tablepath) mode.
|
||||
if (mode == SaveMode.Overwrite && operation != WriteOperationType.INSERT_OVERWRITE_TABLE) {
|
||||
operation = WriteOperationType.INSERT_OVERWRITE_TABLE
|
||||
}
|
||||
|
||||
val jsc = new JavaSparkContext(sparkContext)
|
||||
val basePath = new Path(path.get)
|
||||
val instantTime = HoodieActiveTimeline.createNewInstantTime()
|
||||
@@ -319,10 +326,6 @@ private[hudi] object HoodieSparkSqlWriter {
|
||||
if (operation != WriteOperationType.DELETE) {
|
||||
if (mode == SaveMode.ErrorIfExists && tableExists) {
|
||||
throw new HoodieException(s"hoodie table at $tablePath already exists.")
|
||||
} else if (mode == SaveMode.Overwrite && tableExists) {
|
||||
log.warn(s"hoodie table at $tablePath already exists. Deleting existing data & overwriting with new data.")
|
||||
fs.delete(tablePath, true)
|
||||
tableExists = false
|
||||
}
|
||||
} else {
|
||||
// Delete Operation only supports Append mode
|
||||
|
||||
@@ -18,7 +18,13 @@
|
||||
package org.apache.hudi.functional
|
||||
|
||||
import java.sql.{Date, Timestamp}
|
||||
import java.util.function.Supplier
|
||||
import java.util.stream.Stream
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant
|
||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
|
||||
import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings
|
||||
import org.apache.hudi.config.HoodieWriteConfig
|
||||
import org.apache.hudi.testutils.HoodieClientTestBase
|
||||
@@ -156,6 +162,79 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
assertEquals(100, timeTravelDF.count()) // 100 initial inserts must be pulled
|
||||
}
|
||||
|
||||
@Test def testOverWriteModeUseReplaceAction(): Unit = {
|
||||
val records1 = recordsToStrings(dataGen.generateInserts("001", 5)).toList
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
val records2 = recordsToStrings(dataGen.generateInserts("002", 5)).toList
|
||||
val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
|
||||
inputDF2.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
|
||||
val metaClient = new HoodieTableMetaClient(spark.sparkContext.hadoopConfiguration, basePath, true)
|
||||
val commits = metaClient.getActiveTimeline.filterCompletedInstants().getInstants.toArray
|
||||
.map(instant => (instant.asInstanceOf[HoodieInstant]).getAction)
|
||||
assertEquals(2, commits.size)
|
||||
assertEquals("commit", commits(0))
|
||||
assertEquals("replacecommit", commits(1))
|
||||
}
|
||||
|
||||
@Test def testOverWriteModeUseReplaceActionOnDisJointPartitions(): Unit = {
|
||||
// step1: Write 5 records to hoodie table for partition1 DEFAULT_FIRST_PARTITION_PATH
|
||||
val records1 = recordsToStrings(dataGen.generateInsertsForPartition("001", 5, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).toList
|
||||
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
|
||||
inputDF1.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL)
|
||||
.mode(SaveMode.Append)
|
||||
.save(basePath)
|
||||
|
||||
// step2: Write 7 more rectestOverWriteModeUseReplaceActionords using SaveMode.Overwrite for partition2 DEFAULT_SECOND_PARTITION_PATH
|
||||
val records2 = recordsToStrings(dataGen.generateInsertsForPartition("002", 7, HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)).toList
|
||||
val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
|
||||
inputDF2.write.format("org.apache.hudi")
|
||||
.options(commonOpts)
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
|
||||
val allRecords = spark.read.format("org.apache.hudi").load(basePath + "/*/*/*")
|
||||
allRecords.registerTempTable("tmpTable")
|
||||
|
||||
spark.sql(String.format("select count(*) from tmpTable")).show()
|
||||
|
||||
// step3: Query the rows count from hoodie table for partition1 DEFAULT_FIRST_PARTITION_PATH
|
||||
val recordCountForParititon1 = spark.sql(String.format("select count(*) from tmpTable where partition = '%s'", HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).collect()
|
||||
assertEquals("0", recordCountForParititon1(0).get(0).toString)
|
||||
|
||||
// step4: Query the rows count from hoodie table for partition1 DEFAULT_SECOND_PARTITION_PATH
|
||||
val recordCountForParititon2 = spark.sql(String.format("select count(*) from tmpTable where partition = '%s'", HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)).collect()
|
||||
assertEquals("7", recordCountForParititon2(0).get(0).toString)
|
||||
|
||||
// step5: Query the rows count from hoodie table
|
||||
val recordCount = spark.sql(String.format("select count(*) from tmpTable")).collect()
|
||||
assertEquals("7", recordCountForParititon2(0).get(0).toString)
|
||||
|
||||
// step6: Query the rows count from hoodie table for partition1 DEFAULT_SECOND_PARTITION_PATH using spark.collect and then filter mode
|
||||
val recordsForPartitionColumn = spark.sql(String.format("select partition from tmpTable")).collect()
|
||||
val filterSecondPartitionCount = recordsForPartitionColumn.filter(row => row.get(0).equals(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)).size
|
||||
assertEquals(7,filterSecondPartitionCount)
|
||||
|
||||
val metaClient = new HoodieTableMetaClient(spark.sparkContext.hadoopConfiguration, basePath, true)
|
||||
val commits = metaClient.getActiveTimeline.filterCompletedInstants().getInstants.toArray
|
||||
.map(instant => instant.asInstanceOf[HoodieInstant].getAction)
|
||||
assertEquals(2, commits.size)
|
||||
assertEquals("commit", commits(0))
|
||||
assertEquals("replacecommit", commits(1))
|
||||
}
|
||||
|
||||
@Test def testDropInsertDup(): Unit = {
|
||||
val insert1Cnt = 10
|
||||
val insert2DupKeyCnt = 9
|
||||
|
||||
Reference in New Issue
Block a user