[HUDI-2307] Using delete_partition with ds should not rely on the primary key (#3469)
- Co-authored-by: Sivabalan Narayanan <n.siva.b@gmail.com>
This commit is contained in:
@@ -325,6 +325,11 @@ object DataSourceWriteOptions {
|
|||||||
@Deprecated
|
@Deprecated
|
||||||
val INSERT_DROP_DUPS_OPT_KEY = INSERT_DROP_DUPS.key()
|
val INSERT_DROP_DUPS_OPT_KEY = INSERT_DROP_DUPS.key()
|
||||||
|
|
||||||
|
/**
 * Write option naming the partitions to delete, given as a comma separated list.
 * It has no default value. When the option is present, the delete-partition
 * handling in HoodieSparkSqlWriter uses this list instead of deriving the
 * partition paths from the incoming records.
 */
val PARTITIONS_TO_DELETE: ConfigProperty[String] = ConfigProperty
|
||||||
|
.key("hoodie.datasource.write.partitions.to.delete")
|
||||||
|
.noDefaultValue()
|
||||||
|
.withDocumentation("Comma separated list of partitions to delete")
|
||||||
|
|
||||||
val STREAMING_RETRY_CNT: ConfigProperty[String] = ConfigProperty
|
val STREAMING_RETRY_CNT: ConfigProperty[String] = ConfigProperty
|
||||||
.key("hoodie.datasource.write.streaming.retry.count")
|
.key("hoodie.datasource.write.streaming.retry.count")
|
||||||
.defaultValue("3")
|
.defaultValue("3")
|
||||||
|
|||||||
@@ -29,8 +29,8 @@ import org.apache.hudi.client.{HoodieWriteResult, SparkRDDWriteClient}
|
|||||||
import org.apache.hudi.common.config.{HoodieConfig, HoodieMetadataConfig, TypedProperties}
|
import org.apache.hudi.common.config.{HoodieConfig, HoodieMetadataConfig, TypedProperties}
|
||||||
import org.apache.hudi.common.fs.FSUtils
|
import org.apache.hudi.common.fs.FSUtils
|
||||||
import org.apache.hudi.common.model.{HoodieRecordPayload, HoodieTableType, WriteOperationType}
|
import org.apache.hudi.common.model.{HoodieRecordPayload, HoodieTableType, WriteOperationType}
|
||||||
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver}
|
import org.apache.hudi.common.table.TableSchemaResolver
|
||||||
import org.apache.hudi.common.table.timeline.{HoodieActiveTimeline, HoodieTimeline}
|
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline
|
||||||
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
|
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
|
||||||
import org.apache.hudi.common.util.{CommitUtils, ReflectionUtils}
|
import org.apache.hudi.common.util.{CommitUtils, ReflectionUtils}
|
||||||
import org.apache.hudi.config.HoodieBootstrapConfig.{BOOTSTRAP_BASE_PATH, BOOTSTRAP_INDEX_CLASS}
|
import org.apache.hudi.config.HoodieBootstrapConfig.{BOOTSTRAP_BASE_PATH, BOOTSTRAP_INDEX_CLASS}
|
||||||
@@ -192,7 +192,12 @@ object HoodieSparkSqlWriter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get list of partitions to delete
|
// Get list of partitions to delete
|
||||||
val partitionsToDelete = genericRecords.map(gr => keyGenerator.getKey(gr).getPartitionPath).toJavaRDD().distinct().collect()
|
val partitionsToDelete = if (parameters.containsKey(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key())) {
|
||||||
|
// Split the configured comma separated list and trim each entry, so a value written as
// "p1, p2" still yields partition paths that match exactly instead of " p2".
val partitionColsToDelete = parameters.get(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key()).get.split(",").map(_.trim)
|
||||||
|
java.util.Arrays.asList(partitionColsToDelete:_*)
|
||||||
|
} else {
|
||||||
|
genericRecords.map(gr => keyGenerator.getKey(gr).getPartitionPath).toJavaRDD().distinct().collect()
|
||||||
|
}
|
||||||
// Create a HoodieWriteClient & issue the delete.
|
// Create a HoodieWriteClient & issue the delete.
|
||||||
val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc,
|
val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc,
|
||||||
null, path.get, tblName,
|
null, path.get, tblName,
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ import org.apache.hadoop.fs.Path
|
|||||||
import org.apache.hudi.DataSourceWriteOptions._
|
import org.apache.hudi.DataSourceWriteOptions._
|
||||||
import org.apache.hudi.client.SparkRDDWriteClient
|
import org.apache.hudi.client.SparkRDDWriteClient
|
||||||
import org.apache.hudi.common.config.HoodieConfig
|
import org.apache.hudi.common.config.HoodieConfig
|
||||||
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver}
|
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
||||||
import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord, HoodieRecordPayload, HoodieTableType, WriteOperationType}
|
import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord, HoodieRecordPayload, HoodieTableType, WriteOperationType}
|
||||||
import org.apache.hudi.common.table.HoodieTableConfig
|
import org.apache.hudi.common.table.HoodieTableConfig
|
||||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
|
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
|
||||||
@@ -679,13 +679,15 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
test("test delete partitions") {
|
List(true, false)
|
||||||
initSparkContext("test_delete_partitions")
|
.foreach(usePartitionsToDeleteConfig => {
|
||||||
|
test("test delete partitions for " + usePartitionsToDeleteConfig) {
|
||||||
|
initSparkContext("test_delete_partitions_" + usePartitionsToDeleteConfig)
|
||||||
val path = java.nio.file.Files.createTempDirectory("hoodie_test_path_delete_partitions")
|
val path = java.nio.file.Files.createTempDirectory("hoodie_test_path_delete_partitions")
|
||||||
try {
|
try {
|
||||||
val hoodieFooTableName = "hoodie_foo_tbl_delete_partitions"
|
val hoodieFooTableName = "hoodie_foo_tbl_delete_partitions"
|
||||||
val fooTableModifier = getCommonParams(path, hoodieFooTableName, HoodieTableType.COPY_ON_WRITE.name())
|
val fooTableModifier = getCommonParams(path, hoodieFooTableName, HoodieTableType.COPY_ON_WRITE.name())
|
||||||
val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
|
var fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
|
||||||
val schema = DataSourceTestUtils.getStructTypeExampleSchema
|
val schema = DataSourceTestUtils.getStructTypeExampleSchema
|
||||||
val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema)
|
val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema)
|
||||||
val records = DataSourceTestUtils.generateRandomRows(10)
|
val records = DataSourceTestUtils.generateRandomRows(10)
|
||||||
@@ -715,10 +717,14 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
|||||||
// ensure 2nd batch of updates matches.
|
// ensure 2nd batch of updates matches.
|
||||||
assert(updatesDf.intersect(trimmedDf2).except(updatesDf).count() == 0)
|
assert(updatesDf.intersect(trimmedDf2).except(updatesDf).count() == 0)
|
||||||
|
|
||||||
// delete partitions
|
if ( usePartitionsToDeleteConfig) {
|
||||||
|
// `updated` returns a new map rather than mutating in place, so the result has to be
// assigned back; otherwise the option is silently dropped and this variant never
// exercises the config-driven path. Both partitions are listed so that this mode
// deletes the same partitions as the records handed to the writer below.
fooTableParams = fooTableParams.updated(DataSourceWriteOptions.PARTITIONS_TO_DELETE.key(),
  HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH + "," + HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)
|
||||||
|
}
|
||||||
|
// delete partitions contains the primary key
|
||||||
val recordsToDelete = df1.filter(entry => {
|
val recordsToDelete = df1.filter(entry => {
|
||||||
val partitionPath : String = entry.getString(1)
|
val partitionPath : String = entry.getString(1)
|
||||||
partitionPath.equals(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) || partitionPath.equals(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)
|
partitionPath.equals(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH) ||
|
||||||
|
partitionPath.equals(HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH)
|
||||||
})
|
})
|
||||||
val updatedParams = fooTableParams.updated(DataSourceWriteOptions.OPERATION.key(), WriteOperationType.DELETE_PARTITION.name())
|
val updatedParams = fooTableParams.updated(DataSourceWriteOptions.OPERATION.key(), WriteOperationType.DELETE_PARTITION.name())
|
||||||
HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, updatedParams, recordsToDelete)
|
HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, updatedParams, recordsToDelete)
|
||||||
@@ -734,6 +740,7 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
|
|||||||
FileUtils.deleteDirectory(path.toFile)
|
FileUtils.deleteDirectory(path.toFile)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
})
|
||||||
|
|
||||||
def dropMetaFields(df: Dataset[Row]) : Dataset[Row] = {
|
def dropMetaFields(df: Dataset[Row]) : Dataset[Row] = {
|
||||||
df.drop(HoodieRecord.HOODIE_META_COLUMNS.get(0)).drop(HoodieRecord.HOODIE_META_COLUMNS.get(1))
|
df.drop(HoodieRecord.HOODIE_META_COLUMNS.get(0)).drop(HoodieRecord.HOODIE_META_COLUMNS.get(1))
|
||||||
|
|||||||
@@ -27,7 +27,7 @@
|
|||||||
<check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"/>
|
<check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"/>
|
||||||
<check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="true">
|
<check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="true">
|
||||||
<parameters>
|
<parameters>
|
||||||
<parameter name="maxFileLength"><![CDATA[800]]></parameter>
|
<parameter name="maxFileLength"><![CDATA[900]]></parameter>
|
||||||
</parameters>
|
</parameters>
|
||||||
</check>
|
</check>
|
||||||
<check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"/>
|
<check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"/>
|
||||||
|
|||||||
Reference in New Issue
Block a user