[HUDI-796] Add deduping logic for upserts case (#1558)
This commit is contained in:
@@ -18,6 +18,7 @@
|
||||
|
||||
package org.apache.hudi.cli.commands;
|
||||
|
||||
import org.apache.hudi.cli.DeDupeType;
|
||||
import org.apache.hudi.cli.HoodieCLI;
|
||||
import org.apache.hudi.cli.HoodiePrintHelper;
|
||||
import org.apache.hudi.cli.HoodieTableHeaderFields;
|
||||
@@ -77,8 +78,13 @@ public class RepairsCommand implements CommandMarker {
|
||||
help = "Spark executor memory") final String sparkMemory,
|
||||
@CliOption(key = {"dryrun"},
|
||||
help = "Should we actually remove duplicates or just run and store result to repairedOutputPath",
|
||||
unspecifiedDefaultValue = "true") final boolean dryRun)
|
||||
unspecifiedDefaultValue = "true") final boolean dryRun,
|
||||
@CliOption(key = {"dedupeType"}, help = "Valid values are - insert_type, update_type and upsert_type",
|
||||
unspecifiedDefaultValue = "insert_type") final String dedupeType)
|
||||
throws Exception {
|
||||
if (!DeDupeType.values().contains(DeDupeType.withName(dedupeType))) {
|
||||
throw new IllegalArgumentException("Please provide valid dedupe type!");
|
||||
}
|
||||
if (StringUtils.isNullOrEmpty(sparkPropertiesPath)) {
|
||||
sparkPropertiesPath =
|
||||
Utils.getDefaultPropertiesFile(JavaConverters.mapAsScalaMapConverter(System.getenv()).asScala());
|
||||
@@ -87,7 +93,7 @@ public class RepairsCommand implements CommandMarker {
|
||||
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
|
||||
sparkLauncher.addAppArgs(SparkMain.SparkCommand.DEDUPLICATE.toString(), master, sparkMemory,
|
||||
duplicatedPartitionPath, repairedOutputPath, HoodieCLI.getTableMetaClient().getBasePath(),
|
||||
String.valueOf(dryRun));
|
||||
String.valueOf(dryRun), dedupeType);
|
||||
Process process = sparkLauncher.launch();
|
||||
InputStreamConsumer.captureOutput(process);
|
||||
int exitCode = process.waitFor();
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
package org.apache.hudi.cli.commands;
|
||||
|
||||
import org.apache.hudi.cli.DeDupeType;
|
||||
import org.apache.hudi.DataSourceWriteOptions;
|
||||
import org.apache.hudi.cli.DedupeSparkJob;
|
||||
import org.apache.hudi.cli.utils.SparkUtil;
|
||||
@@ -87,8 +88,8 @@ public class SparkMain {
|
||||
returnCode = rollback(jsc, args[3], args[4]);
|
||||
break;
|
||||
case DEDUPLICATE:
|
||||
assert (args.length == 7);
|
||||
returnCode = deduplicatePartitionPath(jsc, args[3], args[4], args[5], args[6]);
|
||||
assert (args.length == 8);
|
||||
returnCode = deduplicatePartitionPath(jsc, args[3], args[4], args[5], Boolean.parseBoolean(args[6]), args[7]);
|
||||
break;
|
||||
case ROLLBACK_TO_SAVEPOINT:
|
||||
assert (args.length == 5);
|
||||
@@ -304,10 +305,10 @@ public class SparkMain {
|
||||
}
|
||||
|
||||
private static int deduplicatePartitionPath(JavaSparkContext jsc, String duplicatedPartitionPath,
|
||||
String repairedOutputPath, String basePath, String dryRun) {
|
||||
String repairedOutputPath, String basePath, boolean dryRun, String dedupeType) {
|
||||
DedupeSparkJob job = new DedupeSparkJob(basePath, duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc),
|
||||
FSUtils.getFs(basePath, jsc.hadoopConfiguration()));
|
||||
job.fixDuplicates(Boolean.parseBoolean(dryRun));
|
||||
FSUtils.getFs(basePath, jsc.hadoopConfiguration()), DeDupeType.withName(dedupeType));
|
||||
job.fixDuplicates(dryRun);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
28
hudi-cli/src/main/scala/org/apache/hudi/cli/DeDupeType.scala
Normal file
28
hudi-cli/src/main/scala/org/apache/hudi/cli/DeDupeType.scala
Normal file
@@ -0,0 +1,28 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.cli
|
||||
|
||||
object DeDupeType extends Enumeration {
|
||||
|
||||
type dedupeType = Value
|
||||
|
||||
val INSERT_TYPE = Value("insert_type")
|
||||
val UPDATE_TYPE = Value("update_type")
|
||||
val UPSERT_TYPE = Value("upsert_type")
|
||||
}
|
||||
@@ -26,11 +26,10 @@ import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
||||
import org.apache.hudi.exception.HoodieException
|
||||
import org.apache.log4j.Logger
|
||||
import org.apache.spark.sql.{DataFrame, SQLContext}
|
||||
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
|
||||
|
||||
import scala.collection.JavaConversions._
|
||||
import scala.collection.mutable._
|
||||
|
||||
import scala.collection.mutable.{Buffer, HashMap, HashSet, ListBuffer}
|
||||
|
||||
/**
|
||||
* Spark job to de-duplicate data present in a partition path
|
||||
@@ -39,8 +38,8 @@ class DedupeSparkJob(basePath: String,
|
||||
duplicatedPartitionPath: String,
|
||||
repairOutputPath: String,
|
||||
sqlContext: SQLContext,
|
||||
fs: FileSystem) {
|
||||
|
||||
fs: FileSystem,
|
||||
dedupeType: DeDupeType.Value) {
|
||||
|
||||
val sparkHelper = new SparkHelper(sqlContext, fs)
|
||||
val LOG = Logger.getLogger(this.getClass)
|
||||
@@ -98,33 +97,91 @@ class DedupeSparkJob(basePath: String,
|
||||
ON h.`_hoodie_record_key` = d.dupe_key
|
||||
"""
|
||||
val dupeMap = sqlContext.sql(dupeDataSql).collectAsList().groupBy(r => r.getString(0))
|
||||
val fileToDeleteKeyMap = new HashMap[String, HashSet[String]]()
|
||||
|
||||
// Mark all files except the one with latest commits for deletion
|
||||
dupeMap.foreach(rt => {
|
||||
val (key, rows) = rt
|
||||
var maxCommit = -1L
|
||||
|
||||
rows.foreach(r => {
|
||||
val c = r(3).asInstanceOf[String].toLong
|
||||
if (c > maxCommit)
|
||||
maxCommit = c
|
||||
})
|
||||
|
||||
rows.foreach(r => {
|
||||
val c = r(3).asInstanceOf[String].toLong
|
||||
if (c != maxCommit) {
|
||||
val f = r(2).asInstanceOf[String].split("_")(0)
|
||||
if (!fileToDeleteKeyMap.contains(f)) {
|
||||
fileToDeleteKeyMap(f) = HashSet[String]()
|
||||
}
|
||||
fileToDeleteKeyMap(f).add(key)
|
||||
}
|
||||
})
|
||||
})
|
||||
fileToDeleteKeyMap
|
||||
getDedupePlan(dupeMap)
|
||||
}
|
||||
|
||||
private def getDedupePlan(dupeMap: Map[String, Buffer[Row]]): HashMap[String, HashSet[String]] = {
|
||||
val fileToDeleteKeyMap = new HashMap[String, HashSet[String]]()
|
||||
dupeMap.foreach(rt => {
|
||||
val (key, rows) = rt
|
||||
|
||||
dedupeType match {
|
||||
case DeDupeType.UPDATE_TYPE =>
|
||||
/*
|
||||
This corresponds to the case where all duplicates have been updated at least once.
|
||||
Once updated, duplicates are bound to have same commit time unless forcefully modified.
|
||||
*/
|
||||
rows.init.foreach(r => {
|
||||
val f = r(2).asInstanceOf[String].split("_")(0)
|
||||
if (!fileToDeleteKeyMap.contains(f)) {
|
||||
fileToDeleteKeyMap(f) = HashSet[String]()
|
||||
}
|
||||
fileToDeleteKeyMap(f).add(key)
|
||||
})
|
||||
|
||||
case DeDupeType.INSERT_TYPE =>
|
||||
/*
|
||||
This corresponds to the case where duplicates got created due to INSERT and have never been updated.
|
||||
*/
|
||||
var maxCommit = -1L
|
||||
|
||||
rows.foreach(r => {
|
||||
val c = r(3).asInstanceOf[String].toLong
|
||||
if (c > maxCommit)
|
||||
maxCommit = c
|
||||
})
|
||||
|
||||
rows.foreach(r => {
|
||||
val c = r(3).asInstanceOf[String].toLong
|
||||
if (c != maxCommit) {
|
||||
val f = r(2).asInstanceOf[String].split("_")(0)
|
||||
if (!fileToDeleteKeyMap.contains(f)) {
|
||||
fileToDeleteKeyMap(f) = HashSet[String]()
|
||||
}
|
||||
fileToDeleteKeyMap(f).add(key)
|
||||
}
|
||||
})
|
||||
|
||||
case DeDupeType.UPSERT_TYPE =>
|
||||
/*
|
||||
This corresponds to the case where duplicates got created as a result of inserts as well as updates,
|
||||
i.e few duplicate records have been updated, while others were never updated.
|
||||
*/
|
||||
var maxCommit = -1L
|
||||
|
||||
rows.foreach(r => {
|
||||
val c = r(3).asInstanceOf[String].toLong
|
||||
if (c > maxCommit)
|
||||
maxCommit = c
|
||||
})
|
||||
val rowsWithMaxCommit = new ListBuffer[Row]()
|
||||
rows.foreach(r => {
|
||||
val c = r(3).asInstanceOf[String].toLong
|
||||
if (c != maxCommit) {
|
||||
val f = r(2).asInstanceOf[String].split("_")(0)
|
||||
if (!fileToDeleteKeyMap.contains(f)) {
|
||||
fileToDeleteKeyMap(f) = HashSet[String]()
|
||||
}
|
||||
fileToDeleteKeyMap(f).add(key)
|
||||
} else {
|
||||
rowsWithMaxCommit += r
|
||||
}
|
||||
})
|
||||
|
||||
rowsWithMaxCommit.toList.init.foreach(r => {
|
||||
val f = r(2).asInstanceOf[String].split("_")(0)
|
||||
if (!fileToDeleteKeyMap.contains(f)) {
|
||||
fileToDeleteKeyMap(f) = HashSet[String]()
|
||||
}
|
||||
fileToDeleteKeyMap(f).add(key)
|
||||
})
|
||||
|
||||
case _ => throw new IllegalArgumentException("Please provide valid type for deduping!")
|
||||
}
|
||||
})
|
||||
LOG.debug(s"fileToDeleteKeyMap size: ${fileToDeleteKeyMap.size}, map: $fileToDeleteKeyMap")
|
||||
fileToDeleteKeyMap
|
||||
}
|
||||
|
||||
def fixDuplicates(dryRun: Boolean = true) = {
|
||||
val metadata = new HoodieTableMetaClient(fs.getConf, basePath)
|
||||
@@ -152,7 +209,7 @@ class DedupeSparkJob(basePath: String,
|
||||
val newFilePath = new Path(s"$repairOutputPath/${fileNameToPathMap(fileName).getName}")
|
||||
LOG.info(" Skipping and writing new file for : " + fileName)
|
||||
SparkHelpers.skipKeysAndWriteNewFile(instantTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName))
|
||||
fs.delete(badFilePath, false)
|
||||
fs.delete(badFilePath, true)
|
||||
}
|
||||
|
||||
// 3. Check that there are no duplicates anymore.
|
||||
@@ -175,7 +232,6 @@ class DedupeSparkJob(basePath: String,
|
||||
throw new HoodieException("Some records in source are not found in fixed files. Inspect output!!")
|
||||
}
|
||||
|
||||
|
||||
println("No duplicates found & counts are in check!!!! ")
|
||||
// 4. Prepare to copy the fixed files back.
|
||||
fileNameToPathMap.foreach { case (_, filePath) =>
|
||||
|
||||
Reference in New Issue
Block a user