HUDI-1674 (#3488)
This commit is contained in:
@@ -27,6 +27,7 @@ import org.apache.hudi.common.model.HoodieKey;
|
|||||||
import org.apache.hudi.common.model.HoodieRecord;
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
import org.apache.hudi.common.model.HoodieTableType;
|
import org.apache.hudi.common.model.HoodieTableType;
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||||
import org.apache.hudi.config.HoodieIndexConfig;
|
import org.apache.hudi.config.HoodieIndexConfig;
|
||||||
@@ -47,7 +48,6 @@ import java.util.ArrayList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Simple examples of #{@link SparkRDDWriteClient}.
|
* Simple examples of #{@link SparkRDDWriteClient}.
|
||||||
*
|
*
|
||||||
@@ -127,6 +127,16 @@ public class HoodieWriteClientExample {
|
|||||||
JavaRDD<HoodieKey> deleteRecords = jsc.parallelize(toBeDeleted, 1);
|
JavaRDD<HoodieKey> deleteRecords = jsc.parallelize(toBeDeleted, 1);
|
||||||
client.delete(deleteRecords, newCommitTime);
|
client.delete(deleteRecords, newCommitTime);
|
||||||
|
|
||||||
|
// Delete by partition
|
||||||
|
newCommitTime = client.startCommit();
|
||||||
|
client.startCommitWithTime(newCommitTime, HoodieTimeline.REPLACE_COMMIT_ACTION);
|
||||||
|
LOG.info("Starting commit " + newCommitTime);
|
||||||
|
// The partition where the data needs to be deleted
|
||||||
|
List<String> partitionList = toBeDeleted.stream().map(s -> s.getPartitionPath()).distinct().collect(Collectors.toList());
|
||||||
|
List<String> deleteList = recordsSoFar.stream().filter(f -> !partitionList.contains(f.getPartitionPath()))
|
||||||
|
.map(m -> m.getKey().getPartitionPath()).distinct().collect(Collectors.toList());
|
||||||
|
client.deletePartitions(deleteList, newCommitTime);
|
||||||
|
|
||||||
// compaction
|
// compaction
|
||||||
if (HoodieTableType.valueOf(tableType) == HoodieTableType.MERGE_ON_READ) {
|
if (HoodieTableType.valueOf(tableType) == HoodieTableType.MERGE_ON_READ) {
|
||||||
Option<String> instant = client.scheduleCompaction(Option.empty());
|
Option<String> instant = client.scheduleCompaction(Option.empty());
|
||||||
|
|||||||
@@ -19,7 +19,7 @@
|
|||||||
package org.apache.hudi.examples.spark
|
package org.apache.hudi.examples.spark
|
||||||
|
|
||||||
import org.apache.hudi.DataSourceReadOptions.{BEGIN_INSTANTTIME, END_INSTANTTIME, QUERY_TYPE_INCREMENTAL_OPT_VAL, QUERY_TYPE}
|
import org.apache.hudi.DataSourceReadOptions.{BEGIN_INSTANTTIME, END_INSTANTTIME, QUERY_TYPE_INCREMENTAL_OPT_VAL, QUERY_TYPE}
|
||||||
import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD}
|
import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD, PARTITIONS_TO_DELETE, OPERATION, DELETE_PARTITION_OPERATION_OPT_VAL, DELETE_OPERATION_OPT_VAL}
|
||||||
import org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs
|
import org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs
|
||||||
import org.apache.hudi.common.model.HoodieAvroPayload
|
import org.apache.hudi.common.model.HoodieAvroPayload
|
||||||
import org.apache.hudi.config.HoodieWriteConfig.TABLE_NAME
|
import org.apache.hudi.config.HoodieWriteConfig.TABLE_NAME
|
||||||
@@ -61,10 +61,12 @@ object HoodieDataSourceExample {
|
|||||||
incrementalQuery(spark, tablePath, tableName)
|
incrementalQuery(spark, tablePath, tableName)
|
||||||
pointInTimeQuery(spark, tablePath, tableName)
|
pointInTimeQuery(spark, tablePath, tableName)
|
||||||
|
|
||||||
|
delete(spark, tablePath, tableName)
|
||||||
|
deleteByPartition(spark, tablePath, tableName)
|
||||||
|
|
||||||
spark.stop()
|
spark.stop()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate some new trips, load them into a DataFrame and write the DataFrame into the Hudi dataset as below.
|
* Generate some new trips, load them into a DataFrame and write the DataFrame into the Hudi dataset as below.
|
||||||
*/
|
*/
|
||||||
@@ -72,7 +74,6 @@ object HoodieDataSourceExample {
|
|||||||
|
|
||||||
val commitTime: String = System.currentTimeMillis().toString
|
val commitTime: String = System.currentTimeMillis().toString
|
||||||
val inserts = dataGen.convertToStringList(dataGen.generateInserts(commitTime, 20))
|
val inserts = dataGen.convertToStringList(dataGen.generateInserts(commitTime, 20))
|
||||||
spark.sparkContext.parallelize(inserts, 2)
|
|
||||||
val df = spark.read.json(spark.sparkContext.parallelize(inserts, 1))
|
val df = spark.read.json(spark.sparkContext.parallelize(inserts, 1))
|
||||||
df.write.format("org.apache.hudi").
|
df.write.format("org.apache.hudi").
|
||||||
options(getQuickstartWriteConfigs).
|
options(getQuickstartWriteConfigs).
|
||||||
@@ -129,6 +130,43 @@ object HoodieDataSourceExample {
|
|||||||
save(tablePath)
|
save(tablePath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deleta data based in data information.
|
||||||
|
*/
|
||||||
|
def delete(spark: SparkSession, tablePath: String, tableName: String): Unit = {
|
||||||
|
|
||||||
|
val roViewDF = spark.read.format("org.apache.hudi").load(tablePath + "/*/*/*/*")
|
||||||
|
roViewDF.createOrReplaceTempView("hudi_ro_table")
|
||||||
|
val df = spark.sql("select uuid, partitionpath, ts from hudi_ro_table limit 2")
|
||||||
|
|
||||||
|
df.write.format("org.apache.hudi").
|
||||||
|
options(getQuickstartWriteConfigs).
|
||||||
|
option(PRECOMBINE_FIELD.key, "ts").
|
||||||
|
option(RECORDKEY_FIELD.key, "uuid").
|
||||||
|
option(PARTITIONPATH_FIELD.key, "partitionpath").
|
||||||
|
option(TABLE_NAME.key, tableName).
|
||||||
|
option(OPERATION.key, DELETE_OPERATION_OPT_VAL).
|
||||||
|
mode(Append).
|
||||||
|
save(tablePath)
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delete the data of a single or multiple partitions.
|
||||||
|
*/
|
||||||
|
def deleteByPartition(spark: SparkSession, tablePath: String, tableName: String): Unit = {
|
||||||
|
val df = spark.emptyDataFrame
|
||||||
|
df.write.format("org.apache.hudi").
|
||||||
|
options(getQuickstartWriteConfigs).
|
||||||
|
option(PRECOMBINE_FIELD.key, "ts").
|
||||||
|
option(RECORDKEY_FIELD.key, "uuid").
|
||||||
|
option(PARTITIONPATH_FIELD.key, "partitionpath").
|
||||||
|
option(TABLE_NAME.key, tableName).
|
||||||
|
option(OPERATION.key, DELETE_PARTITION_OPERATION_OPT_VAL).
|
||||||
|
option(PARTITIONS_TO_DELETE.key(), HoodieExampleDataGenerator.DEFAULT_PARTITION_PATHS.mkString(",")).
|
||||||
|
mode(Append).
|
||||||
|
save(tablePath)
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Hudi also provides capability to obtain a stream of records that changed since given commit timestamp.
|
* Hudi also provides capability to obtain a stream of records that changed since given commit timestamp.
|
||||||
* This can be achieved using Hudi’s incremental view and providing a begin time from which changes need to be streamed.
|
* This can be achieved using Hudi’s incremental view and providing a begin time from which changes need to be streamed.
|
||||||
|
|||||||
@@ -171,6 +171,7 @@ object DataSourceWriteOptions {
|
|||||||
val INSERT_OPERATION_OPT_VAL = WriteOperationType.INSERT.value
|
val INSERT_OPERATION_OPT_VAL = WriteOperationType.INSERT.value
|
||||||
val UPSERT_OPERATION_OPT_VAL = WriteOperationType.UPSERT.value
|
val UPSERT_OPERATION_OPT_VAL = WriteOperationType.UPSERT.value
|
||||||
val DELETE_OPERATION_OPT_VAL = WriteOperationType.DELETE.value
|
val DELETE_OPERATION_OPT_VAL = WriteOperationType.DELETE.value
|
||||||
|
val DELETE_PARTITION_OPERATION_OPT_VAL = WriteOperationType.DELETE_PARTITION.value
|
||||||
val BOOTSTRAP_OPERATION_OPT_VAL = WriteOperationType.BOOTSTRAP.value
|
val BOOTSTRAP_OPERATION_OPT_VAL = WriteOperationType.BOOTSTRAP.value
|
||||||
val INSERT_OVERWRITE_OPERATION_OPT_VAL = WriteOperationType.INSERT_OVERWRITE.value
|
val INSERT_OVERWRITE_OPERATION_OPT_VAL = WriteOperationType.INSERT_OVERWRITE.value
|
||||||
val INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL = WriteOperationType.INSERT_OVERWRITE_TABLE.value
|
val INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL = WriteOperationType.INSERT_OVERWRITE_TABLE.value
|
||||||
|
|||||||
@@ -241,6 +241,8 @@ public class QuickstartUtils {
|
|||||||
Map<String, String> demoConfigs = new HashMap<>();
|
Map<String, String> demoConfigs = new HashMap<>();
|
||||||
demoConfigs.put("hoodie.insert.shuffle.parallelism", "2");
|
demoConfigs.put("hoodie.insert.shuffle.parallelism", "2");
|
||||||
demoConfigs.put("hoodie.upsert.shuffle.parallelism", "2");
|
demoConfigs.put("hoodie.upsert.shuffle.parallelism", "2");
|
||||||
|
demoConfigs.put("hoodie.bulkinsert.shuffle.parallelism", "2");
|
||||||
|
demoConfigs.put("hoodie.delete.shuffle.parallelism", "2");
|
||||||
return demoConfigs;
|
return demoConfigs;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user