[HUDI-1483] Support async clustering for deltastreamer and Spark streaming (#3142)
- Integrate async clustering service with HoodieDeltaStreamer and HoodieStreamingSink
- Add methods in HoodieAsyncService to reuse code
This commit is contained in:
@@ -34,6 +34,7 @@ import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.TablePathUtils;
|
||||
import org.apache.hudi.config.HoodieClusteringConfig;
|
||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||
import org.apache.hudi.config.HoodieIndexConfig;
|
||||
import org.apache.hudi.config.HoodiePayloadConfig;
|
||||
@@ -171,6 +172,8 @@ public class DataSourceUtils {
|
||||
boolean asyncCompact = Boolean.parseBoolean(parameters.get(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE_OPT_KEY().key()));
|
||||
boolean inlineCompact = !asyncCompact && parameters.get(DataSourceWriteOptions.TABLE_TYPE_OPT_KEY().key())
|
||||
.equals(DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL());
|
||||
boolean asyncClusteringEnabled = Boolean.parseBoolean(parameters.get(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE_OPT_KEY().key()));
|
||||
boolean inlineClusteringEnabled = Boolean.parseBoolean(parameters.get(DataSourceWriteOptions.INLINE_CLUSTERING_ENABLE_OPT_KEY().key()));
|
||||
// insert/bulk-insert combining to be true, if filtering for duplicates
|
||||
boolean combineInserts = Boolean.parseBoolean(parameters.get(DataSourceWriteOptions.INSERT_DROP_DUPS_OPT_KEY().key()));
|
||||
HoodieWriteConfig.Builder builder = HoodieWriteConfig.newBuilder()
|
||||
@@ -184,6 +187,9 @@ public class DataSourceUtils {
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||
.withPayloadClass(parameters.get(DataSourceWriteOptions.PAYLOAD_CLASS_OPT_KEY().key()))
|
||||
.withInlineCompaction(inlineCompact).build())
|
||||
.withClusteringConfig(HoodieClusteringConfig.newBuilder()
|
||||
.withInlineClustering(inlineClusteringEnabled)
|
||||
.withAsyncClustering(asyncClusteringEnabled).build())
|
||||
.withPayloadConfig(HoodiePayloadConfig.newBuilder().withPayloadOrderingField(parameters.get(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY().key()))
|
||||
.build())
|
||||
// override above with Hoodie configs specified as options.
|
||||
|
||||
@@ -474,6 +474,18 @@ object DataSourceWriteOptions {
|
||||
.defaultValue("true")
|
||||
.withDocumentation("")
|
||||
|
||||
// Datasource-level switch for inline clustering.
// The value is a String ("true"/"false", default "false"). DataSourceUtils reads it
// from the write parameters with Boolean.parseBoolean and passes the result to
// HoodieClusteringConfig's withInlineClustering(...) when building the write config.
// NOTE(review): the visible code does not check this against the async clustering
// flag — confirm whether enabling both at once is intended to be allowed.
val INLINE_CLUSTERING_ENABLE_OPT_KEY: ConfigProperty[String] = ConfigProperty
|
||||
.key("hoodie.datasource.clustering.inline.enable")
|
||||
.defaultValue("false")
|
||||
.sinceVersion("0.9.0")
|
||||
.withDocumentation("Enable inline clustering. Disabled by default.")
|
||||
|
||||
// Datasource-level switch for asynchronous clustering.
// The value is a String ("true"/"false", default "false"). DataSourceUtils reads it
// from the write parameters with Boolean.parseBoolean and passes the result to
// HoodieClusteringConfig's withAsyncClustering(...) when building the write config.
// NOTE(review): the visible code does not check this against the inline clustering
// flag — confirm whether enabling both at once is intended to be allowed.
val ASYNC_CLUSTERING_ENABLE_OPT_KEY: ConfigProperty[String] = ConfigProperty
|
||||
.key("hoodie.datasource.clustering.async.enable")
|
||||
.defaultValue("false")
|
||||
.sinceVersion("0.9.0")
|
||||
.withDocumentation("Enable asynchronous clustering. Disabled by default.")
|
||||
|
||||
val KAFKA_AVRO_VALUE_DESERIALIZER_CLASS: ConfigProperty[String] = ConfigProperty
|
||||
.key("hoodie.deltastreamer.source.kafka.value.deserializer.class")
|
||||
.defaultValue("io.confluent.kafka.serializers.KafkaAvroDeserializer")
|
||||
|
||||
Reference in New Issue
Block a user