[HUDI-3838] Implemented drop partition column feature for delta streamer code path (#5294)
* [HUDI-3838] Implemented drop partition column feature for the delta streamer code path
* Ensure the drop-partition-columns table config is updated in hoodie.props

Co-authored-by: Sagar Sumit <sagarsumit09@gmail.com>
This commit is contained in:
committed by
GitHub
parent
101b82a679
commit
d16740976e
@@ -105,6 +105,7 @@ import java.util.stream.Collectors;
|
||||
import scala.collection.JavaConversions;
|
||||
|
||||
import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER;
|
||||
import static org.apache.hudi.common.table.HoodieTableConfig.DROP_PARTITION_COLUMNS;
|
||||
import static org.apache.hudi.config.HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE;
|
||||
import static org.apache.hudi.config.HoodieClusteringConfig.INLINE_CLUSTERING;
|
||||
import static org.apache.hudi.config.HoodieCompactionConfig.INLINE_COMPACT;
|
||||
@@ -280,6 +281,7 @@ public class DeltaSync implements Serializable {
|
||||
.setPreCombineField(cfg.sourceOrderingField)
|
||||
.setPartitionMetafileUseBaseFormat(props.getBoolean(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key(),
|
||||
HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.defaultValue()))
|
||||
.setDropPartitionColumnsWhenWrite(isDropPartitionColumns())
|
||||
.initTable(new Configuration(jssc.hadoopConfiguration()),
|
||||
cfg.targetBasePath);
|
||||
}
|
||||
@@ -375,6 +377,7 @@ public class DeltaSync implements Serializable {
|
||||
SimpleKeyGenerator.class.getName()))
|
||||
.setPartitionMetafileUseBaseFormat(props.getBoolean(HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.key(),
|
||||
HoodieTableConfig.PARTITION_METAFILE_USE_BASE_FORMAT.defaultValue()))
|
||||
.setDropPartitionColumnsWhenWrite(isDropPartitionColumns())
|
||||
.initTable(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath);
|
||||
}
|
||||
|
||||
@@ -478,13 +481,14 @@ public class DeltaSync implements Serializable {
|
||||
|
||||
boolean shouldCombine = cfg.filterDupes || cfg.operation.equals(WriteOperationType.UPSERT);
|
||||
JavaRDD<GenericRecord> avroRDD = avroRDDOptional.get();
|
||||
JavaRDD<HoodieRecord> records = avroRDD.map(gr -> {
|
||||
JavaRDD<HoodieRecord> records = avroRDD.map(record -> {
|
||||
GenericRecord gr = isDropPartitionColumns() ? HoodieAvroUtils.removeFields(record, getPartitionColumns(keyGenerator, props)) : record;
|
||||
HoodieRecordPayload payload = shouldCombine ? DataSourceUtils.createPayload(cfg.payloadClassName, gr,
|
||||
(Comparable) HoodieAvroUtils.getNestedFieldVal(gr, cfg.sourceOrderingField, false, props.getBoolean(
|
||||
KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(),
|
||||
Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue()))))
|
||||
: DataSourceUtils.createPayload(cfg.payloadClassName, gr);
|
||||
return new HoodieAvroRecord<>(keyGenerator.getKey(gr), payload);
|
||||
return new HoodieAvroRecord<>(keyGenerator.getKey(record), payload);
|
||||
});
|
||||
|
||||
return Pair.of(schemaProvider, Pair.of(checkpointStr, records));
|
||||
@@ -727,6 +731,9 @@ public class DeltaSync implements Serializable {
|
||||
|
||||
private void reInitWriteClient(Schema sourceSchema, Schema targetSchema) throws IOException {
|
||||
LOG.info("Setting up new Hoodie Write Client");
|
||||
if (isDropPartitionColumns()) {
|
||||
targetSchema = HoodieAvroUtils.removeFields(targetSchema, getPartitionColumns(keyGenerator, props));
|
||||
}
|
||||
registerAvroSchemas(sourceSchema, targetSchema);
|
||||
HoodieWriteConfig hoodieCfg = getHoodieClientConfig(targetSchema);
|
||||
if (hoodieCfg.isEmbeddedTimelineServerEnabled()) {
|
||||
@@ -898,4 +905,24 @@ public class DeltaSync implements Serializable {
|
||||
return Option.empty();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set based on hoodie.datasource.write.drop.partition.columns config.
|
||||
* When set to true, will not write the partition columns into the table.
|
||||
*/
|
||||
private Boolean isDropPartitionColumns() {
|
||||
return props.getBoolean(DROP_PARTITION_COLUMNS.key(), DROP_PARTITION_COLUMNS.defaultValue());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of partition columns as a list of strings.
|
||||
*
|
||||
* @param keyGenerator KeyGenerator
|
||||
* @param props TypedProperties
|
||||
* @return List of partition columns.
|
||||
*/
|
||||
private List<String> getPartitionColumns(KeyGenerator keyGenerator, TypedProperties props) {
|
||||
String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props);
|
||||
return Arrays.asList(partitionColumns.split(","));
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user