[HUDI-1468] Support custom clustering strategies and preserve commit metadata as part of clustering (#3419)
Co-authored-by: Satish Kotha <satishkotha@uber.com>
This commit is contained in:
@@ -125,7 +125,13 @@ public class HoodieClusteringConfig extends HoodieConfig {
|
||||
.sinceVersion("0.7.0")
|
||||
.withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table.");
|
||||
|
||||
private HoodieClusteringConfig() {
|
||||
/**
 * Boolean config property keyed by {@code hoodie.clustering.preserve.commit.metadata}.
 * Defaults to {@code false} and is marked as available since version 0.9.0. Per its own
 * documentation string, enabling it preserves the existing hoodie_commit_time when data is rewritten.
 */
public static final ConfigProperty<Boolean> CLUSTERING_PRESERVE_HOODIE_COMMIT_METADATA = ConfigProperty
|
||||
.key("hoodie.clustering.preserve.commit.metadata")
|
||||
.defaultValue(false)
|
||||
.sinceVersion("0.9.0")
|
||||
.withDocumentation("When rewriting data, preserves existing hoodie_commit_time");
|
||||
|
||||
/**
 * Creates a clustering config; the only work done here is delegating to the superclass constructor.
 */
public HoodieClusteringConfig() {
|
||||
super();
|
||||
}
|
||||
|
||||
@@ -214,6 +220,11 @@ public class HoodieClusteringConfig extends HoodieConfig {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
 * Sets {@code CLUSTERING_PRESERVE_HOODIE_COMMIT_METADATA} on the config being built.
 *
 * @param preserveHoodieCommitMetadata value to store; it is written in its string form
 * @return this builder, so calls can be chained
 */
public Builder withPreserveHoodieCommitMetadata(Boolean preserveHoodieCommitMetadata) {
|
||||
// NOTE(review): String.valueOf on a null Boolean yields the text "null" rather than failing - confirm callers never pass null.
clusteringConfig.setValue(CLUSTERING_PRESERVE_HOODIE_COMMIT_METADATA, String.valueOf(preserveHoodieCommitMetadata));
|
||||
return this;
|
||||
}
|
||||
|
||||
public HoodieClusteringConfig build() {
|
||||
clusteringConfig.setDefaults(HoodieClusteringConfig.class.getName());
|
||||
return clusteringConfig;
|
||||
|
||||
@@ -700,6 +700,10 @@ public class HoodieWriteConfig extends HoodieConfig {
|
||||
return getBoolean(HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE);
|
||||
}
|
||||
|
||||
/**
 * Reads the clustering "preserve commit metadata" flag from this write config.
 *
 * @return the boolean value of {@code HoodieClusteringConfig.CLUSTERING_PRESERVE_HOODIE_COMMIT_METADATA}
 */
public boolean isPreserveHoodieCommitMetadata() {
|
||||
return getBoolean(HoodieClusteringConfig.CLUSTERING_PRESERVE_HOODIE_COMMIT_METADATA);
|
||||
}
|
||||
|
||||
public boolean isClusteringEnabled() {
|
||||
// TODO: future support async clustering
|
||||
return inlineClusteringEnabled() || isAsyncClusteringEnabled();
|
||||
|
||||
@@ -25,12 +25,22 @@ import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
/**
 * Write-handle factory that produces {@code HoodieCreateHandle} instances, one per call to
 * {@code create}, each using the next file id derived from the supplied prefix.
 *
 * NOTE(review): some lines in this span appear twice (the parameter list of {@code create} and the
 * argument list of the handle constructor, with and without the trailing {@code preserveMetadata}).
 * This looks like both sides of a diff rendered together - confirm against the real file.
 */
public class CreateHandleFactory<T extends HoodieRecordPayload, I, K, O> extends WriteHandleFactory<T, I, K, O> {
|
||||
|
||||
// Flag handed to the created handle's constructor; false unless set through the one-argument constructor.
private boolean preserveMetadata = false;
|
||||
|
||||
/** Creates a factory with {@code preserveMetadata} set to {@code false}. */
public CreateHandleFactory() {
|
||||
this(false);
|
||||
}
|
||||
|
||||
/**
 * Creates a factory with an explicit metadata-preservation flag.
 *
 * @param preserveMetadata value stored and later passed to each created handle
 */
public CreateHandleFactory(boolean preserveMetadata) {
|
||||
this.preserveMetadata = preserveMetadata;
|
||||
}
|
||||
|
||||
/**
 * Builds a new {@code HoodieCreateHandle} for the given config, commit time, table and partition path.
 * The file id comes from {@code getNextFileId(fileIdPrefix)}.
 */
@Override
|
||||
public HoodieWriteHandle<T, I, K, O> create(final HoodieWriteConfig hoodieConfig, final String commitTime,
|
||||
final HoodieTable<T, I, K, O> hoodieTable, final String partitionPath,
|
||||
final String fileIdPrefix, TaskContextSupplier taskContextSupplier) {
|
||||
final HoodieTable<T, I, K, O> hoodieTable, final String partitionPath,
|
||||
final String fileIdPrefix, TaskContextSupplier taskContextSupplier) {
|
||||
|
||||
return new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, partitionPath,
|
||||
getNextFileId(fileIdPrefix), taskContextSupplier);
|
||||
getNextFileId(fileIdPrefix), taskContextSupplier, preserveMetadata);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,18 +59,33 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload, I, K, O> extends
|
||||
protected long recordsDeleted = 0;
|
||||
private Map<String, HoodieRecord<T>> recordMap;
|
||||
private boolean useWriterSchema = false;
|
||||
private boolean preserveHoodieMetadata = false;
|
||||
|
||||
/**
 * Convenience constructor without an overridden schema: delegates to a sibling constructor,
 * passing {@code Option.empty()} for the schema.
 *
 * NOTE(review): the delegation's final argument line appears twice below (with and without a
 * trailing {@code false}); this looks like old and new diff lines rendered together - confirm.
 */
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
|
||||
String partitionPath, String fileId, TaskContextSupplier taskContextSupplier) {
|
||||
this(config, instantTime, hoodieTable, partitionPath, fileId, Option.empty(),
|
||||
taskContextSupplier);
|
||||
taskContextSupplier, false);
|
||||
}
|
||||
|
||||
/**
 * Constructor without an overridden schema but with an explicit metadata-preservation flag.
 * Delegates to the full constructor, passing {@code Option.empty()} for the schema.
 *
 * @param preserveHoodieMetadata forwarded unchanged to the full constructor
 */
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
|
||||
String partitionPath, String fileId, TaskContextSupplier taskContextSupplier,
|
||||
boolean preserveHoodieMetadata) {
|
||||
this(config, instantTime, hoodieTable, partitionPath, fileId, Option.empty(),
|
||||
taskContextSupplier, preserveHoodieMetadata);
|
||||
}
|
||||
|
||||
/**
 * Constructor taking an optional overridden schema. Delegates to the full constructor with
 * {@code preserveHoodieMetadata} fixed to {@code false}.
 *
 * @param overriddenSchema optional schema forwarded unchanged to the full constructor
 */
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
|
||||
String partitionPath, String fileId, Option<Schema> overriddenSchema,
|
||||
TaskContextSupplier taskContextSupplier) {
|
||||
this(config, instantTime, hoodieTable, partitionPath, fileId, overriddenSchema, taskContextSupplier, false);
|
||||
}
|
||||
|
||||
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
|
||||
String partitionPath, String fileId, Option<Schema> overriddenSchema,
|
||||
TaskContextSupplier taskContextSupplier, boolean preserveHoodieMetadata) {
|
||||
super(config, instantTime, partitionPath, fileId, hoodieTable, overriddenSchema,
|
||||
taskContextSupplier);
|
||||
this.preserveHoodieMetadata = preserveHoodieMetadata;
|
||||
writeStatus.setFileId(fileId);
|
||||
writeStatus.setPartitionPath(partitionPath);
|
||||
writeStatus.setStat(new HoodieWriteStat());
|
||||
@@ -119,7 +134,11 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload, I, K, O> extends
|
||||
}
|
||||
// Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
|
||||
IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) avroRecord.get());
|
||||
fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record);
|
||||
if (preserveHoodieMetadata) {
|
||||
fileWriter.writeAvro(record.getRecordKey(), recordWithMetadataInSchema);
|
||||
} else {
|
||||
fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record);
|
||||
}
|
||||
// update the new location of record, so we know where to find it next
|
||||
record.unseal();
|
||||
record.setNewLocation(new HoodieRecordLocation(instantTime, writeStatus.getFileId()));
|
||||
|
||||
@@ -0,0 +1,51 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io;
|
||||
|
||||
import org.apache.hudi.common.engine.TaskContextSupplier;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
/**
|
||||
* A HoodieCreateHandle which writes all data into a single file.
|
||||
* <p>
|
||||
* Please use this with caution. This can end up creating very large files if not used correctly.
|
||||
*/
|
||||
public class HoodieUnboundedCreateHandle<T extends HoodieRecordPayload, I, K, O> extends HoodieCreateHandle<T, I, K, O> {
|
||||
|
||||
// NOTE(review): this logger is not referenced anywhere in the visible class body.
private static final Logger LOG = LogManager.getLogger(HoodieUnboundedCreateHandle.class);
|
||||
|
||||
/**
 * Creates the handle by delegating to the parent constructor with no overridden schema
 * ({@code Option.empty()}).
 *
 * @param preserveHoodieMetadata forwarded unchanged to the parent constructor
 */
public HoodieUnboundedCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
|
||||
String partitionPath, String fileId, TaskContextSupplier taskContextSupplier,
|
||||
boolean preserveHoodieMetadata) {
|
||||
super(config, instantTime, hoodieTable, partitionPath, fileId, Option.empty(),
|
||||
taskContextSupplier, preserveHoodieMetadata);
|
||||
}
|
||||
|
||||
/**
 * Always reports that the record can be written, regardless of the record passed in.
 *
 * @param record ignored
 * @return always {@code true}
 */
@Override
|
||||
public boolean canWrite(HoodieRecord record) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io;
|
||||
|
||||
import org.apache.hudi.common.engine.TaskContextSupplier;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
/**
|
||||
* A SingleFileHandleCreateFactory is used to write all data in the spark partition into a single data file.
|
||||
* <p>
|
||||
* Please use this with caution. This can end up creating very large files if not used correctly.
|
||||
*/
|
||||
public class SingleFileHandleCreateFactory<T extends HoodieRecordPayload, I, K, O> extends WriteHandleFactory<T, I, K, O> {
|
||||
|
||||
// Flipped to true by the first create() call; later calls see true and fail.
private AtomicBoolean isHandleCreated = new AtomicBoolean(false);
|
||||
// File id used for the single handle this factory creates.
private String fileId;
|
||||
// Passed through to the created handle's constructor.
private boolean preserveHoodieMetadata;
|
||||
|
||||
/**
 * @param fileId file id given to the single handle this factory creates
 * @param preserveHoodieMetadata flag passed to the created handle
 */
public SingleFileHandleCreateFactory(String fileId, boolean preserveHoodieMetadata) {
|
||||
super();
|
||||
this.fileId = fileId;
|
||||
this.preserveHoodieMetadata = preserveHoodieMetadata;
|
||||
}
|
||||
|
||||
/**
 * Returns a new {@code HoodieUnboundedCreateHandle} on the first invocation only.
 *
 * @param fileIdPrefix not used; the file id supplied at construction is used instead
 * @throws HoodieIOException on every invocation after the first
 */
@Override
|
||||
public HoodieWriteHandle<T, I, K, O> create(final HoodieWriteConfig hoodieConfig, final String commitTime,
|
||||
final HoodieTable<T, I, K, O> hoodieTable, final String partitionPath,
|
||||
final String fileIdPrefix, TaskContextSupplier taskContextSupplier) {
|
||||
|
||||
// compareAndSet succeeds only while the flag is still false, i.e. for the first caller.
if (isHandleCreated.compareAndSet(false, true)) {
|
||||
return new HoodieUnboundedCreateHandle(hoodieConfig, commitTime, hoodieTable, partitionPath,
|
||||
fileId, // the fileIdPrefix argument is ignored; the factory's fixed fileId is always used
|
||||
taskContextSupplier, preserveHoodieMetadata);
|
||||
}
|
||||
|
||||
throw new HoodieIOException("Fixed handle create is only expected to be invoked once");
|
||||
}
|
||||
}
|
||||
@@ -18,25 +18,27 @@
|
||||
|
||||
package org.apache.hudi.table.action.cluster.strategy;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hudi.avro.model.HoodieClusteringPlan;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Pluggable implementation for writing data into new file groups based on ClusteringPlan.
|
||||
*/
|
||||
public abstract class ClusteringExecutionStrategy<T extends HoodieRecordPayload,I,K,O> implements Serializable {
|
||||
public abstract class ClusteringExecutionStrategy<T extends HoodieRecordPayload, I, K, O> implements Serializable {
|
||||
private static final Logger LOG = LogManager.getLogger(ClusteringExecutionStrategy.class);
|
||||
|
||||
private final HoodieTable<T,I,K,O> hoodieTable;
|
||||
private final HoodieEngineContext engineContext;
|
||||
private final HoodieTable<T, I, K, O> hoodieTable;
|
||||
private final transient HoodieEngineContext engineContext;
|
||||
private final HoodieWriteConfig writeConfig;
|
||||
|
||||
public ClusteringExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
|
||||
@@ -50,10 +52,9 @@ public abstract class ClusteringExecutionStrategy<T extends HoodieRecordPayload,
|
||||
* file groups created is bounded by numOutputGroups.
|
||||
* Note that the commit is not performed as part of the strategy; committing is the caller's responsibility.
|
||||
*/
|
||||
public abstract O performClustering(final I inputRecords, final int numOutputGroups, final String instantTime,
|
||||
final Map<String, String> strategyParams, final Schema schema);
|
||||
public abstract HoodieWriteMetadata<O> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime);
|
||||
|
||||
/**
 * Returns the table held in this strategy's {@code hoodieTable} field.
 *
 * NOTE(review): the signature line appears twice below, differing only in whitespace inside the
 * type arguments; this looks like old and new diff lines rendered together - confirm.
 */
protected HoodieTable<T,I,K, O> getHoodieTable() {
|
||||
protected HoodieTable<T, I, K, O> getHoodieTable() {
|
||||
return this.hoodieTable;
|
||||
}
|
||||
|
||||
|
||||
@@ -97,6 +97,7 @@ public abstract class PartitionAwareClusteringPlanStrategy<T extends HoodieRecor
|
||||
.setInputGroups(clusteringGroups)
|
||||
.setExtraMetadata(getExtraMetadata())
|
||||
.setVersion(getPlanVersion())
|
||||
.setPreserveHoodieMetadata(getWriteConfig().isPreserveHoodieCommitMetadata())
|
||||
.build());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,5 +43,6 @@ public abstract class AbstractBulkInsertHelper<T extends HoodieRecordPayload, I,
|
||||
boolean performDedupe,
|
||||
Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner,
|
||||
boolean addMetadataFields,
|
||||
int parallelism);
|
||||
int parallelism,
|
||||
boolean preserveMetadata);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user