1
0

[HUDI-1468] Support custom clustering strategies and preserve commit metadata as part of clustering (#3419)

Co-authored-by: Satish Kotha <satishkotha@uber.com>
This commit is contained in:
Sagar Sumit
2021-08-07 08:23:08 +05:30
committed by GitHub
parent 9ce548edb1
commit 70b6bd485f
34 changed files with 1150 additions and 343 deletions

View File

@@ -125,7 +125,13 @@ public class HoodieClusteringConfig extends HoodieConfig {
.sinceVersion("0.7.0")
.withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table.");
private HoodieClusteringConfig() {
public static final ConfigProperty<Boolean> CLUSTERING_PRESERVE_HOODIE_COMMIT_METADATA = ConfigProperty
.key("hoodie.clustering.preserve.commit.metadata")
.defaultValue(false)
.sinceVersion("0.9.0")
.withDocumentation("When rewriting data, preserves existing hoodie_commit_time");
public HoodieClusteringConfig() {
super();
}
@@ -214,6 +220,11 @@ public class HoodieClusteringConfig extends HoodieConfig {
return this;
}
public Builder withPreserveHoodieCommitMetadata(Boolean preserveHoodieCommitMetadata) {
clusteringConfig.setValue(CLUSTERING_PRESERVE_HOODIE_COMMIT_METADATA, String.valueOf(preserveHoodieCommitMetadata));
return this;
}
public HoodieClusteringConfig build() {
clusteringConfig.setDefaults(HoodieClusteringConfig.class.getName());
return clusteringConfig;

View File

@@ -700,6 +700,10 @@ public class HoodieWriteConfig extends HoodieConfig {
return getBoolean(HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE);
}
public boolean isPreserveHoodieCommitMetadata() {
return getBoolean(HoodieClusteringConfig.CLUSTERING_PRESERVE_HOODIE_COMMIT_METADATA);
}
public boolean isClusteringEnabled() {
// TODO: future support async clustering
return inlineClusteringEnabled() || isAsyncClusteringEnabled();

View File

@@ -25,12 +25,22 @@ import org.apache.hudi.table.HoodieTable;
public class CreateHandleFactory<T extends HoodieRecordPayload, I, K, O> extends WriteHandleFactory<T, I, K, O> {
private boolean preserveMetadata = false;
public CreateHandleFactory() {
this(false);
}
public CreateHandleFactory(boolean preserveMetadata) {
this.preserveMetadata = preserveMetadata;
}
@Override
public HoodieWriteHandle<T, I, K, O> create(final HoodieWriteConfig hoodieConfig, final String commitTime,
final HoodieTable<T, I, K, O> hoodieTable, final String partitionPath,
final String fileIdPrefix, TaskContextSupplier taskContextSupplier) {
final HoodieTable<T, I, K, O> hoodieTable, final String partitionPath,
final String fileIdPrefix, TaskContextSupplier taskContextSupplier) {
return new HoodieCreateHandle(hoodieConfig, commitTime, hoodieTable, partitionPath,
getNextFileId(fileIdPrefix), taskContextSupplier);
getNextFileId(fileIdPrefix), taskContextSupplier, preserveMetadata);
}
}
}

View File

@@ -59,18 +59,33 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload, I, K, O> extends
protected long recordsDeleted = 0;
private Map<String, HoodieRecord<T>> recordMap;
private boolean useWriterSchema = false;
private boolean preserveHoodieMetadata = false;
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
String partitionPath, String fileId, TaskContextSupplier taskContextSupplier) {
this(config, instantTime, hoodieTable, partitionPath, fileId, Option.empty(),
taskContextSupplier);
taskContextSupplier, false);
}
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
String partitionPath, String fileId, TaskContextSupplier taskContextSupplier,
boolean preserveHoodieMetadata) {
this(config, instantTime, hoodieTable, partitionPath, fileId, Option.empty(),
taskContextSupplier, preserveHoodieMetadata);
}
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
String partitionPath, String fileId, Option<Schema> overriddenSchema,
TaskContextSupplier taskContextSupplier) {
this(config, instantTime, hoodieTable, partitionPath, fileId, overriddenSchema, taskContextSupplier, false);
}
public HoodieCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
String partitionPath, String fileId, Option<Schema> overriddenSchema,
TaskContextSupplier taskContextSupplier, boolean preserveHoodieMetadata) {
super(config, instantTime, partitionPath, fileId, hoodieTable, overriddenSchema,
taskContextSupplier);
this.preserveHoodieMetadata = preserveHoodieMetadata;
writeStatus.setFileId(fileId);
writeStatus.setPartitionPath(partitionPath);
writeStatus.setStat(new HoodieWriteStat());
@@ -119,7 +134,11 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload, I, K, O> extends
}
// Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) avroRecord.get());
fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record);
if (preserveHoodieMetadata) {
fileWriter.writeAvro(record.getRecordKey(), recordWithMetadataInSchema);
} else {
fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record);
}
// update the new location of record, so we know where to find it next
record.unseal();
record.setNewLocation(new HoodieRecordLocation(instantTime, writeStatus.getFileId()));

View File

@@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.io;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
/**
* A HoodieCreateHandle which writes all data into a single file.
* <p>
* Please use this with caution. This can end up creating very large files if not used correctly.
*/
/**
 * A HoodieCreateHandle which writes all data into a single file.
 * <p>
 * Please use this with caution. This can end up creating very large files if not used correctly.
 */
public class HoodieUnboundedCreateHandle<T extends HoodieRecordPayload, I, K, O> extends HoodieCreateHandle<T, I, K, O> {

  private static final Logger LOG = LogManager.getLogger(HoodieUnboundedCreateHandle.class);

  /**
   * Delegates straight to the parent create handle with no schema override.
   *
   * @param preserveHoodieMetadata whether existing hoodie commit metadata should be kept when rewriting records
   */
  public HoodieUnboundedCreateHandle(HoodieWriteConfig config, String instantTime, HoodieTable<T, I, K, O> hoodieTable,
                                     String partitionPath, String fileId, TaskContextSupplier taskContextSupplier,
                                     boolean preserveHoodieMetadata) {
    super(config, instantTime, hoodieTable, partitionPath, fileId, Option.empty(),
        taskContextSupplier, preserveHoodieMetadata);
  }

  /**
   * Always reports capacity — this handle is unbounded by design and never
   * rolls over to a new file, whatever the parent's size checks would say.
   */
  @Override
  public boolean canWrite(HoodieRecord record) {
    return true;
  }
}

View File

@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.io;
import org.apache.hudi.common.engine.TaskContextSupplier;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.table.HoodieTable;
import java.util.concurrent.atomic.AtomicBoolean;
/**
* A SingleFileHandleCreateFactory is used to write all data in the spark partition into a single data file.
* <p>
* Please use this with caution. This can end up creating very large files if not used correctly.
*/
/**
 * A SingleFileHandleCreateFactory is used to write all data in the spark partition into a single data file.
 * <p>
 * Please use this with caution. This can end up creating very large files if not used correctly.
 */
public class SingleFileHandleCreateFactory<T extends HoodieRecordPayload, I, K, O> extends WriteHandleFactory<T, I, K, O> {

  // Guards the single-use contract: flips to true on the first (and only legal) create() call.
  private final AtomicBoolean isHandleCreated = new AtomicBoolean(false);
  // The one file id every record is written to; the fileIdPrefix passed to create() is ignored.
  private final String fileId;
  // Whether the handle should keep existing hoodie commit metadata when rewriting records.
  private final boolean preserveHoodieMetadata;

  public SingleFileHandleCreateFactory(String fileId, boolean preserveHoodieMetadata) {
    super();
    this.fileId = fileId;
    this.preserveHoodieMetadata = preserveHoodieMetadata;
  }

  /**
   * Creates the single unbounded write handle for this factory.
   *
   * @return a {@link HoodieUnboundedCreateHandle} targeting the fixed file id
   * @throws HoodieIOException if invoked more than once on the same factory instance
   */
  @Override
  public HoodieWriteHandle<T, I, K, O> create(final HoodieWriteConfig hoodieConfig, final String commitTime,
                                              final HoodieTable<T, I, K, O> hoodieTable, final String partitionPath,
                                              final String fileIdPrefix, TaskContextSupplier taskContextSupplier) {
    if (isHandleCreated.compareAndSet(false, true)) {
      // Diamond operator instead of a raw type so the handle stays type-checked (no unchecked warning).
      return new HoodieUnboundedCreateHandle<>(hoodieConfig, commitTime, hoodieTable, partitionPath,
          fileId, // ignore idPfx, always use same fileId
          taskContextSupplier, preserveHoodieMetadata);
    }
    throw new HoodieIOException("Fixed handle create is only expected to be invoked once");
  }
}

View File

@@ -18,25 +18,27 @@
package org.apache.hudi.table.action.cluster.strategy;
import org.apache.avro.Schema;
import org.apache.hudi.avro.model.HoodieClusteringPlan;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import org.apache.avro.Schema;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.Serializable;
import java.util.Map;
/**
* Pluggable implementation for writing data into new file groups based on ClusteringPlan.
*/
public abstract class ClusteringExecutionStrategy<T extends HoodieRecordPayload,I,K,O> implements Serializable {
public abstract class ClusteringExecutionStrategy<T extends HoodieRecordPayload, I, K, O> implements Serializable {
private static final Logger LOG = LogManager.getLogger(ClusteringExecutionStrategy.class);
private final HoodieTable<T,I,K,O> hoodieTable;
private final HoodieEngineContext engineContext;
private final HoodieTable<T, I, K, O> hoodieTable;
private final transient HoodieEngineContext engineContext;
private final HoodieWriteConfig writeConfig;
public ClusteringExecutionStrategy(HoodieTable table, HoodieEngineContext engineContext, HoodieWriteConfig writeConfig) {
@@ -50,10 +52,9 @@ public abstract class ClusteringExecutionStrategy<T extends HoodieRecordPayload,
* file groups created is bounded by numOutputGroups.
 * Note that commit is not done as part of the strategy; commit is the caller's responsibility.
*/
public abstract O performClustering(final I inputRecords, final int numOutputGroups, final String instantTime,
final Map<String, String> strategyParams, final Schema schema);
public abstract HoodieWriteMetadata<O> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime);
protected HoodieTable<T,I,K, O> getHoodieTable() {
protected HoodieTable<T, I, K, O> getHoodieTable() {
return this.hoodieTable;
}

View File

@@ -97,6 +97,7 @@ public abstract class PartitionAwareClusteringPlanStrategy<T extends HoodieRecor
.setInputGroups(clusteringGroups)
.setExtraMetadata(getExtraMetadata())
.setVersion(getPlanVersion())
.setPreserveHoodieMetadata(getWriteConfig().isPreserveHoodieCommitMetadata())
.build());
}
}

View File

@@ -43,5 +43,6 @@ public abstract class AbstractBulkInsertHelper<T extends HoodieRecordPayload, I,
boolean performDedupe,
Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner,
boolean addMetadataFields,
int parallelism);
int parallelism,
boolean preserveMetadata);
}