1
0
Files
hudi/hudi-client/src/main/java/org/apache/hudi/HoodieWriteClient.java
leesf b19bed442d [HUDI-296] Explore use of spotless to auto fix formatting errors (#945)
- Add spotless format fixing to project
- One time reformatting for conformity
- Build fails for formatting changes and mvn spotless:apply autofixes them
2019-10-10 05:19:40 -07:00

1347 lines
65 KiB
Java

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi;
import com.codahale.metrics.Timer;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
import org.apache.hudi.avro.model.HoodieSavepointMetadata;
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
import org.apache.hudi.common.HoodieCleanStat;
import org.apache.hudi.common.HoodieRollbackStat;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieDataFile;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieRollingStat;
import org.apache.hudi.common.model.HoodieRollingStatMetadata;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.HoodieTimeline;
import org.apache.hudi.common.table.TableFileSystemView.ReadOptimizedView;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieInstant.State;
import org.apache.hudi.common.util.AvroUtils;
import org.apache.hudi.common.util.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieCommitException;
import org.apache.hudi.exception.HoodieCompactionException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieInsertException;
import org.apache.hudi.exception.HoodieRollbackException;
import org.apache.hudi.exception.HoodieSavepointException;
import org.apache.hudi.exception.HoodieUpsertException;
import org.apache.hudi.func.BulkInsertMapFunction;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.io.HoodieCommitArchiveLog;
import org.apache.hudi.metrics.HoodieMetrics;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.UserDefinedBulkInsertPartitioner;
import org.apache.hudi.table.WorkloadProfile;
import org.apache.hudi.table.WorkloadStat;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
/**
* Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient mutations on a HDFS
* dataset [upsert()]
* <p>
* Note that, at any given time, there can only be one Spark job performing these operatons on a Hoodie dataset.
*/
public class HoodieWriteClient<T extends HoodieRecordPayload> extends AbstractHoodieClient {
private static Logger logger = LogManager.getLogger(HoodieWriteClient.class);
private final boolean rollbackInFlight;
private final transient HoodieMetrics metrics;
private final transient HoodieIndex<T> index;
private transient Timer.Context writeContext = null;
private transient Timer.Context compactionTimer;
private transient Timer.Context indexTimer = null;
/**
* @param jsc
* @param clientConfig
* @throws Exception
*/
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig) throws Exception {
this(jsc, clientConfig, false);
}
/**
* @param jsc
* @param clientConfig
* @param rollbackInFlight
*/
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight) {
this(jsc, clientConfig, rollbackInFlight, HoodieIndex.createIndex(clientConfig, jsc));
}
@VisibleForTesting
HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight, HoodieIndex index) {
this(jsc, clientConfig, rollbackInFlight, index, Option.empty());
}
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight,
HoodieIndex index, Option<EmbeddedTimelineService> timelineService) {
super(jsc, clientConfig, timelineService);
this.index = index;
this.metrics = new HoodieMetrics(config, config.getTableName());
this.rollbackInFlight = rollbackInFlight;
}
public static SparkConf registerClasses(SparkConf conf) {
conf.registerKryoClasses(new Class[] {HoodieWriteConfig.class, HoodieRecord.class, HoodieKey.class});
return conf;
}
/**
* Filter out HoodieRecords that already exists in the output folder. This is useful in deduplication.
*
* @param hoodieRecords Input RDD of Hoodie records.
* @return A subset of hoodieRecords RDD, with existing records filtered out.
*/
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
indexTimer = metrics.getIndexCtx();
JavaRDD<HoodieRecord<T>> recordsWithLocation = index.tagLocation(hoodieRecords, jsc, table);
metrics.updateIndexMetrics("lookup", metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop()));
indexTimer = null;
return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
}
/**
* Upserts a bunch of new records into the Hoodie table, at the supplied commitTime
*/
public JavaRDD<WriteStatus> upsert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
HoodieTable<T> table = getTableAndInitCtx(records);
try {
// De-dupe/merge if needed
JavaRDD<HoodieRecord<T>> dedupedRecords =
combineOnCondition(config.shouldCombineBeforeUpsert(), records, config.getUpsertShuffleParallelism());
indexTimer = metrics.getIndexCtx();
// perform index loop up to get existing location of records
JavaRDD<HoodieRecord<T>> taggedRecords = index.tagLocation(dedupedRecords, jsc, table);
metrics.updateIndexMetrics("lookup", metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop()));
indexTimer = null;
return upsertRecordsInternal(taggedRecords, commitTime, table, true);
} catch (Throwable e) {
if (e instanceof HoodieUpsertException) {
throw (HoodieUpsertException) e;
}
throw new HoodieUpsertException("Failed to upsert for commit time " + commitTime, e);
}
}
/**
* Upserts the given prepared records into the Hoodie table, at the supplied commitTime.
* <p>
* This implementation requires that the input records are already tagged, and de-duped if needed.
*
* @param preppedRecords Prepared HoodieRecords to upsert
* @param commitTime Commit Time handle
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/
public JavaRDD<WriteStatus> upsertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords, final String commitTime) {
HoodieTable<T> table = getTableAndInitCtx(preppedRecords);
try {
return upsertRecordsInternal(preppedRecords, commitTime, table, true);
} catch (Throwable e) {
if (e instanceof HoodieUpsertException) {
throw (HoodieUpsertException) e;
}
throw new HoodieUpsertException("Failed to upsert prepared records for commit time " + commitTime, e);
}
}
/**
* Inserts the given HoodieRecords, into the table. This API is intended to be used for normal writes.
* <p>
* This implementation skips the index check and is able to leverage benefits such as small file handling/blocking
* alignment, as with upsert(), by profiling the workload
*
* @param records HoodieRecords to insert
* @param commitTime Commit Time handle
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/
public JavaRDD<WriteStatus> insert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
HoodieTable<T> table = getTableAndInitCtx(records);
try {
// De-dupe/merge if needed
JavaRDD<HoodieRecord<T>> dedupedRecords =
combineOnCondition(config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism());
return upsertRecordsInternal(dedupedRecords, commitTime, table, false);
} catch (Throwable e) {
if (e instanceof HoodieInsertException) {
throw e;
}
throw new HoodieInsertException("Failed to insert for commit time " + commitTime, e);
}
}
/**
* Inserts the given prepared records into the Hoodie table, at the supplied commitTime.
* <p>
* This implementation skips the index check, skips de-duping and is able to leverage benefits such as small file
* handling/blocking alignment, as with insert(), by profiling the workload. The prepared HoodieRecords should be
* de-duped if needed.
*
* @param preppedRecords HoodieRecords to insert
* @param commitTime Commit Time handle
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/
public JavaRDD<WriteStatus> insertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords, final String commitTime) {
HoodieTable<T> table = getTableAndInitCtx(preppedRecords);
try {
return upsertRecordsInternal(preppedRecords, commitTime, table, false);
} catch (Throwable e) {
if (e instanceof HoodieInsertException) {
throw e;
}
throw new HoodieInsertException("Failed to insert prepared records for commit time " + commitTime, e);
}
}
/**
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie
* table for the very first time (e.g: converting an existing dataset to Hoodie).
* <p>
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control
* the numbers of files with less memory compared to the {@link HoodieWriteClient#insert(JavaRDD, String)}
*
* @param records HoodieRecords to insert
* @param commitTime Commit Time handle
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
return bulkInsert(records, commitTime, Option.empty());
}
/**
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie
* table for the very first time (e.g: converting an existing dataset to Hoodie).
* <p>
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control
* the numbers of files with less memory compared to the {@link HoodieWriteClient#insert(JavaRDD, String)}. Optionally
* it allows users to specify their own partitioner. If specified then it will be used for repartitioning records. See
* {@link UserDefinedBulkInsertPartitioner}.
*
* @param records HoodieRecords to insert
* @param commitTime Commit Time handle
* @param bulkInsertPartitioner If specified then it will be used to partition input records before they are inserted
* into hoodie.
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/
public JavaRDD<WriteStatus> bulkInsert(JavaRDD<HoodieRecord<T>> records, final String commitTime,
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
HoodieTable<T> table = getTableAndInitCtx(records);
try {
// De-dupe/merge if needed
JavaRDD<HoodieRecord<T>> dedupedRecords =
combineOnCondition(config.shouldCombineBeforeInsert(), records, config.getInsertShuffleParallelism());
return bulkInsertInternal(dedupedRecords, commitTime, table, bulkInsertPartitioner);
} catch (Throwable e) {
if (e instanceof HoodieInsertException) {
throw e;
}
throw new HoodieInsertException("Failed to bulk insert for commit time " + commitTime, e);
}
}
/**
* Loads the given HoodieRecords, as inserts into the table. This is suitable for doing big bulk loads into a Hoodie
* table for the very first time (e.g: converting an existing dataset to Hoodie). The input records should contain no
* duplicates if needed.
* <p>
* This implementation uses sortBy (which does range partitioning based on reservoir sampling) and attempts to control
* the numbers of files with less memory compared to the {@link HoodieWriteClient#insert(JavaRDD, String)}. Optionally
* it allows users to specify their own partitioner. If specified then it will be used for repartitioning records. See
* {@link UserDefinedBulkInsertPartitioner}.
*
* @param preppedRecords HoodieRecords to insert
* @param commitTime Commit Time handle
* @param bulkInsertPartitioner If specified then it will be used to partition input records before they are inserted
* into hoodie.
* @return JavaRDD[WriteStatus] - RDD of WriteStatus to inspect errors and counts
*/
public JavaRDD<WriteStatus> bulkInsertPreppedRecords(JavaRDD<HoodieRecord<T>> preppedRecords, final String commitTime,
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
HoodieTable<T> table = getTableAndInitCtx(preppedRecords);
try {
return bulkInsertInternal(preppedRecords, commitTime, table, bulkInsertPartitioner);
} catch (Throwable e) {
if (e instanceof HoodieInsertException) {
throw e;
}
throw new HoodieInsertException("Failed to bulk insert prepared records for commit time " + commitTime, e);
}
}
private JavaRDD<WriteStatus> bulkInsertInternal(JavaRDD<HoodieRecord<T>> dedupedRecords, String commitTime,
HoodieTable<T> table, Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
final JavaRDD<HoodieRecord<T>> repartitionedRecords;
final int parallelism = config.getBulkInsertShuffleParallelism();
if (bulkInsertPartitioner.isPresent()) {
repartitionedRecords = bulkInsertPartitioner.get().repartitionRecords(dedupedRecords, parallelism);
} else {
// Now, sort the records and line them up nicely for loading.
repartitionedRecords = dedupedRecords.sortBy(record -> {
// Let's use "partitionPath + key" as the sort key. Spark, will ensure
// the records split evenly across RDD partitions, such that small partitions fit
// into 1 RDD partition, while big ones spread evenly across multiple RDD partitions
return String.format("%s+%s", record.getPartitionPath(), record.getRecordKey());
}, true, parallelism);
}
// generate new file ID prefixes for each output partition
final List<String> fileIDPrefixes =
IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList());
JavaRDD<WriteStatus> writeStatusRDD = repartitionedRecords
.mapPartitionsWithIndex(new BulkInsertMapFunction<T>(commitTime, config, table, fileIDPrefixes), true)
.flatMap(writeStatuses -> writeStatuses.iterator());
return updateIndexAndCommitIfNeeded(writeStatusRDD, table, commitTime);
}
private void commitOnAutoCommit(String commitTime, JavaRDD<WriteStatus> resultRDD, String actionType) {
if (config.shouldAutoCommit()) {
logger.info("Auto commit enabled: Committing " + commitTime);
boolean commitResult = commit(commitTime, resultRDD, Option.empty(), actionType);
if (!commitResult) {
throw new HoodieCommitException("Failed to commit " + commitTime);
}
} else {
logger.info("Auto commit disabled for " + commitTime);
}
}
private JavaRDD<HoodieRecord<T>> combineOnCondition(boolean condition, JavaRDD<HoodieRecord<T>> records,
int parallelism) {
if (condition) {
return deduplicateRecords(records, parallelism);
}
return records;
}
/**
* Save the workload profile in an intermediate file (here re-using commit files) This is useful when performing
* rollback for MOR datasets. Only updates are recorded in the workload profile metadata since updates to log blocks
* are unknown across batches Inserts (which are new parquet files) are rolled back based on commit time. // TODO :
* Create a new WorkloadProfile metadata file instead of using HoodieCommitMetadata
*/
private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, HoodieTable<T> table, String commitTime)
throws HoodieCommitException {
try {
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
profile.getPartitionPaths().stream().forEach(path -> {
WorkloadStat partitionStat = profile.getWorkloadStat(path.toString());
partitionStat.getUpdateLocationToCount().entrySet().stream().forEach(entry -> {
HoodieWriteStat writeStat = new HoodieWriteStat();
writeStat.setFileId(entry.getKey());
// TODO : Write baseCommitTime is possible here ?
writeStat.setPrevCommit(entry.getValue().getKey());
writeStat.setNumUpdateWrites(entry.getValue().getValue());
metadata.addWriteStat(path.toString(), writeStat);
});
});
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
Option<HoodieInstant> instant =
activeTimeline.getCommitsTimeline().filterInflightsExcludingCompaction().lastInstant();
activeTimeline.saveToInflight(instant.get(), Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} catch (IOException io) {
throw new HoodieCommitException("Failed to commit " + commitTime + " unable to save inflight metadata ", io);
}
}
private JavaRDD<WriteStatus> upsertRecordsInternal(JavaRDD<HoodieRecord<T>> preppedRecords, String commitTime,
HoodieTable<T> hoodieTable, final boolean isUpsert) {
// Cache the tagged records, so we don't end up computing both
// TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
if (preppedRecords.getStorageLevel() == StorageLevel.NONE()) {
preppedRecords.persist(StorageLevel.MEMORY_AND_DISK_SER());
} else {
logger.info("RDD PreppedRecords was persisted at: " + preppedRecords.getStorageLevel());
}
WorkloadProfile profile = null;
if (hoodieTable.isWorkloadProfileNeeded()) {
profile = new WorkloadProfile(preppedRecords);
logger.info("Workload profile :" + profile);
saveWorkloadProfileMetadataToInflight(profile, hoodieTable, commitTime);
}
// partition using the insert partitioner
final Partitioner partitioner = getPartitioner(hoodieTable, isUpsert, profile);
JavaRDD<HoodieRecord<T>> partitionedRecords = partition(preppedRecords, partitioner);
JavaRDD<WriteStatus> writeStatusRDD = partitionedRecords.mapPartitionsWithIndex((partition, recordItr) -> {
if (isUpsert) {
return hoodieTable.handleUpsertPartition(commitTime, partition, recordItr, partitioner);
} else {
return hoodieTable.handleInsertPartition(commitTime, partition, recordItr, partitioner);
}
}, true).flatMap(List::iterator);
return updateIndexAndCommitIfNeeded(writeStatusRDD, hoodieTable, commitTime);
}
private Partitioner getPartitioner(HoodieTable table, boolean isUpsert, WorkloadProfile profile) {
if (isUpsert) {
return table.getUpsertPartitioner(profile);
} else {
return table.getInsertPartitioner(profile);
}
}
private JavaRDD<WriteStatus> updateIndexAndCommitIfNeeded(JavaRDD<WriteStatus> writeStatusRDD, HoodieTable<T> table,
String commitTime) {
// cache writeStatusRDD before updating index, so that all actions before this are not triggered again for future
// RDD actions that are performed after updating the index.
writeStatusRDD = writeStatusRDD.persist(config.getWriteStatusStorageLevel());
indexTimer = metrics.getIndexCtx();
// Update the index back
JavaRDD<WriteStatus> statuses = index.updateLocation(writeStatusRDD, jsc, table);
metrics.updateIndexMetrics("update", metrics.getDurationInMs(indexTimer == null ? 0L : indexTimer.stop()));
indexTimer = null;
// Trigger the insert and collect statuses
commitOnAutoCommit(commitTime, statuses, table.getMetaClient().getCommitActionType());
return statuses;
}
private JavaRDD<HoodieRecord<T>> partition(JavaRDD<HoodieRecord<T>> dedupedRecords, Partitioner partitioner) {
return dedupedRecords.mapToPair(
record -> new Tuple2<>(new Tuple2<>(record.getKey(), Option.ofNullable(record.getCurrentLocation())), record))
.partitionBy(partitioner).map(Tuple2::_2);
}
/**
* Commit changes performed at the given commitTime marker
*/
public boolean commit(String commitTime, JavaRDD<WriteStatus> writeStatuses) {
return commit(commitTime, writeStatuses, Option.empty());
}
/**
* Commit changes performed at the given commitTime marker
*/
public boolean commit(String commitTime, JavaRDD<WriteStatus> writeStatuses,
Option<Map<String, String>> extraMetadata) {
HoodieTableMetaClient metaClient = createMetaClient(false);
return commit(commitTime, writeStatuses, extraMetadata, metaClient.getCommitActionType());
}
private boolean commit(String commitTime, JavaRDD<WriteStatus> writeStatuses,
Option<Map<String, String>> extraMetadata, String actionType) {
logger.info("Commiting " + commitTime);
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
List<HoodieWriteStat> stats = writeStatuses.map(WriteStatus::getStat).collect();
updateMetadataAndRollingStats(actionType, metadata, stats);
// Finalize write
finalizeWrite(table, commitTime, stats);
// add in extra metadata
if (extraMetadata.isPresent()) {
extraMetadata.get().forEach(metadata::addMetadata);
}
try {
activeTimeline.saveAsComplete(new HoodieInstant(true, actionType, commitTime),
Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
// Save was a success & Do a inline compaction if enabled
if (config.isInlineCompaction()) {
metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT_PROP, "true");
forceCompact(extraMetadata);
} else {
metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT_PROP, "false");
}
// We cannot have unbounded commit files. Archive commits if we have to archive
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(config, createMetaClient(true));
archiveLog.archiveIfRequired(jsc);
if (config.isAutoClean()) {
// Call clean to cleanup if there is anything to cleanup after the commit,
logger.info("Auto cleaning is enabled. Running cleaner now");
clean(commitTime);
} else {
logger.info("Auto cleaning is not enabled. Not running cleaner now");
}
if (writeContext != null) {
long durationInMs = metrics.getDurationInMs(writeContext.stop());
metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime(), durationInMs,
metadata, actionType);
writeContext = null;
}
logger.info("Committed " + commitTime);
} catch (IOException e) {
throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + commitTime,
e);
} catch (ParseException e) {
throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + commitTime
+ "Instant time is not of valid format", e);
}
return true;
}
/**
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will be referenced in the
* savepoint and will never be cleaned. The savepointed commit will never be rolledback or archived.
* <p>
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be manually created and
* deleted.
* <p>
* Savepoint should be on a commit that could not have been cleaned.
*
* @param user - User creating the savepoint
* @param comment - Comment for the savepoint
* @return true if the savepoint was created successfully
*/
public boolean savepoint(String user, String comment) {
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
if (table.getCompletedCommitsTimeline().empty()) {
throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty");
}
if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
throw new UnsupportedOperationException("Savepointing is not supported or MergeOnRead table types");
}
String latestCommit = table.getCompletedCommitsTimeline().lastInstant().get().getTimestamp();
logger.info("Savepointing latest commit " + latestCommit);
return savepoint(latestCommit, user, comment);
}
/**
* Savepoint a specific commit. Latest version of data files as of the passed in commitTime will be referenced in the
* savepoint and will never be cleaned. The savepointed commit will never be rolledback or archived.
* <p>
* This gives an option to rollback the state to the savepoint anytime. Savepoint needs to be manually created and
* deleted.
* <p>
* Savepoint should be on a commit that could not have been cleaned.
*
* @param commitTime - commit that should be savepointed
* @param user - User creating the savepoint
* @param comment - Comment for the savepoint
* @return true if the savepoint was created successfully
*/
public boolean savepoint(String commitTime, String user, String comment) {
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
throw new UnsupportedOperationException("Savepointing is not supported or MergeOnRead table types");
}
Option<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
if (!table.getCompletedCommitsTimeline().containsInstant(commitInstant)) {
throw new HoodieSavepointException("Could not savepoint non-existing commit " + commitInstant);
}
try {
// Check the last commit that was not cleaned and check if savepoint time is > that commit
String lastCommitRetained;
if (cleanInstant.isPresent()) {
HoodieCleanMetadata cleanMetadata = AvroUtils
.deserializeHoodieCleanMetadata(table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get());
lastCommitRetained = cleanMetadata.getEarliestCommitToRetain();
} else {
lastCommitRetained = table.getCompletedCommitsTimeline().firstInstant().get().getTimestamp();
}
// Cannot allow savepoint time on a commit that could have been cleaned
Preconditions.checkArgument(
HoodieTimeline.compareTimestamps(commitTime, lastCommitRetained, HoodieTimeline.GREATER_OR_EQUAL),
"Could not savepoint commit " + commitTime + " as this is beyond the lookup window " + lastCommitRetained);
Map<String, List<String>> latestFilesMap = jsc
.parallelize(FSUtils.getAllPartitionPaths(fs, table.getMetaClient().getBasePath(),
config.shouldAssumeDatePartitioning()))
.mapToPair((PairFunction<String, String, List<String>>) partitionPath -> {
// Scan all partitions files with this commit time
logger.info("Collecting latest files in partition path " + partitionPath);
ReadOptimizedView view = table.getROFileSystemView();
List<String> latestFiles = view.getLatestDataFilesBeforeOrOn(partitionPath, commitTime)
.map(HoodieDataFile::getFileName).collect(Collectors.toList());
return new Tuple2<>(partitionPath, latestFiles);
}).collectAsMap();
HoodieSavepointMetadata metadata = AvroUtils.convertSavepointMetadata(user, comment, latestFilesMap);
// Nothing to save in the savepoint
table.getActiveTimeline().saveAsComplete(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, commitTime),
AvroUtils.serializeSavepointMetadata(metadata));
logger.info("Savepoint " + commitTime + " created");
return true;
} catch (IOException e) {
throw new HoodieSavepointException("Failed to savepoint " + commitTime, e);
}
}
/**
* Delete a savepoint that was created. Once the savepoint is deleted, the commit can be rolledback and cleaner may
* clean up data files.
*
* @param savepointTime - delete the savepoint
* @return true if the savepoint was deleted successfully
*/
public void deleteSavepoint(String savepointTime) {
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
throw new UnsupportedOperationException("Savepointing is not supported or MergeOnRead table types");
}
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
boolean isSavepointPresent = table.getCompletedSavepointTimeline().containsInstant(savePoint);
if (!isSavepointPresent) {
logger.warn("No savepoint present " + savepointTime);
return;
}
activeTimeline.revertToInflight(savePoint);
activeTimeline.deleteInflight(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, savepointTime));
logger.info("Savepoint " + savepointTime + " deleted");
}
/**
* Delete a compaction request that is pending.
*
* NOTE - This is an Admin operation. With async compaction, this is expected to be called with async compaction and
* write shutdown. Otherwise, async compactor could fail with errors
*
* @param compactionTime - delete the compaction time
*/
private void deleteRequestedCompaction(String compactionTime) {
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieInstant compactionRequestedInstant =
new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionTime);
boolean isCompactionInstantInRequestedState =
table.getActiveTimeline().filterPendingCompactionTimeline().containsInstant(compactionRequestedInstant);
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();
if (commitTimeline.empty() && !commitTimeline.findInstantsAfter(compactionTime, Integer.MAX_VALUE).empty()) {
throw new HoodieRollbackException(
"Found commits after time :" + compactionTime + ", please rollback greater commits first");
}
if (isCompactionInstantInRequestedState) {
activeTimeline.deleteCompactionRequested(compactionRequestedInstant);
} else {
throw new IllegalArgumentException("Compaction is not in requested state " + compactionTime);
}
logger.info("Compaction " + compactionTime + " deleted");
}
/**
* Rollback the state to the savepoint. WARNING: This rollsback recent commits and deleted data files. Queries
* accessing the files will mostly fail. This should be done during a downtime.
*
* @param savepointTime - savepoint time to rollback to
* @return true if the savepoint was rollecback to successfully
*/
public boolean rollbackToSavepoint(String savepointTime) {
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
// Rollback to savepoint is expected to be a manual operation and no concurrent write or compaction is expected
// to be running. Rollback to savepoint also removes any pending compaction actions that are generated after
// savepoint time. Allowing pending compaction to be retained is not safe as those workload could be referencing
// file-slices that will be rolled-back as part of this operation
HoodieTimeline commitTimeline = table.getMetaClient().getCommitsAndCompactionTimeline();
HoodieInstant savePoint = new HoodieInstant(false, HoodieTimeline.SAVEPOINT_ACTION, savepointTime);
boolean isSavepointPresent = table.getCompletedSavepointTimeline().containsInstant(savePoint);
if (!isSavepointPresent) {
throw new HoodieRollbackException("No savepoint for commitTime " + savepointTime);
}
List<String> commitsToRollback = commitTimeline.findInstantsAfter(savepointTime, Integer.MAX_VALUE).getInstants()
.map(HoodieInstant::getTimestamp).collect(Collectors.toList());
logger.info("Rolling back commits " + commitsToRollback);
restoreToInstant(savepointTime);
// Make sure the rollback was successful
Option<HoodieInstant> lastInstant =
activeTimeline.reload().getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants().lastInstant();
Preconditions.checkArgument(lastInstant.isPresent());
Preconditions.checkArgument(lastInstant.get().getTimestamp().equals(savepointTime),
savepointTime + "is not the last commit after rolling back " + commitsToRollback + ", last commit was "
+ lastInstant.get().getTimestamp());
return true;
}
/**
* Rollback the (inflight/committed) record changes with the given commit time. Three steps: (1) Atomically unpublish
* this commit (2) clean indexing data, (3) clean new generated parquet files. (4) Finally delete .commit or .inflight
* file,
*/
public boolean rollback(final String commitTime) throws HoodieRollbackException {
rollbackInternal(commitTime);
return true;
}
/**
* NOTE : This action requires all writers (ingest and compact) to a dataset to be stopped before proceeding. Revert
* the (inflight/committed) record changes for all commits after the provided @param. Three steps: (1) Atomically
* unpublish this commit (2) clean indexing data, (3) clean new generated parquet/log files and/or append rollback to
* existing log files. (4) Finally delete .commit, .inflight, .compaction.inflight or .compaction.requested file
*/
public void restoreToInstant(final String instantTime) throws HoodieRollbackException {
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
// Get all the commits on the timeline after the provided commit time
List<HoodieInstant> instantsToRollback = table.getActiveTimeline().getCommitsAndCompactionTimeline().getInstants()
.filter(instant -> HoodieActiveTimeline.GREATER.test(instant.getTimestamp(), instantTime))
.collect(Collectors.toList());
// reverse the commits to descending order of commit time
Collections.reverse(instantsToRollback);
// Start a rollback instant for all commits to be rolled back
String startRollbackInstant = startInstant();
// Start the timer
final Timer.Context context = startContext();
ImmutableMap.Builder<String, List<HoodieRollbackStat>> instantsToStats = ImmutableMap.builder();
instantsToRollback.stream().forEach(instant -> {
try {
switch (instant.getAction()) {
case HoodieTimeline.COMMIT_ACTION:
case HoodieTimeline.DELTA_COMMIT_ACTION:
List<HoodieRollbackStat> statsForInstant = doRollbackAndGetStats(instant.getTimestamp());
instantsToStats.put(instant.getTimestamp(), statsForInstant);
break;
case HoodieTimeline.COMPACTION_ACTION:
// TODO : Get file status and create a rollback stat and file
// TODO : Delete the .aux files along with the instant file, okay for now since the archival process will
// delete these files when it does not see a corresponding instant file under .hoodie
List<HoodieRollbackStat> statsForCompaction = doRollbackAndGetStats(instant.getTimestamp());
instantsToStats.put(instant.getTimestamp(), statsForCompaction);
logger.info("Deleted compaction instant " + instant);
break;
default:
throw new IllegalArgumentException("invalid action name " + instant.getAction());
}
} catch (IOException io) {
throw new HoodieRollbackException("unable to rollback instant " + instant, io);
}
});
try {
finishRestore(context, instantsToStats.build(),
instantsToRollback.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()),
startRollbackInstant, instantTime);
} catch (IOException io) {
throw new HoodieRollbackException("unable to rollback instants " + instantsToRollback, io);
}
}
private String startInstant() {
return HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
}
private Timer.Context startContext() {
return metrics.getRollbackCtx();
}
private List<HoodieRollbackStat> doRollbackAndGetStats(final String commitToRollback) throws IOException {
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
HoodieTimeline inflightCommitTimeline = table.getInflightCommitTimeline();
HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline();
// Check if any of the commits is a savepoint - do not allow rollback on those commits
List<String> savepoints = table.getCompletedSavepointTimeline().getInstants().map(HoodieInstant::getTimestamp)
.collect(Collectors.toList());
savepoints.stream().forEach(s -> {
if (s.contains(commitToRollback)) {
throw new HoodieRollbackException(
"Could not rollback a savepointed commit. Delete savepoint first before rolling back" + s);
}
});
if (commitTimeline.empty() && inflightCommitTimeline.empty()) {
// nothing to rollback
logger.info("No commits to rollback " + commitToRollback);
}
// Make sure only the last n commits are being rolled back
// If there is a commit in-between or after that is not rolled back, then abort
String lastCommit = commitToRollback;
if ((lastCommit != null) && !commitTimeline.empty()
&& !commitTimeline.findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) {
throw new HoodieRollbackException(
"Found commits after time :" + lastCommit + ", please rollback greater commits first");
}
List<String> inflights =
inflightCommitTimeline.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
if ((lastCommit != null) && !inflights.isEmpty() && (inflights.indexOf(lastCommit) != inflights.size() - 1)) {
throw new HoodieRollbackException(
"Found in-flight commits after time :" + lastCommit + ", please rollback greater commits first");
}
List<HoodieRollbackStat> stats = table.rollback(jsc, commitToRollback, true);
logger.info("Deleted inflight commits " + commitToRollback);
// cleanup index entries
if (!index.rollbackCommit(commitToRollback)) {
throw new HoodieRollbackException("Rollback index changes failed, for time :" + commitToRollback);
}
logger.info("Index rolled back for commits " + commitToRollback);
return stats;
}
private void finishRollback(final Timer.Context context, List<HoodieRollbackStat> rollbackStats,
List<String> commitsToRollback, final String startRollbackTime) throws IOException {
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
Option<Long> durationInMs = Option.empty();
Long numFilesDeleted = rollbackStats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size()).sum();
if (context != null) {
durationInMs = Option.of(metrics.getDurationInMs(context.stop()));
metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
}
HoodieRollbackMetadata rollbackMetadata =
AvroUtils.convertRollbackMetadata(startRollbackTime, durationInMs, commitsToRollback, rollbackStats);
table.getActiveTimeline().saveAsComplete(new HoodieInstant(true, HoodieTimeline.ROLLBACK_ACTION, startRollbackTime),
AvroUtils.serializeRollbackMetadata(rollbackMetadata));
logger.info("Commits " + commitsToRollback + " rollback is complete");
if (!table.getActiveTimeline().getCleanerTimeline().empty()) {
logger.info("Cleaning up older rollback meta files");
// Cleanup of older cleaner meta files
// TODO - make the commit archival generic and archive rollback metadata
FSUtils.deleteOlderRollbackMetaFiles(fs, table.getMetaClient().getMetaPath(),
table.getActiveTimeline().getRollbackTimeline().getInstants());
}
}
private void finishRestore(final Timer.Context context, Map<String, List<HoodieRollbackStat>> commitToStats,
List<String> commitsToRollback, final String startRestoreTime, final String restoreToInstant) throws IOException {
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
Option<Long> durationInMs = Option.empty();
Long numFilesDeleted = 0L;
for (Map.Entry<String, List<HoodieRollbackStat>> commitToStat : commitToStats.entrySet()) {
List<HoodieRollbackStat> stats = commitToStat.getValue();
numFilesDeleted = stats.stream().mapToLong(stat -> stat.getSuccessDeleteFiles().size()).sum();
}
if (context != null) {
durationInMs = Option.of(metrics.getDurationInMs(context.stop()));
metrics.updateRollbackMetrics(durationInMs.get(), numFilesDeleted);
}
HoodieRestoreMetadata restoreMetadata =
AvroUtils.convertRestoreMetadata(startRestoreTime, durationInMs, commitsToRollback, commitToStats);
table.getActiveTimeline().saveAsComplete(new HoodieInstant(true, HoodieTimeline.RESTORE_ACTION, startRestoreTime),
AvroUtils.serializeRestoreMetadata(restoreMetadata));
logger.info("Commits " + commitsToRollback + " rollback is complete. Restored dataset to " + restoreToInstant);
if (!table.getActiveTimeline().getCleanerTimeline().empty()) {
logger.info("Cleaning up older restore meta files");
// Cleanup of older cleaner meta files
// TODO - make the commit archival generic and archive rollback metadata
FSUtils.deleteOlderRollbackMetaFiles(fs, table.getMetaClient().getMetaPath(),
table.getActiveTimeline().getRestoreTimeline().getInstants());
}
}
private void rollbackInternal(String commitToRollback) {
if (commitToRollback.isEmpty()) {
logger.info("List of commits to rollback is empty");
return;
}
final String startRollbackTime = startInstant();
final Timer.Context context = startContext();
// Create a Hoodie table which encapsulated the commits and files visible
try {
List<HoodieRollbackStat> stats = doRollbackAndGetStats(commitToRollback);
Map<String, List<HoodieRollbackStat>> statToCommit = new HashMap<>();
finishRollback(context, stats, Arrays.asList(commitToRollback), startRollbackTime);
} catch (IOException e) {
throw new HoodieRollbackException("Failed to rollback " + config.getBasePath() + " commits " + commitToRollback,
e);
}
}
/**
* Releases any resources used by the client.
*/
public void close() {
// Stop timeline-server if running
super.close();
// Calling this here releases any resources used by your index, so make sure to finish any related operations
// before this point
this.index.close();
}
/**
* Clean up any stale/old files/data lying around (either on file storage or index storage) based on the
* configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be
* cleaned)
*/
public void clean() throws HoodieIOException {
String startCleanTime = HoodieActiveTimeline.createNewCommitTime();
clean(startCleanTime);
}
/**
* Clean up any stale/old files/data lying around (either on file storage or index storage) based on the
* configurations and CleaningPolicy used. (typically files that no longer can be used by a running query can be
* cleaned)
*/
private void clean(String startCleanTime) throws HoodieIOException {
try {
logger.info("Cleaner started");
final Timer.Context context = metrics.getCleanCtx();
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
List<HoodieCleanStat> cleanStats = table.clean(jsc);
if (cleanStats.isEmpty()) {
return;
}
// Emit metrics (duration, numFilesDeleted) if needed
Option<Long> durationInMs = Option.empty();
if (context != null) {
durationInMs = Option.of(metrics.getDurationInMs(context.stop()));
logger.info("cleanerElaspsedTime (Minutes): " + durationInMs.get() / (1000 * 60));
}
// Create the metadata and save it
HoodieCleanMetadata metadata = AvroUtils.convertCleanMetadata(startCleanTime, durationInMs, cleanStats);
logger.info("Cleaned " + metadata.getTotalFilesDeleted() + " files");
metrics.updateCleanMetrics(durationInMs.orElseGet(() -> -1L), metadata.getTotalFilesDeleted());
table.getActiveTimeline().saveAsComplete(new HoodieInstant(true, HoodieTimeline.CLEAN_ACTION, startCleanTime),
AvroUtils.serializeCleanMetadata(metadata));
logger.info("Marked clean started on " + startCleanTime + " as complete");
if (!table.getActiveTimeline().getCleanerTimeline().empty()) {
// Cleanup of older cleaner meta files
// TODO - make the commit archival generic and archive clean metadata
FSUtils.deleteOlderCleanMetaFiles(fs, table.getMetaClient().getMetaPath(),
table.getActiveTimeline().getCleanerTimeline().getInstants());
}
} catch (IOException e) {
throw new HoodieIOException("Failed to clean up after commit", e);
}
}
/**
* Provides a new commit time for a write operation (insert/update)
*/
public String startCommit() {
String commitTime = HoodieActiveTimeline.createNewCommitTime();
startCommitWithTime(commitTime);
return commitTime;
}
public void startCommitWithTime(String instantTime) {
if (rollbackInFlight) {
// Only rollback inflight commit/delta-commits. Do not touch compaction commits
rollbackInflightCommits();
}
logger.info("Generate a new instant time " + instantTime);
HoodieTableMetaClient metaClient = createMetaClient(true);
// if there are pending compactions, their instantTime must not be greater than that of this instant time
metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().ifPresent(latestPending -> {
Preconditions.checkArgument(
HoodieTimeline.compareTimestamps(latestPending.getTimestamp(), instantTime, HoodieTimeline.LESSER),
"Latest pending compaction instant time must be earlier " + "than this instant time. Latest Compaction :"
+ latestPending + ", Ingesting at " + instantTime);
});
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config, jsc);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
String commitActionType = table.getMetaClient().getCommitActionType();
activeTimeline.createInflight(new HoodieInstant(true, commitActionType, instantTime));
}
/**
* Schedules a new compaction instant
*/
public Option<String> scheduleCompaction(Option<Map<String, String>> extraMetadata) throws IOException {
String instantTime = HoodieActiveTimeline.createNewCommitTime();
logger.info("Generate a new instant time " + instantTime);
boolean notEmpty = scheduleCompactionAtInstant(instantTime, extraMetadata);
return notEmpty ? Option.of(instantTime) : Option.empty();
}
/**
* Schedules a new compaction instant with passed-in instant time
*
* @param instantTime Compaction Instant Time
* @param extraMetadata Extra Metadata to be stored
*/
public boolean scheduleCompactionAtInstant(String instantTime, Option<Map<String, String>> extraMetadata)
throws IOException {
HoodieTableMetaClient metaClient = createMetaClient(true);
// if there are inflight writes, their instantTime must not be less than that of compaction instant time
metaClient.getCommitsTimeline().filterInflightsExcludingCompaction().firstInstant().ifPresent(earliestInflight -> {
Preconditions.checkArgument(
HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), instantTime, HoodieTimeline.GREATER),
"Earliest write inflight instant time must be later " + "than compaction time. Earliest :" + earliestInflight
+ ", Compaction scheduled at " + instantTime);
});
// Committed and pending compaction instants should have strictly lower timestamps
List<HoodieInstant> conflictingInstants = metaClient
.getActiveTimeline().getCommitsAndCompactionTimeline().getInstants().filter(instant -> HoodieTimeline
.compareTimestamps(instant.getTimestamp(), instantTime, HoodieTimeline.GREATER_OR_EQUAL))
.collect(Collectors.toList());
Preconditions.checkArgument(conflictingInstants.isEmpty(),
"Following instants have timestamps >= compactionInstant (" + instantTime + ") Instants :"
+ conflictingInstants);
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config, jsc);
HoodieCompactionPlan workload = table.scheduleCompaction(jsc, instantTime);
if (workload != null && (workload.getOperations() != null) && (!workload.getOperations().isEmpty())) {
extraMetadata.ifPresent(workload::setExtraMetadata);
HoodieInstant compactionInstant =
new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime);
metaClient.getActiveTimeline().saveToCompactionRequested(compactionInstant,
AvroUtils.serializeCompactionPlan(workload));
return true;
}
return false;
}
/**
* Performs Compaction for the workload stored in instant-time
*
* @param compactionInstantTime Compaction Instant Time
*/
public JavaRDD<WriteStatus> compact(String compactionInstantTime) throws IOException {
return compact(compactionInstantTime, config.shouldAutoCommit());
}
/**
* Commit a compaction operation. Allow passing additional meta-data to be stored in commit instant file.
*/
public void commitCompaction(String compactionInstantTime, JavaRDD<WriteStatus> writeStatuses,
Option<Map<String, String>> extraMetadata) throws IOException {
HoodieTableMetaClient metaClient = createMetaClient(true);
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config, jsc);
HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(
timeline.getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get());
// Merge extra meta-data passed by user with the one already in inflight compaction
Option<Map<String, String>> mergedMetaData = extraMetadata.map(m -> {
Map<String, String> merged = new HashMap<>();
Map<String, String> extraMetaDataFromInstantFile = compactionPlan.getExtraMetadata();
if (extraMetaDataFromInstantFile != null) {
merged.putAll(extraMetaDataFromInstantFile);
}
// Overwrite/Merge with the user-passed meta-data
merged.putAll(m);
return Option.of(merged);
}).orElseGet(() -> Option.ofNullable(compactionPlan.getExtraMetadata()));
commitCompaction(writeStatuses, table, compactionInstantTime, true, mergedMetaData);
}
/**
* Deduplicate Hoodie records, using the given deduplication funciton.
*/
JavaRDD<HoodieRecord<T>> deduplicateRecords(JavaRDD<HoodieRecord<T>> records, int parallelism) {
boolean isIndexingGlobal = index.isGlobal();
return records.mapToPair(record -> {
HoodieKey hoodieKey = record.getKey();
// If index used is global, then records are expected to differ in their partitionPath
Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
return new Tuple2<>(key, record);
}).reduceByKey((rec1, rec2) -> {
@SuppressWarnings("unchecked")
T reducedData = (T) rec1.getData().preCombine(rec2.getData());
// we cannot allow the user to change the key or partitionPath, since that will affect
// everything
// so pick it from one of the records.
return new HoodieRecord<T>(rec1.getKey(), reducedData);
}, parallelism).map(Tuple2::_2);
}
/**
* Cleanup all inflight commits
*/
private void rollbackInflightCommits() {
HoodieTable<T> table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterInflightsExcludingCompaction();
List<String> commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
Collections.reverse(commits);
for (String commit : commits) {
rollback(commit);
}
}
private HoodieTable getTableAndInitCtx(JavaRDD<HoodieRecord<T>> records) {
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
if (table.getMetaClient().getCommitActionType().equals(HoodieTimeline.COMMIT_ACTION)) {
writeContext = metrics.getCommitCtx();
} else {
writeContext = metrics.getDeltaCommitCtx();
}
return table;
}
/**
* Compaction specific private methods
*/
/**
* Ensures compaction instant is in expected state and performs Compaction for the workload stored in instant-time
*
* @param compactionInstantTime Compaction Instant Time
*/
private JavaRDD<WriteStatus> compact(String compactionInstantTime, boolean autoCommit) throws IOException {
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTableMetaClient metaClient = createMetaClient(true);
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config, jsc);
HoodieTimeline pendingCompactionTimeline = metaClient.getActiveTimeline().filterPendingCompactionTimeline();
HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime);
if (pendingCompactionTimeline.containsInstant(inflightInstant)) {
// inflight compaction - Needs to rollback first deleting new parquet files before we run compaction.
rollbackInflightCompaction(inflightInstant, table);
// refresh table
metaClient = createMetaClient(true);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
pendingCompactionTimeline = metaClient.getActiveTimeline().filterPendingCompactionTimeline();
}
HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
if (pendingCompactionTimeline.containsInstant(instant)) {
return runCompaction(instant, metaClient.getActiveTimeline(), autoCommit);
} else {
throw new IllegalStateException(
"No Compaction request available at " + compactionInstantTime + " to run compaction");
}
}
/**
* Perform compaction operations as specified in the compaction commit file
*
* @param compactionInstant Compacton Instant time
* @param activeTimeline Active Timeline
* @param autoCommit Commit after compaction
* @return RDD of Write Status
*/
private JavaRDD<WriteStatus> runCompaction(HoodieInstant compactionInstant, HoodieActiveTimeline activeTimeline,
boolean autoCommit) throws IOException {
HoodieCompactionPlan compactionPlan =
AvroUtils.deserializeCompactionPlan(activeTimeline.getInstantAuxiliaryDetails(compactionInstant).get());
// Mark instant as compaction inflight
activeTimeline.transitionCompactionRequestedToInflight(compactionInstant);
compactionTimer = metrics.getCompactionCtx();
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTableMetaClient metaClient = createMetaClient(true);
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config, jsc);
JavaRDD<WriteStatus> statuses = table.compact(jsc, compactionInstant.getTimestamp(), compactionPlan);
// Force compaction action
statuses.persist(config.getWriteStatusStorageLevel());
// pass extra-metada so that it gets stored in commit file automatically
commitCompaction(statuses, table, compactionInstant.getTimestamp(), autoCommit,
Option.ofNullable(compactionPlan.getExtraMetadata()));
return statuses;
}
/**
* Commit Compaction and track metrics
*
* @param compactedStatuses Compaction Write status
* @param table Hoodie Table
* @param compactionCommitTime Compaction Commit Time
* @param autoCommit Auto Commit
* @param extraMetadata Extra Metadata to store
*/
protected void commitCompaction(JavaRDD<WriteStatus> compactedStatuses, HoodieTable<T> table,
String compactionCommitTime, boolean autoCommit, Option<Map<String, String>> extraMetadata) {
if (autoCommit) {
HoodieCommitMetadata metadata = doCompactionCommit(table, compactedStatuses, compactionCommitTime, extraMetadata);
if (compactionTimer != null) {
long durationInMs = metrics.getDurationInMs(compactionTimer.stop());
try {
metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(compactionCommitTime).getTime(),
durationInMs, metadata, HoodieActiveTimeline.COMPACTION_ACTION);
} catch (ParseException e) {
throw new HoodieCommitException("Commit time is not of valid format.Failed to commit compaction "
+ config.getBasePath() + " at time " + compactionCommitTime, e);
}
}
logger.info("Compacted successfully on commit " + compactionCommitTime);
} else {
logger.info("Compaction did not run for commit " + compactionCommitTime);
}
}
private void finalizeWrite(HoodieTable<T> table, String instantTime, List<HoodieWriteStat> stats) {
try {
final Timer.Context finalizeCtx = metrics.getFinalizeCtx();
table.finalizeWrite(jsc, instantTime, stats);
if (finalizeCtx != null) {
Option<Long> durationInMs = Option.of(metrics.getDurationInMs(finalizeCtx.stop()));
durationInMs.ifPresent(duration -> {
logger.info("Finalize write elapsed time (milliseconds): " + duration);
metrics.updateFinalizeWriteMetrics(duration, stats.size());
});
}
} catch (HoodieIOException ioe) {
throw new HoodieCommitException("Failed to complete commit " + instantTime + " due to finalize errors.", ioe);
}
}
/**
* Rollback failed compactions. Inflight rollbacks for compactions revert the .inflight file to the .requested file
*
* @param inflightInstant Inflight Compaction Instant
* @param table Hoodie Table
*/
@VisibleForTesting
void rollbackInflightCompaction(HoodieInstant inflightInstant, HoodieTable table) throws IOException {
table.rollback(jsc, inflightInstant.getTimestamp(), false);
// Revert instant state file
table.getActiveTimeline().revertCompactionInflightToRequested(inflightInstant);
}
private HoodieCommitMetadata doCompactionCommit(HoodieTable<T> table, JavaRDD<WriteStatus> writeStatuses,
String compactionCommitTime, Option<Map<String, String>> extraMetadata) {
HoodieTableMetaClient metaClient = table.getMetaClient();
List<HoodieWriteStat> updateStatusMap = writeStatuses.map(WriteStatus::getStat).collect();
HoodieCommitMetadata metadata = new HoodieCommitMetadata(true);
for (HoodieWriteStat stat : updateStatusMap) {
metadata.addWriteStat(stat.getPartitionPath(), stat);
}
// Finalize write
finalizeWrite(table, compactionCommitTime, updateStatusMap);
// Copy extraMetadata
extraMetadata.ifPresent(m -> {
m.entrySet().stream().forEach(e -> {
metadata.addMetadata(e.getKey(), e.getValue());
});
});
logger.info("Committing Compaction " + compactionCommitTime + ". Finished with result " + metadata);
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
try {
activeTimeline.transitionCompactionInflightToComplete(
new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionCommitTime),
Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} catch (IOException e) {
throw new HoodieCompactionException(
"Failed to commit " + metaClient.getBasePath() + " at time " + compactionCommitTime, e);
}
return metadata;
}
/**
* Performs a compaction operation on a dataset, serially before or after an insert/upsert action.
*/
private Option<String> forceCompact(Option<Map<String, String>> extraMetadata) throws IOException {
Option<String> compactionInstantTimeOpt = scheduleCompaction(extraMetadata);
compactionInstantTimeOpt.ifPresent(compactionInstantTime -> {
try {
// inline compaction should auto commit as the user is never given control
compact(compactionInstantTime, true);
} catch (IOException ioe) {
throw new HoodieIOException(ioe.getMessage(), ioe);
}
});
return compactionInstantTimeOpt;
}
private void updateMetadataAndRollingStats(String actionType, HoodieCommitMetadata metadata,
List<HoodieWriteStat> writeStats) {
// TODO : make sure we cannot rollback / archive last commit file
try {
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable table = HoodieTable.getHoodieTable(createMetaClient(true), config, jsc);
// 0. All of the rolling stat management is only done by the DELTA commit for MOR and COMMIT for COW other wise
// there may be race conditions
HoodieRollingStatMetadata rollingStatMetadata = new HoodieRollingStatMetadata(actionType);
// 1. Look up the previous compaction/commit and get the HoodieCommitMetadata from there.
// 2. Now, first read the existing rolling stats and merge with the result of current metadata.
// Need to do this on every commit (delta or commit) to support COW and MOR.
for (HoodieWriteStat stat : writeStats) {
String partitionPath = stat.getPartitionPath();
// TODO: why is stat.getPartitionPath() null at times here.
metadata.addWriteStat(partitionPath, stat);
HoodieRollingStat hoodieRollingStat = new HoodieRollingStat(stat.getFileId(),
stat.getNumWrites() - (stat.getNumUpdateWrites() - stat.getNumDeletes()), stat.getNumUpdateWrites(),
stat.getNumDeletes(), stat.getTotalWriteBytes());
rollingStatMetadata.addRollingStat(partitionPath, hoodieRollingStat);
}
// The last rolling stat should be present in the completed timeline
Option<HoodieInstant> lastInstant =
table.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant();
if (lastInstant.isPresent()) {
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
table.getActiveTimeline().getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class);
Option<String> lastRollingStat = Option
.ofNullable(commitMetadata.getExtraMetadata().get(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY));
if (lastRollingStat.isPresent()) {
rollingStatMetadata = rollingStatMetadata
.merge(HoodieCommitMetadata.fromBytes(lastRollingStat.get().getBytes(), HoodieRollingStatMetadata.class));
}
}
metadata.addMetadata(HoodieRollingStatMetadata.ROLLING_STAT_METADATA_KEY, rollingStatMetadata.toJsonString());
} catch (IOException io) {
throw new HoodieCommitException("Unable to save rolling stats");
}
}
}