1
0

Async Compaction Main API changes

This commit is contained in:
Balaji Varadarajan
2018-05-23 23:09:25 -07:00
committed by vinoth chandar
parent 9b78523d62
commit 2f8ce93030
18 changed files with 878 additions and 267 deletions

View File

@@ -17,11 +17,13 @@
package com.uber.hoodie;
import com.google.common.base.Optional;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.CompactionUtils;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
@@ -32,6 +34,8 @@ import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -183,4 +187,17 @@ public class HoodieReadClient<T extends HoodieRecordPayload> implements Serializ
throws HoodieIndexException {
return index.tagLocation(hoodieRecords, jsc, hoodieTable);
}
/**
 * Fetch every compaction that is currently pending on the timeline, paired with its instant
 * time, so that clients can decide what to compact next.
 *
 * @return list of (compaction instant time, compaction plan) pairs, one per pending compaction
 */
public List<Pair<String, HoodieCompactionPlan>> getPendingCompactions() {
  // Build a fresh meta client so we see the latest timeline state
  // (third ctor arg presumably loads the active timeline eagerly — confirm)
  HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
      hoodieTable.getMetaClient().getBasePath(), true);
  return CompactionUtils.getAllPendingCompactionPlans(metaClient).stream()
      .map(pending -> Pair.of(pending.getKey().getTimestamp(), pending.getValue()))
      .collect(Collectors.toList());
}
}

View File

@@ -19,8 +19,10 @@ package com.uber.hoodie;
import com.codahale.metrics.Timer;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.avro.model.HoodieRollbackMetadata;
import com.uber.hoodie.avro.model.HoodieSavepointMetadata;
import com.uber.hoodie.common.HoodieCleanStat;
@@ -97,6 +99,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
private final transient HoodieMetrics metrics;
private final transient HoodieIndex<T> index;
private transient Timer.Context writeContext = null;
private transient Timer.Context compactionTimer;
/**
* @param jsc
@@ -478,13 +481,13 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Commit changes performed at the given commitTime marker
*/
public boolean commit(String commitTime, JavaRDD<WriteStatus> writeStatuses,
Optional<HashMap<String, String>> extraMetadata) {
Optional<Map<String, String>> extraMetadata) {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true);
return commit(commitTime, writeStatuses, extraMetadata, metaClient.getCommitActionType());
}
private boolean commit(String commitTime, JavaRDD<WriteStatus> writeStatuses,
Optional<HashMap<String, String>> extraMetadata, String actionType) {
Optional<Map<String, String>> extraMetadata, String actionType) {
logger.info("Commiting " + commitTime);
// Create a Hoodie table which encapsulated the commits and files visible
@@ -525,7 +528,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
// Do a inline compaction if enabled
if (config.isInlineCompaction()) {
metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT_PROP, "true");
forceCompact();
forceCompact(extraMetadata);
} else {
metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT_PROP, "false");
}
@@ -685,7 +688,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Delete a compaction request that is pending.
*
* NOTE - This is an Admin operation.
* With async compaction, this is expected to be called with async compaction and ingestion shutdown.
* With async compaction, this is expected to be called with async compaction and write shutdown.
* Otherwise, async compactor could fail with errors
*
* @param compactionTime - delete the compaction time
@@ -729,7 +732,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config, jsc);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
// Rollback to savepoint is expected to be a manual operation and no concurrent ingestion or compaction is expected
// Rollback to savepoint is expected to be a manual operation and no concurrent write or compaction is expected
// to be running. Rollback to savepoint also removes any pending compaction actions that are generated after
// savepoint time. Allowing pending compaction to be retained is not safe as those workload could be referencing
// file-slices that will be rolled-back as part of this operation
@@ -811,8 +814,12 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
// Make sure only the last n commits are being rolled back
// If there is a commit in-between or after that is not rolled back, then abort
String lastCommit = commitsToRollback.get(commitsToRollback.size() - 1);
if (!commitTimeline.empty() && !commitTimeline
String lastCommit = null;
if (!commitsToRollback.isEmpty()) {
lastCommit = commitsToRollback.get(commitsToRollback.size() - 1);
}
if ((lastCommit != null) && !commitTimeline.empty() && !commitTimeline
.findInstantsAfter(lastCommit, Integer.MAX_VALUE).empty()) {
throw new HoodieRollbackException(
"Found commits after time :" + lastCommit + ", please rollback greater commits first");
@@ -820,7 +827,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
List<String> inflights = inflightCommitTimeline.getInstants().map(HoodieInstant::getTimestamp)
.collect(Collectors.toList());
if (!inflights.isEmpty() && inflights.indexOf(lastCommit) != inflights.size() - 1) {
if ((lastCommit != null) && !inflights.isEmpty() && (inflights.indexOf(lastCommit) != inflights.size() - 1)) {
throw new HoodieRollbackException("Found in-flight commits after time :" + lastCommit
+ ", please rollback greater commits first");
}
@@ -940,141 +947,109 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
return commitTime;
}
public void startCommitWithTime(String commitTime) {
public void startCommitWithTime(String instantTime) {
if (rollbackInFlight) {
// Only rollback inflight commit/delta-commits. Do not touch compaction commits
rollbackInflightCommits();
}
logger.info("Generate a new commit time " + commitTime);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config, jsc);
logger.info("Generate a new instant time " + instantTime);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath());
// if there are pending compactions, their instantTime must not be greater than that of this instant time
metaClient.getActiveTimeline().filterPendingCompactionTimeline().lastInstant().ifPresent(latestPending -> {
Preconditions.checkArgument(
HoodieTimeline.compareTimestamps(latestPending.getTimestamp(), instantTime, HoodieTimeline.LESSER),
"Latest pending compaction instant time must be earlier "
+ "than this instant time. Latest Compaction :" + latestPending + ", Ingesting at " + instantTime);
});
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config, jsc);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
String commitActionType = table.getMetaClient().getCommitActionType();
activeTimeline.createInflight(new HoodieInstant(true, commitActionType, commitTime));
activeTimeline.createInflight(new HoodieInstant(true, commitActionType, instantTime));
}
/**
* Schedules a new compaction instant
* @param extraMetadata
* @return
*/
public Optional<String> scheduleCompaction(Optional<Map<String, String>> extraMetadata) throws IOException {
String instantTime = HoodieActiveTimeline.createNewCommitTime();
logger.info("Generate a new instant time " + instantTime);
boolean notEmpty = scheduleCompactionAtInstant(instantTime, extraMetadata);
return notEmpty ? Optional.of(instantTime) : Optional.empty();
}
/**
* Provides a new commit time for a compaction (commit) operation
* Schedules a new compaction instant with passed-in instant time
* @param instantTime Compaction Instant Time
* @param extraMetadata Extra Metadata to be stored
*/
public String startCompaction() {
String commitTime = HoodieActiveTimeline.createNewCommitTime();
logger.info("Generate a new commit time " + commitTime);
startCompactionWithTime(commitTime);
return commitTime;
public boolean scheduleCompactionAtInstant(String instantTime, Optional<Map<String, String>> extraMetadata)
throws IOException {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
config.getBasePath(), true);
// if there are inflight writes, their instantTime must not be less than that of compaction instant time
metaClient.getActiveTimeline().filterInflightsExcludingCompaction().firstInstant().ifPresent(earliestInflight -> {
Preconditions.checkArgument(
HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), instantTime, HoodieTimeline.GREATER),
"Earliest write inflight instant time must be later "
+ "than compaction time. Earliest :" + earliestInflight + ", Compaction scheduled at " + instantTime);
});
// Committed and pending compaction instants should have strictly lower timestamps
List<HoodieInstant> conflictingInstants =
metaClient.getActiveTimeline().getCommitsAndCompactionTimeline().getInstants().filter(instant ->
HoodieTimeline.compareTimestamps(instant.getTimestamp(), instantTime,
HoodieTimeline.GREATER_OR_EQUAL)).collect(Collectors.toList());
Preconditions.checkArgument(conflictingInstants.isEmpty(),
"Following instants have timestamps >= compactionInstant. Instants :"
+ conflictingInstants);
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config, jsc);
HoodieCompactionPlan workload = table.scheduleCompaction(jsc, instantTime);
if (workload != null && (workload.getOperations() != null) && (!workload.getOperations().isEmpty())) {
extraMetadata.ifPresent(workload::setExtraMetadata);
HoodieInstant compactionInstant =
new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime);
metaClient.getActiveTimeline().saveToCompactionRequested(compactionInstant,
AvroUtils.serializeCompactionPlan(workload));
return true;
}
return false;
}
/**
* Since MOR tableType default to {@link HoodieTimeline#DELTA_COMMIT_ACTION}, we need to
* explicitly set to {@link HoodieTimeline#COMMIT_ACTION} for compaction
* Performs Compaction for the workload stored in instant-time
* @param compactionInstantTime Compaction Instant Time
* @return
* @throws IOException
*/
public void startCompactionWithTime(String commitTime) {
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config, jsc);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
String commitActionType = HoodieTimeline.COMMIT_ACTION;
activeTimeline.createInflight(new HoodieInstant(true, commitActionType, commitTime));
public JavaRDD<WriteStatus> compact(String compactionInstantTime) throws IOException {
return compact(compactionInstantTime, config.shouldAutoCommit());
}
/**
* Performs a compaction operation on a dataset. WARNING: Compaction operation cannot be executed
* asynchronously. Please always use this serially before or after an insert/upsert action.
* Commit a compaction operation. Allow passing additional meta-data to be stored in commit instant file.
*/
public JavaRDD<WriteStatus> compact(String commitTime) throws IOException {
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config, jsc);
// TODO : Fix table.getActionType for MOR table type to return different actions based on delta or compaction
writeContext = metrics.getCommitCtx();
JavaRDD<WriteStatus> statuses = table.compact(jsc, commitTime);
// Trigger the insert and collect statuses
statuses = statuses.persist(config.getWriteStatusStorageLevel());
String actionType = HoodieActiveTimeline.COMMIT_ACTION;
commitOnAutoCommit(commitTime, statuses, actionType);
return statuses;
}
/**
* Commit a compaction operation
*/
public void commitCompaction(String commitTime, JavaRDD<WriteStatus> writeStatuses,
Optional<HashMap<String, String>> extraMetadata) {
String commitCompactionActionType = HoodieActiveTimeline.COMMIT_ACTION;
commit(commitTime, writeStatuses, extraMetadata, commitCompactionActionType);
}
/**
* Commit a compaction operation
*/
public void commitCompaction(String commitTime, JavaRDD<WriteStatus> writeStatuses) {
String commitCompactionActionType = HoodieActiveTimeline.COMMIT_ACTION;
commit(commitTime, writeStatuses, Optional.empty(), commitCompactionActionType);
}
/**
* Performs a compaction operation on a dataset. WARNING: Compaction operation cannot be executed
* asynchronously. Please always use this serially before or after an insert/upsert action.
*/
private void forceCompact(String compactionCommitTime) throws IOException {
// Create a Hoodie table which encapsulated the commits and files visible
public void commitCompaction(String compactionInstantTime, JavaRDD<WriteStatus> writeStatuses,
Optional<Map<String, String>> extraMetadata) throws IOException {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
config.getBasePath(), true);
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config, jsc);
// TODO : Fix table.getActionType for MOR table type to return different actions based on delta or compaction and
// then use getTableAndInitCtx
Timer.Context writeContext = metrics.getCommitCtx();
JavaRDD<WriteStatus> compactedStatuses = table.compact(jsc, compactionCommitTime);
if (!compactedStatuses.isEmpty()) {
HoodieCommitMetadata metadata = commitForceCompaction(compactedStatuses, metaClient, compactionCommitTime);
long durationInMs = metrics.getDurationInMs(writeContext.stop());
try {
metrics
.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(compactionCommitTime).getTime(),
durationInMs, metadata, HoodieActiveTimeline.COMMIT_ACTION);
} catch (ParseException e) {
throw new HoodieCommitException(
"Commit time is not of valid format.Failed to commit " + config.getBasePath()
+ " at time " + compactionCommitTime, e);
HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(
timeline.getInstantAuxiliaryDetails(HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime)).get());
// Merge extra meta-data passed by user with the one already in inflight compaction
Optional<Map<String, String>> mergedMetaData = extraMetadata.map(m -> {
Map<String, String> merged = new HashMap<>();
Map<String, String> extraMetaDataFromInstantFile = compactionPlan.getExtraMetadata();
if (extraMetaDataFromInstantFile != null) {
merged.putAll(extraMetaDataFromInstantFile);
}
logger.info("Compacted successfully on commit " + compactionCommitTime);
} else {
logger.info("Compaction did not run for commit " + compactionCommitTime);
}
}
/**
* Performs a compaction operation on a dataset. WARNING: Compaction operation cannot be executed
* asynchronously. Please always use this serially before or after an insert/upsert action.
*/
private String forceCompact() throws IOException {
String compactionCommitTime = startCompaction();
forceCompact(compactionCommitTime);
return compactionCommitTime;
}
private HoodieCommitMetadata commitForceCompaction(JavaRDD<WriteStatus> writeStatuses,
HoodieTableMetaClient metaClient, String compactionCommitTime) {
List<HoodieWriteStat> updateStatusMap = writeStatuses.map(writeStatus -> writeStatus.getStat())
.collect();
HoodieCommitMetadata metadata = new HoodieCommitMetadata(true);
for (HoodieWriteStat stat : updateStatusMap) {
metadata.addWriteStat(stat.getPartitionPath(), stat);
}
logger.info("Compaction finished with result " + metadata);
logger.info("Committing Compaction " + compactionCommitTime);
HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
try {
activeTimeline.saveAsComplete(
new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, compactionCommitTime),
Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} catch (IOException e) {
throw new HoodieCompactionException(
"Failed to commit " + metaClient.getBasePath() + " at time " + compactionCommitTime, e);
}
return metadata;
// Overwrite/Merge with the user-passed meta-data
merged.putAll(m);
return Optional.of(merged);
}).orElseGet(() -> Optional.ofNullable(compactionPlan.getExtraMetadata()));
commitCompaction(writeStatuses, table, compactionInstantTime, true, mergedMetaData);
}
/**
@@ -1127,4 +1102,158 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
}
return table;
}
/**
* Compaction specific private methods
*/
/**
 * Ensures the compaction instant is in the expected state, then performs compaction for the
 * plan stored (at scheduling time) under that instant.
 *
 * If a previous attempt left the instant INFLIGHT, that partial attempt is rolled back first
 * (deleting its partially-written files) and the instant reverted to REQUESTED before re-running.
 *
 * @param compactionInstantTime Compaction Instant Time
 * @param autoCommit when true, the compaction result is committed as soon as it completes
 * @return RDD of write statuses produced by compacting the stored plan
 * @throws IOException on errors reading timeline/plan files
 * @throws IllegalStateException when no compaction request exists at the given instant
 */
private JavaRDD<WriteStatus> compact(String compactionInstantTime, boolean autoCommit) throws IOException {
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
config.getBasePath(), true);
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config, jsc);
HoodieTimeline pendingCompactionTimeline = metaClient.getActiveTimeline().filterPendingCompactionTimeline();
HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime);
if (pendingCompactionTimeline.containsInstant(inflightInstant)) {
// Inflight compaction found - must roll it back first, deleting any new parquet files it
// wrote, before we can safely re-run the compaction from the REQUESTED state.
rollbackInflightCompaction(inflightInstant, table);
// refresh meta client / table / timeline, since the rollback mutated the timeline
// NOTE(review): the refreshed `table` is never read after this point (runCompaction builds
// its own); only the refreshed metaClient/pendingCompactionTimeline are used below.
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true);
table = HoodieTable.getHoodieTable(metaClient, config, jsc);
pendingCompactionTimeline = metaClient.getActiveTimeline().filterPendingCompactionTimeline();
}
HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
if (pendingCompactionTimeline.containsInstant(instant)) {
return runCompaction(instant, metaClient.getActiveTimeline(), autoCommit);
} else {
throw new IllegalStateException("No Compaction request available at " + compactionInstantTime
+ " to run compaction");
}
}
/**
 * Perform compaction operations as specified in the compaction plan stored for the instant.
 *
 * Side effects: transitions the instant from REQUESTED to INFLIGHT on the timeline, and starts
 * the shared {@code compactionTimer} used later by {@code commitCompaction} for metrics.
 *
 * @param compactionInstant Compaction Instant time (must be in REQUESTED state)
 * @param activeTimeline Active Timeline
 * @param autoCommit Commit after compaction
 * @return RDD of Write Status
 */
private JavaRDD<WriteStatus> runCompaction(
HoodieInstant compactionInstant, HoodieActiveTimeline activeTimeline, boolean autoCommit) throws IOException {
HoodieCompactionPlan compactionPlan = AvroUtils.deserializeCompactionPlan(
activeTimeline.getInstantAuxiliaryDetails(compactionInstant).get());
// Mark instant as compaction inflight
activeTimeline.transitionCompactionRequestedToInflight(compactionInstant);
compactionTimer = metrics.getCompactionCtx();
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
config.getBasePath(), true);
HoodieTable<T> table = HoodieTable.getHoodieTable(metaClient, config, jsc);
JavaRDD<WriteStatus> statuses = table.compact(jsc, compactionInstant.getTimestamp(), compactionPlan);
// Persist the statuses so the RDD is not recomputed when both committed and returned
statuses.persist(config.getWriteStatusStorageLevel());
// pass extra-metadata (from the stored plan) so that it gets stored in commit file automatically
commitCompaction(statuses, table, compactionInstant.getTimestamp(), autoCommit,
Optional.ofNullable(compactionPlan.getExtraMetadata()));
return statuses;
}
/**
 * Commit Compaction and track metrics.
 *
 * When autoCommit is false this is a no-op (the caller is expected to commit explicitly later
 * via the public commitCompaction overload).
 *
 * @param compactedStatuses Compaction Write status
 * @param table Hoodie Table
 * @param compactionCommitTime Compaction Commit Time
 * @param autoCommit Auto Commit
 * @param extraMetadata Extra Metadata to store
 */
protected void commitCompaction(JavaRDD<WriteStatus> compactedStatuses, HoodieTable<T> table,
    String compactionCommitTime, boolean autoCommit, Optional<Map<String, String>> extraMetadata) {
  if (autoCommit) {
    HoodieCommitMetadata metadata =
        doCompactionCommit(compactedStatuses, table.getMetaClient(), compactionCommitTime, extraMetadata);
    // compactionTimer is started in runCompaction(); it is null when metrics are disabled
    if (compactionTimer != null) {
      long durationInMs = metrics.getDurationInMs(compactionTimer.stop());
      try {
        metrics.updateCommitMetrics(HoodieActiveTimeline.COMMIT_FORMATTER.parse(compactionCommitTime).getTime(),
            durationInMs, metadata, HoodieActiveTimeline.COMPACTION_ACTION);
      } catch (ParseException e) {
        throw new HoodieCommitException(
            "Commit time is not of valid format.Failed to commit compaction " + config.getBasePath()
                + " at time " + compactionCommitTime, e);
      }
    }
    logger.info("Compacted successfully on commit " + compactionCommitTime);
  } else {
    // Fix: the old message ("Compaction did not run") was wrong — compaction HAS run by this
    // point; only the commit step is being deferred because auto-commit is off.
    logger.info("Compaction ran but auto-commit is disabled; not committing " + compactionCommitTime);
  }
}
/**
 * Rollback a partially-executed (inflight) compaction.
 *
 * Deletes any files written by the failed/ongoing compaction attempt, then reverts the
 * compaction instant from INFLIGHT back to REQUESTED so it can be re-attempted.
 *
 * @param inflightInstant Inflight Compaction Instant
 * @param table Hoodie Table
 */
private void rollbackInflightCompaction(HoodieInstant inflightInstant, HoodieTable table) throws IOException {
  // Idiom: ImmutableList.of(x) instead of ImmutableList.copyOf(new String[] { x })
  table.rollback(jsc, ImmutableList.of(inflightInstant.getTimestamp()));
  // Revert instant state file
  table.getActiveTimeline().revertCompactionInflightToRequested(inflightInstant);
}
/**
 * Build commit metadata from the compaction write statuses and transition the compaction
 * instant from INFLIGHT to COMPLETED on the active timeline.
 *
 * @param writeStatuses write statuses produced by the compaction run
 * @param metaClient meta client for the table being compacted
 * @param compactionCommitTime instant time of the compaction being committed
 * @param extraMetadata optional user metadata to record in the commit file
 * @return the commit metadata that was written
 * @throws HoodieCompactionException when persisting the completed instant fails
 */
private HoodieCommitMetadata doCompactionCommit(JavaRDD<WriteStatus> writeStatuses,
    HoodieTableMetaClient metaClient, String compactionCommitTime, Optional<Map<String, String>> extraMetadata) {
  // NOTE: collect() materializes all write stats on the driver
  List<HoodieWriteStat> updateStatusMap = writeStatuses.map(WriteStatus::getStat)
      .collect();
  HoodieCommitMetadata metadata = new HoodieCommitMetadata(true);
  for (HoodieWriteStat stat : updateStatusMap) {
    metadata.addWriteStat(stat.getPartitionPath(), stat);
  }
  // Copy extraMetadata into the commit metadata (idiom: Map.forEach over entrySet stream)
  extraMetadata.ifPresent(m -> m.forEach(metadata::addMetadata));
  logger.info("Compaction finished with result " + metadata);
  logger.info("Committing Compaction " + compactionCommitTime);
  HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
  try {
    activeTimeline.transitionCompactionInflightToComplete(
        new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionCommitTime),
        Optional.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
  } catch (IOException e) {
    throw new HoodieCompactionException(
        "Failed to commit " + metaClient.getBasePath() + " at time " + compactionCommitTime, e);
  }
  return metadata;
}
/**
 * Schedule a compaction (if there is anything to compact) and immediately execute it inline.
 * Used by the inline-compaction path after a commit.
 *
 * @param extraMetadata optional user metadata to attach to the scheduled compaction
 * @return the instant time of the compaction that was scheduled and run, or empty when the
 *         scheduler found no eligible work
 */
private Optional<String> forceCompact(Optional<Map<String, String>> extraMetadata) throws IOException {
  Optional<String> scheduledInstant = scheduleCompaction(extraMetadata);
  if (scheduledInstant.isPresent()) {
    try {
      compact(scheduledInstant.get());
    } catch (IOException ioe) {
      // Wrap so the lambda-free control flow still surfaces the failure as unchecked
      throw new HoodieIOException(ioe.getMessage(), ioe);
    }
  }
  return scheduledInstant;
}
}

View File

@@ -1,42 +0,0 @@
/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.uber.hoodie.io.compact;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
/**
 * Immutable value pairing a compaction instant time with the plan (workload) scheduled at
 * that instant.
 */
public class HoodieCompactionInstantWithPlan {
  private final String instantTime;
  private final HoodieCompactionPlan plan;
  public HoodieCompactionInstantWithPlan(String compactionInstantTime,
      HoodieCompactionPlan compactionPlan) {
    this.instantTime = compactionInstantTime;
    this.plan = compactionPlan;
  }
  /** @return the instant time this compaction was scheduled at */
  public String getCompactionInstantTime() {
    return instantTime;
  }
  /** @return the compaction plan (workload) stored for the instant */
  public HoodieCompactionPlan getCompactionPlan() {
    return plan;
  }
}

View File

@@ -22,6 +22,7 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.io.Serializable;
import java.util.Set;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
@@ -30,15 +31,6 @@ import org.apache.spark.api.java.JavaSparkContext;
*/
public interface HoodieCompactor extends Serializable {
/**
* Compact the delta files with the data files
*
* @deprecated : Will be removed in next PR
*/
@Deprecated
JavaRDD<WriteStatus> compact(JavaSparkContext jsc, final HoodieWriteConfig config,
HoodieTable hoodieTable, String compactionCommitTime) throws Exception;
/**
* Generate a new compaction plan for scheduling
*
@@ -50,7 +42,8 @@ public interface HoodieCompactor extends Serializable {
* @throws IOException when encountering errors
*/
HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc,
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime)
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime,
Set<String> fileIdsWithPendingCompactions)
throws IOException;
/**
@@ -58,5 +51,5 @@ public interface HoodieCompactor extends Serializable {
*/
JavaRDD<WriteStatus> compact(JavaSparkContext jsc,
HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config,
String compactionCommitTime) throws IOException;
String compactionInstantTime) throws IOException;
}

View File

@@ -45,6 +45,7 @@ import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.avro.Schema;
@@ -73,31 +74,22 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
// Accumulator to keep track of total log file slices for a dataset
private AccumulatorV2<Long, Long> totalFileSlices;
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieWriteConfig config,
HoodieTable hoodieTable, String compactionCommitTime) throws IOException {
HoodieCompactionPlan compactionPlan = generateCompactionPlan(jsc, hoodieTable, config,
compactionCommitTime);
List<HoodieCompactionOperation> operations = compactionPlan.getOperations();
if ((operations == null) || (operations.isEmpty())) {
return jsc.emptyRDD();
}
return compact(jsc, compactionPlan, hoodieTable, config, compactionCommitTime);
}
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc,
HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config,
String compactionCommitTime) throws IOException {
String compactionInstantTime) throws IOException {
if (compactionPlan == null || (compactionPlan.getOperations() == null)
|| (compactionPlan.getOperations().isEmpty())) {
return jsc.emptyRDD();
}
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
// Compacting is very similar to applying updates to existing file
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
List<CompactionOperation> operations = compactionPlan.getOperations().stream()
.map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
List<CompactionOperation> operations = compactionPlan.getOperations().stream().map(
CompactionOperation::convertFromAvroRecordInstance).collect(toList());
log.info("Compactor compacting " + operations + " files");
return jsc.parallelize(operations, operations.size())
.map(s -> compact(table, metaClient, config, s, compactionCommitTime))
.map(s -> compact(table, metaClient, config, s, compactionInstantTime))
.flatMap(writeStatusesItr -> writeStatusesItr.iterator());
}
@@ -164,8 +156,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
@Override
public HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc,
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime)
throws IOException {
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime,
Set<String> fileIdsWithPendingCompactions) throws IOException {
totalLogFiles = new LongAccumulator();
totalFileSlices = new LongAccumulator();
@@ -191,7 +183,9 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
List<HoodieCompactionOperation> operations =
jsc.parallelize(partitionPaths, partitionPaths.size())
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
.getLatestFileSlices(partitionPath).map(
.getLatestFileSlices(partitionPath)
.filter(slice -> !fileIdsWithPendingCompactions.contains(slice.getFileId()))
.map(
s -> {
List<HoodieLogFile> logFiles = s.getLogFiles().sorted(HoodieLogFile
.getBaseInstantAndLogVersionComparator().reversed()).collect(Collectors.toList());
@@ -211,13 +205,16 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
log.info("Total number of latest files slices " + totalFileSlices.value());
log.info("Total number of log files " + totalLogFiles.value());
log.info("Total number of file slices " + totalFileSlices.value());
// Filter the compactions with the passed in filter. This lets us choose most effective
// compactions only
// TODO: In subsequent PRs, pending Compaction plans will be wired in. Strategy can look at pending compaction
// plans to schedule next compaction plan
HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations,
CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
Preconditions.checkArgument(compactionPlan.getOperations().stream()
.filter(op -> fileIdsWithPendingCompactions.contains(op.getFileId())).count() == 0,
"Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. "
+ "Please fix your strategy implementation."
+ "FileIdsWithPendingCompactions :" + fileIdsWithPendingCompactions
+ ", Selected workload :" + compactionPlan);
if (compactionPlan.getOperations().isEmpty()) {
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
}

View File

@@ -38,6 +38,7 @@ public class HoodieMetrics {
public String commitTimerName = null;
public String deltaCommitTimerName = null;
public String finalizeTimerName = null;
public String compactionTimerName = null;
private HoodieWriteConfig config = null;
private String tableName = null;
private Timer rollbackTimer = null;
@@ -45,6 +46,7 @@ public class HoodieMetrics {
private Timer commitTimer = null;
private Timer deltaCommitTimer = null;
private Timer finalizeTimer = null;
private Timer compactionTimer = null;
public HoodieMetrics(HoodieWriteConfig config, String tableName) {
this.config = config;
@@ -56,6 +58,7 @@ public class HoodieMetrics {
this.commitTimerName = getMetricsName("timer", HoodieTimeline.COMMIT_ACTION);
this.deltaCommitTimerName = getMetricsName("timer", HoodieTimeline.DELTA_COMMIT_ACTION);
this.finalizeTimerName = getMetricsName("timer", "finalize");
this.compactionTimerName = getMetricsName("timer", HoodieTimeline.COMPACTION_ACTION);
}
}
@@ -70,6 +73,13 @@ public class HoodieMetrics {
return rollbackTimer == null ? null : rollbackTimer.time();
}
/**
 * Returns a started {@link Timer.Context} for measuring a compaction run, or
 * {@code null} when metrics are disabled.
 *
 * <p>The timer is created lazily on first use, mirroring the other
 * {@code get*Ctx()} accessors in this class.
 */
public Timer.Context getCompactionCtx() {
  if (config.isMetricsOn() && compactionTimer == null) {
    // Bug fix: this previously called createTimer(commitTimerName), which folded
    // compaction timings into the commit metric and left compactionTimerName
    // (initialized from HoodieTimeline.COMPACTION_ACTION) unused.
    compactionTimer = createTimer(compactionTimerName);
  }
  return compactionTimer == null ? null : compactionTimer.time();
}
public Timer.Context getCleanCtx() {
if (config.isMetricsOn() && cleanTimer == null) {
cleanTimer = createTimer(cleanTimerName);

View File

@@ -167,11 +167,6 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
}
/**
 * Compaction is intentionally unsupported for copy-on-write storage: every commit
 * already rewrites base files in full, so there are no delta log files to merge.
 *
 * @throws HoodieNotSupportedException always
 */
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
}
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionInstantTime,
HoodieCompactionPlan compactionPlan) {

View File

@@ -36,6 +36,7 @@ import com.uber.hoodie.common.table.log.block.HoodieCommandBlock.HoodieCommandBl
import com.uber.hoodie.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieCompactionException;
@@ -51,6 +52,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -148,22 +150,14 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
logger.info("Compacting merge on read table " + config.getBasePath());
HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor();
try {
return compactor.generateCompactionPlan(jsc, this, config, instantTime);
return compactor.generateCompactionPlan(jsc, this, config, instantTime,
new HashSet<>(((HoodieTableFileSystemView)getRTFileSystemView())
.getFileIdToPendingCompaction().keySet()));
} catch (IOException e) {
throw new HoodieCompactionException("Could not schedule compaction " + config.getBasePath(), e);
}
}
/**
 * Runs compaction for the given instant by delegating to the realtime table
 * compactor. Any I/O failure is rethrown as a {@link HoodieCompactionException}
 * tagged with this table's base path.
 *
 * @param jsc Spark context used to execute the compaction
 * @param compactionCommitTime instant time of the compaction to run
 * @return write statuses produced by the compaction
 */
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionCommitTime) {
  final HoodieRealtimeTableCompactor realtimeCompactor = new HoodieRealtimeTableCompactor();
  try {
    return realtimeCompactor.compact(jsc, config, this, compactionCommitTime);
  } catch (IOException ioe) {
    throw new HoodieCompactionException("Could not compact " + config.getBasePath(), ioe);
  }
}
@Override
public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionInstantTime,
HoodieCompactionPlan compactionPlan) {
@@ -185,7 +179,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
}
Map<String, HoodieInstant> commitsAndCompactions = this.getActiveTimeline()
.getTimelineOfActions(Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.DELTA_COMMIT_ACTION)).getInstants()
HoodieActiveTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.COMPACTION_ACTION)).getInstants()
.filter(i -> commits.contains(i.getTimestamp()))
.collect(Collectors.toMap(i -> i.getTimestamp(), i -> i));
@@ -219,6 +213,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
switch (instant.getAction()) {
case HoodieTimeline.COMMIT_ACTION:
case HoodieTimeline.COMPACTION_ACTION:
try {
Map<FileStatus, Boolean> results = super
.deleteCleanedFiles(partitionPath, Arrays.asList(commit));

View File

@@ -223,13 +223,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
/**
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data
* access
* @deprecated Will be replaced with newer APIs
*/
@Deprecated
public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime);
/**
* Run Compaction on the table. Compaction arranges the data so that it is optimized for data access
*
* @param jsc Spark Context
* @param compactionInstantTime Instant Time