(1) Define CompactionWorkload in Avro so that it can be stored in instant files.
(2) Split APIs in HoodieRealtimeCompactor to separate generating the compaction workload from running the compaction
committed by vinoth chandar
parent 6d01ae8ca0
commit 1b61f04e05
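
With this split, scheduling a compaction and executing it become two separate calls on the compactor, and the plan produced by the scheduling step is what gets persisted (via the Avro-generated HoodieCompactionPlan) into an instant file. A minimal caller sketch, illustrative only and not part of this diff, assuming a JavaSparkContext jsc, a HoodieTable hoodieTable, a HoodieWriteConfig config and a compactionCommitTime are already in scope (error handling omitted):

// Step 1: scheduling - build an Avro-serializable plan describing which file slices to compact.
HoodieCompactor compactor = new HoodieRealtimeTableCompactor();
HoodieCompactionPlan plan =
    compactor.generateCompactionPlan(jsc, hoodieTable, config, compactionCommitTime);

// Step 2: execution - run the previously generated plan and collect the resulting write statuses.
JavaRDD<WriteStatus> statuses =
    compactor.compact(jsc, plan, hoodieTable, config, compactionCommitTime);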
@@ -108,6 +108,7 @@ public class HoodieCompactionConfig extends DefaultHoodieConfig {
      + ".partitions";
  // 500GB of target IO per compaction (both read and write)
  public static final String DEFAULT_TARGET_PARTITIONS_PER_DAYBASED_COMPACTION = String.valueOf(10);
  public static final String DEFAULT_COMPACTOR_ID = "default";

  private HoodieCompactionConfig(Properties props) {
    super(props);
@@ -1,99 +0,0 @@
/*
 * Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.uber.hoodie.io.compact;

import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

/**
 * Encapsulates all the needed information about a compaction and make a decision whether this
 * compaction is effective or not
 *
 * @see CompactionStrategy
 */
public class CompactionOperation implements Serializable {

  private Optional<String> dataFileCommitTime;
  private Optional<Long> dataFileSize;
  private List<String> deltaFilePaths;
  private Optional<String> dataFilePath;
  private String fileId;
  private String partitionPath;
  private Map<String, Object> metrics;

  //Only for serialization/de-serialization
  @Deprecated
  public CompactionOperation() {
  }

  public CompactionOperation(Optional<HoodieDataFile> dataFile, String partitionPath,
      List<HoodieLogFile> logFiles, HoodieWriteConfig writeConfig) {
    if (dataFile.isPresent()) {
      this.dataFilePath = Optional.of(dataFile.get().getPath());
      this.fileId = dataFile.get().getFileId();
      this.dataFileCommitTime = Optional.of(dataFile.get().getCommitTime());
      this.dataFileSize = Optional.of(dataFile.get().getFileSize());
    } else {
      assert logFiles.size() > 0;
      this.dataFilePath = Optional.empty();
      this.fileId = FSUtils.getFileIdFromLogPath(logFiles.get(0).getPath());
      this.dataFileCommitTime = Optional.empty();
      this.dataFileSize = Optional.empty();
    }
    this.partitionPath = partitionPath;
    this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString())
        .collect(Collectors.toList());
    this.metrics = writeConfig.getCompactionStrategy()
        .captureMetrics(writeConfig, dataFile, partitionPath, logFiles);
  }

  public Optional<String> getDataFileCommitTime() {
    return dataFileCommitTime;
  }

  public Optional<Long> getDataFileSize() {
    return dataFileSize;
  }

  public List<String> getDeltaFilePaths() {
    return deltaFilePaths;
  }

  public Optional<String> getDataFilePath() {
    return dataFilePath;
  }

  public String getFileId() {
    return fileId;
  }

  public String getPartitionPath() {
    return partitionPath;
  }

  public Map<String, Object> getMetrics() {
    return metrics;
  }
}
@@ -17,13 +17,11 @@
package com.uber.hoodie.io.compact;

import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.io.Serializable;
import java.util.Date;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

@@ -34,17 +32,31 @@ public interface HoodieCompactor extends Serializable {

  /**
   * Compact the delta files with the data files
   *
   * @deprecated : Will be removed in next PR
   */
  @Deprecated
  JavaRDD<WriteStatus> compact(JavaSparkContext jsc, final HoodieWriteConfig config,
      HoodieTable hoodieTable, String compactionCommitTime) throws Exception;

  /**
   * Generate a new compaction plan for scheduling
   *
   * @param jsc Spark Context
   * @param hoodieTable Hoodie Table
   * @param config Hoodie Write Configuration
   * @param compactionCommitTime scheduled compaction commit time
   * @return Compaction Plan
   * @throws IOException when encountering errors
   */
  HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc,
      HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime)
      throws IOException;

  // Helper methods
  default String startCompactionCommit(HoodieTable hoodieTable) {
    String commitTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
    HoodieActiveTimeline activeTimeline = hoodieTable.getActiveTimeline();
    activeTimeline
        .createInflight(new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime));
    return commitTime;
  }
}
  /**
   * Execute compaction operations and report back status
   */
  JavaRDD<WriteStatus> compact(JavaSparkContext jsc,
      HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config,
      String compactionCommitTime) throws IOException;
}
@@ -22,6 +22,10 @@ import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.avro.model.HoodieCompactionOperation;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.common.model.CompactionOperation;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.model.HoodieWriteStat.RuntimeStats;
@@ -29,6 +33,7 @@ import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.log.HoodieMergedLogRecordScanner;
import com.uber.hoodie.common.util.CompactionUtils;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
@@ -36,9 +41,11 @@ import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.avro.Schema;
@@ -70,26 +77,25 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
  public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, HoodieWriteConfig config,
      HoodieTable hoodieTable, String compactionCommitTime) throws IOException {

    totalLogFiles = new LongAccumulator();
    totalFileSlices = new LongAccumulator();
    jsc.sc().register(totalLogFiles);
    jsc.sc().register(totalFileSlices);

    List<CompactionOperation> operations = getCompactionWorkload(jsc, hoodieTable, config,
    HoodieCompactionPlan compactionPlan = generateCompactionPlan(jsc, hoodieTable, config,
        compactionCommitTime);
    if (operations == null) {
    List<HoodieCompactionOperation> operations = compactionPlan.getOperations();
    if ((operations == null) || (operations.isEmpty())) {
      return jsc.emptyRDD();
    }
    return executeCompaction(jsc, operations, hoodieTable, config, compactionCommitTime);
    return compact(jsc, compactionPlan, hoodieTable, config, compactionCommitTime);
  }

  private JavaRDD<WriteStatus> executeCompaction(JavaSparkContext jsc,
      List<CompactionOperation> operations, HoodieTable hoodieTable, HoodieWriteConfig config,
  @Override
  public JavaRDD<WriteStatus> compact(JavaSparkContext jsc,
      HoodieCompactionPlan compactionPlan, HoodieTable hoodieTable, HoodieWriteConfig config,
      String compactionCommitTime) throws IOException {
    HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
    // Compacting is very similar to applying updates to existing file
    HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, jsc);
    log.info("After filtering, Compacting " + operations + " files");
    List<CompactionOperation> operations = compactionPlan.getOperations().stream()
        .map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
    log.info("Compactor " + compactionPlan.getCompactorId() + " running, Compacting " + operations + " files");
    return jsc.parallelize(operations, operations.size())
        .map(s -> compact(table, metaClient, config, s, compactionCommitTime))
        .flatMap(writeStatusesItr -> writeStatusesItr.iterator());
@@ -144,8 +150,8 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
      s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles());
      s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
      s.getStat().setPartitionPath(operation.getPartitionPath());
      s.getStat().setTotalLogSizeCompacted((long) operation.getMetrics().get(
          CompactionStrategy.TOTAL_LOG_FILE_SIZE));
      s.getStat().setTotalLogSizeCompacted(operation.getMetrics().get(
          CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue());
      s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks());
      s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks());
      s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks());
@@ -156,10 +162,16 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
    }).collect(toList());
  }

  private List<CompactionOperation> getCompactionWorkload(JavaSparkContext jsc,
  @Override
  public HoodieCompactionPlan generateCompactionPlan(JavaSparkContext jsc,
      HoodieTable hoodieTable, HoodieWriteConfig config, String compactionCommitTime)
      throws IOException {

    totalLogFiles = new LongAccumulator();
    totalFileSlices = new LongAccumulator();
    jsc.sc().register(totalLogFiles);
    jsc.sc().register(totalFileSlices);

    Preconditions
        .checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
            "HoodieRealtimeTableCompactor can only compact table of type "
@@ -176,7 +188,7 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {

    TableFileSystemView.RealtimeView fileSystemView = hoodieTable.getRTFileSystemView();
    log.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
    List<CompactionOperation> operations =
    List<HoodieCompactionOperation> operations =
        jsc.parallelize(partitionPaths, partitionPaths.size())
            .flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
                .getLatestFileSlices(partitionPath).map(
@@ -185,10 +197,16 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
                    .getBaseInstantAndLogVersionComparator().reversed()).collect(Collectors.toList());
                totalLogFiles.add((long) logFiles.size());
                totalFileSlices.add(1L);
                return new CompactionOperation(s.getDataFile(), partitionPath, logFiles, config);
                // Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO
                // for spark Map operations and collecting them finally in Avro generated classes for storing
                // into meta files.
                Optional<HoodieDataFile> dataFile = s.getDataFile();
                return new CompactionOperation(dataFile, partitionPath, logFiles,
                    config.getCompactionStrategy().captureMetrics(config, dataFile, partitionPath, logFiles));
              })
            .filter(c -> !c.getDeltaFilePaths().isEmpty())
            .collect(toList()).iterator()).collect();
            .collect(toList()).iterator()).collect().stream().map(CompactionUtils::buildHoodieCompactionOperation)
            .collect(toList());
    log.info("Total of " + operations.size() + " compactions are retrieved");
    log.info("Total number of latest files slices " + totalFileSlices.value());
    log.info("Total number of log files " + totalLogFiles.value());
@@ -196,12 +214,13 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {

    // Filter the compactions with the passed in filter. This lets us choose most effective
    // compactions only
    operations = config.getCompactionStrategy().orderAndFilter(config, operations);
    if (operations.isEmpty()) {
    // TODO: In subsequent PRs, pending Compaction plans will be wired in. Strategy can look at pending compaction
    // plans to schedule next compaction plan
    HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations,
        new ArrayList<>());
    if (compactionPlan.getOperations().isEmpty()) {
      log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
      return null;
    }
    return operations;
    return compactionPlan;
  }

  }
}
@@ -17,8 +17,9 @@
package com.uber.hoodie.io.compact.strategy;

import com.google.common.collect.Lists;
import com.uber.hoodie.avro.model.HoodieCompactionOperation;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.compact.CompactionOperation;
import java.util.List;

/**
@@ -30,15 +31,15 @@ import java.util.List;
public class BoundedIOCompactionStrategy extends CompactionStrategy {

  @Override
  public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<CompactionOperation> operations) {
  public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
    // Iterate through the operations in order and accept operations as long as we are within the
    // IO limit
    // Preserves the original ordering of compactions
    List<CompactionOperation> finalOperations = Lists.newArrayList();
    List<HoodieCompactionOperation> finalOperations = Lists.newArrayList();
    long targetIORemaining = writeConfig.getTargetIOPerCompactionInMB();
    for (CompactionOperation op : operations) {
      long opIo = (Long) op.getMetrics().get(TOTAL_IO_MB);
    for (HoodieCompactionOperation op : operations) {
      long opIo = op.getMetrics().get(TOTAL_IO_MB).longValue();
      targetIORemaining -= opIo;
      finalOperations.add(op);
      if (targetIORemaining <= 0) {
@@ -17,11 +17,13 @@
package com.uber.hoodie.io.compact.strategy;

import com.google.common.collect.Maps;
import com.uber.hoodie.avro.model.HoodieCompactionOperation;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.compact.CompactionOperation;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
@@ -35,7 +37,6 @@ import java.util.Optional;
 * passed in every time
 *
 * @see com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor
 * @see CompactionOperation
 */
public abstract class CompactionStrategy implements Serializable {

@@ -46,7 +47,7 @@ public abstract class CompactionStrategy implements Serializable {
  public static final String TOTAL_LOG_FILES = "TOTAL_LOG_FILES";

  /**
   * Callback hook when a CompactionOperation is created. Individual strategies can capture the
   * Callback hook when a HoodieCompactionOperation is created. Individual strategies can capture the
   * metrics they need to decide on the priority.
   *
   * @param dataFile - Base file to compact
@@ -54,9 +55,9 @@ public abstract class CompactionStrategy implements Serializable {
   * @param logFiles - List of log files to compact with the base file
   * @return Map[String, Object] - metrics captured
   */
  public Map<String, Object> captureMetrics(HoodieWriteConfig writeConfig, Optional<HoodieDataFile> dataFile, String
      partitionPath, List<HoodieLogFile> logFiles) {
    Map<String, Object> metrics = Maps.newHashMap();
  public Map<String, Double> captureMetrics(HoodieWriteConfig writeConfig, Optional<HoodieDataFile> dataFile,
      String partitionPath, List<HoodieLogFile> logFiles) {
    Map<String, Double> metrics = Maps.newHashMap();
    Long defaultMaxParquetFileSize = writeConfig.getParquetMaxFileSize();
    // Total size of all the log files
    Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(Optional::isPresent)
@@ -70,25 +71,44 @@ public abstract class CompactionStrategy implements Serializable {
    // Total IO will the the IO for read + write
    Long totalIO = totalIORead + totalIOWrite;
    // Save these metrics and we will use during the filter
    metrics.put(TOTAL_IO_READ_MB, totalIORead);
    metrics.put(TOTAL_IO_WRITE_MB, totalIOWrite);
    metrics.put(TOTAL_IO_MB, totalIO);
    metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize);
    metrics.put(TOTAL_LOG_FILES, logFiles.size());
    metrics.put(TOTAL_IO_READ_MB, totalIORead.doubleValue());
    metrics.put(TOTAL_IO_WRITE_MB, totalIOWrite.doubleValue());
    metrics.put(TOTAL_IO_MB, totalIO.doubleValue());
    metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize.doubleValue());
    metrics.put(TOTAL_LOG_FILES, Double.valueOf(logFiles.size()));
    return metrics;

  }

  /**
   * Order and Filter the list of compactions. Use the metrics captured with the captureMetrics to
   * order and filter out compactions
   * Generate Compaction plan. Allows clients to order and filter the list of compactions to be set. The default
   * implementation takes care of setting compactor Id from configuration allowing subclasses to only worry about
   * ordering and filtering compaction operations
   *
   * @param writeConfig - HoodieWriteConfig - config for this compaction is passed in
   * @param operations - list of compactions collected
   * @param writeConfig Hoodie Write Config
   * @param operations Compaction Operations to be ordered and filtered
   * @param pendingCompactionPlans Pending Compaction Plans for strategy to schedule next compaction plan
   * @return Compaction plan to be scheduled.
   */
  public HoodieCompactionPlan generateCompactionPlan(HoodieWriteConfig writeConfig,
      List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
    // Strategy implementation can overload this method to set specific compactor-id
    return HoodieCompactionPlan.newBuilder().setCompactorId(HoodieCompactionConfig.DEFAULT_COMPACTOR_ID)
        .setOperations(orderAndFilter(writeConfig, operations, pendingCompactionPlans))
        .build();
  }

  /**
   * Order and Filter the list of compactions. Use the metrics captured with the captureMetrics to order and filter out
   * compactions
   *
   * @param writeConfig config for this compaction is passed in
   * @param operations list of compactions collected
   * @param pendingCompactionPlans Pending Compaction Plans for strategy to schedule next compaction plan
   * @return list of compactions to perform in this run
   */
  public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<CompactionOperation> operations) {
  protected List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<HoodieCompactionOperation> operations,
      List<HoodieCompactionPlan> pendingCompactionPlans) {
    return operations;
  }
}
@@ -18,9 +18,10 @@
package com.uber.hoodie.io.compact.strategy;

import com.google.common.annotations.VisibleForTesting;
import com.uber.hoodie.avro.model.HoodieCompactionOperation;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.io.compact.CompactionOperation;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Comparator;
@@ -58,12 +59,12 @@ public class DayBasedCompactionStrategy extends CompactionStrategy {
  }

  @Override
  public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<CompactionOperation> operations) {
  public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
    // Iterate through the operations and accept operations as long as we are within the configured target partitions
    // limit
    List<CompactionOperation> filteredList = operations.stream()
        .collect(Collectors.groupingBy(CompactionOperation::getPartitionPath)).entrySet().stream()
    List<HoodieCompactionOperation> filteredList = operations.stream()
        .collect(Collectors.groupingBy(HoodieCompactionOperation::getPartitionPath)).entrySet().stream()
        .sorted(Map.Entry.comparingByKey(comparator)).limit(writeConfig.getTargetPartitionsPerDayBasedCompaction())
        .flatMap(e -> e.getValue().stream())
        .collect(Collectors.toList());
@@ -16,10 +16,11 @@

package com.uber.hoodie.io.compact.strategy;

import com.uber.hoodie.avro.model.HoodieCompactionOperation;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.compact.CompactionOperation;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
@@ -34,37 +35,37 @@ import java.util.stream.Collectors;
 * @see CompactionStrategy
 */
public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy implements
    Comparator<CompactionOperation> {
    Comparator<HoodieCompactionOperation> {

  private static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILE_SIZE";

  @Override
  public Map<String, Object> captureMetrics(HoodieWriteConfig config, Optional<HoodieDataFile> dataFile, String
      partitionPath,
      List<HoodieLogFile> logFiles) {
  public Map<String, Double> captureMetrics(HoodieWriteConfig config, Optional<HoodieDataFile> dataFile,
      String partitionPath, List<HoodieLogFile> logFiles) {
    Map<String, Double> metrics = super.captureMetrics(config, dataFile, partitionPath, logFiles);

    Map<String, Object> metrics = super.captureMetrics(config, dataFile, partitionPath, logFiles);
    // Total size of all the log files
    Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize)
        .filter(Optional::isPresent).map(Optional::get).reduce((size1, size2) -> size1 + size2)
        .orElse(0L);
    // save the metrics needed during the order
    metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize);
    metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize.doubleValue());
    return metrics;
  }

  @Override
  public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<CompactionOperation> operations) {
  public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
    // Order the operations based on the reverse size of the logs and limit them by the IO
    return super
        .orderAndFilter(writeConfig, operations.stream().sorted(this).collect(Collectors.toList()));
        .orderAndFilter(writeConfig,
            operations.stream().sorted(this).collect(Collectors.toList()), pendingCompactionPlans);
  }

  @Override
  public int compare(CompactionOperation op1, CompactionOperation op2) {
    Long totalLogSize1 = (Long) op1.getMetrics().get(TOTAL_LOG_FILE_SIZE);
    Long totalLogSize2 = (Long) op2.getMetrics().get(TOTAL_LOG_FILE_SIZE);
  public int compare(HoodieCompactionOperation op1, HoodieCompactionOperation op2) {
    Long totalLogSize1 = op1.getMetrics().get(TOTAL_LOG_FILE_SIZE).longValue();
    Long totalLogSize2 = op2.getMetrics().get(TOTAL_LOG_FILE_SIZE).longValue();
    // Reverse the comparison order - so that larger log file size is compacted first
    return totalLogSize2.compareTo(totalLogSize1);
  }
@@ -16,8 +16,9 @@

package com.uber.hoodie.io.compact.strategy;

import com.uber.hoodie.avro.model.HoodieCompactionOperation;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.io.compact.CompactionOperation;
import java.util.List;

/**
@@ -30,8 +31,8 @@ import java.util.List;
public class UnBoundedCompactionStrategy extends CompactionStrategy {

  @Override
  public List<CompactionOperation> orderAndFilter(HoodieWriteConfig config,
      List<CompactionOperation> operations) {
  public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig config,
      List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionWorkloads) {
    return operations;
  }
}
@@ -19,6 +19,7 @@ package com.uber.hoodie.table;
import com.google.common.collect.Maps;
import com.google.common.hash.Hashing;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.common.HoodieCleanStat;
import com.uber.hoodie.common.HoodieRollbackStat;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
@@ -161,11 +162,22 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
    return true;
  }

  @Override
  public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String commitTime) {
    throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
  }

  @Override
  public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime) {
    throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
  }

  @Override
  public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionInstantTime,
      HoodieCompactionPlan compactionPlan) {
    throw new HoodieNotSupportedException("Compaction is not supported from a CopyOnWrite table");
  }

  public Iterator<List<WriteStatus>> handleUpdate(String commitTime, String fileLoc,
      Iterator<HoodieRecord<T>> recordItr) throws IOException {
    // these are updates
@@ -19,6 +19,7 @@ package com.uber.hoodie.table;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.common.HoodieRollbackStat;
import com.uber.hoodie.common.model.FileSlice;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
@@ -126,7 +127,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
  }

  @Override
  public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionCommitTime) {
  public HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime) {
    logger.info("Checking if compaction needs to be run on " + config.getBasePath());
    Optional<HoodieInstant> lastCompaction = getActiveTimeline().getCommitTimeline()
        .filterCompletedInstants().lastInstant();
@@ -141,10 +142,20 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
      logger.info("Not running compaction as only " + deltaCommitsSinceLastCompaction
          + " delta commits was found since last compaction " + deltaCommitsSinceTs
          + ". Waiting for " + config.getInlineCompactDeltaCommitMax());
      return jsc.emptyRDD();
      return new HoodieCompactionPlan();
    }

    logger.info("Compacting merge on read table " + config.getBasePath());
    HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor();
    try {
      return compactor.generateCompactionPlan(jsc, this, config, instantTime);
    } catch (IOException e) {
      throw new HoodieCompactionException("Could not schedule compaction " + config.getBasePath(), e);
    }
  }

  @Override
  public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionCommitTime) {
    HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor();
    try {
      return compactor.compact(jsc, config, this, compactionCommitTime);
@@ -153,6 +164,17 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
    }
  }

  @Override
  public JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionInstantTime,
      HoodieCompactionPlan compactionPlan) {
    HoodieRealtimeTableCompactor compactor = new HoodieRealtimeTableCompactor();
    try {
      return compactor.compact(jsc, compactionPlan, this, config, compactionInstantTime);
    } catch (IOException e) {
      throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e);
    }
  }

  @Override
  public List<HoodieRollbackStat> rollback(JavaSparkContext jsc, List<String> commits)
      throws IOException {
@@ -17,6 +17,7 @@
package com.uber.hoodie.table;

import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.avro.model.HoodieSavepointMetadata;
import com.uber.hoodie.common.HoodieCleanStat;
import com.uber.hoodie.common.HoodieRollbackStat;
@@ -211,12 +212,32 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
  public abstract Iterator<List<WriteStatus>> handleInsertPartition(String commitTime,
      Integer partition, Iterator<HoodieRecord<T>> recordIterator, Partitioner partitioner);

  /**
   * Schedule compaction for the instant time
   * @param jsc Spark Context
   * @param instantTime Instant Time for scheduling compaction
   * @return
   */
  public abstract HoodieCompactionPlan scheduleCompaction(JavaSparkContext jsc, String instantTime);

  /**
   * Run Compaction on the table. Compaction arranges the data so that it is optimized for data
   * access
   * @deprecated Will be replaced with newer APIs
   */
  @Deprecated
  public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String commitTime);

  /**
   * Run Compaction on the table. Compaction arranges the data so that it is optimized for data access
   *
   * @param jsc Spark Context
   * @param compactionInstantTime Instant Time
   * @param compactionPlan Compaction Plan
   */
  public abstract JavaRDD<WriteStatus> compact(JavaSparkContext jsc, String compactionInstantTime,
      HoodieCompactionPlan compactionPlan);

  /**
   * Clean partition paths according to cleaning policy and returns the number of files cleaned.
   */
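
As context for the strategy-side changes above, a custom strategy under the new signature only needs to order and filter the Avro operation list; the base-class generateCompactionPlan wraps the result into a HoodieCompactionPlan and stamps the compactor id. A rough sketch of such a subclass (hypothetical class name, not part of this commit), assuming it lives in the same strategy package:

package com.uber.hoodie.io.compact.strategy;

import com.uber.hoodie.avro.model.HoodieCompactionOperation;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.config.HoodieWriteConfig;
import java.util.List;
import java.util.stream.Collectors;

public class TakeFirstTenCompactionStrategy extends CompactionStrategy {

  @Override
  public List<HoodieCompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<HoodieCompactionOperation> operations, List<HoodieCompactionPlan> pendingCompactionPlans) {
    // Keep only the first ten operations; generateCompactionPlan(...) in the base class
    // turns whatever is returned here into the HoodieCompactionPlan that gets scheduled.
    return operations.stream().limit(10).collect(Collectors.toList());
  }
}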