Implement Compaction policy abstraction. Implement LogFileSizeBased Bounded IO Compaction as the default strategy
This commit is contained in:
committed by
prazanna
parent
82b211d2e6
commit
91b088f29f
@@ -1,34 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.io.compact;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Implementations of CompactionFilter allows prioritizing and filtering certain type of
|
||||
* compactions over other compactions.
|
||||
*
|
||||
* e.g. Filter in-efficient compaction like compacting a very large old parquet file with a small avro file
|
||||
*/
|
||||
public interface CompactionFilter {
|
||||
List<CompactionOperation> filter(List<CompactionOperation> input);
|
||||
|
||||
// Default implementation - do not filter anything
|
||||
static CompactionFilter allowAll() {
|
||||
return s -> s;
|
||||
}
|
||||
}
|
||||
@@ -19,61 +19,72 @@ package com.uber.hoodie.io.compact;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.table.log.HoodieLogFile;
|
||||
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.compact.strategy.CompactionStrategy;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Encapsulates all the needed information about a compaction
|
||||
* and make a decision whether this compaction is effective or not
|
||||
*
|
||||
* @see CompactionFilter
|
||||
* @see CompactionStrategy
|
||||
*/
|
||||
public class CompactionOperation implements Serializable {
|
||||
private String dataFileCommitTime;
|
||||
private long dataFileSize;
|
||||
private List<String> deltaFilePaths;
|
||||
private String dataFilePath;
|
||||
private String fileId;
|
||||
private String partitionPath;
|
||||
|
||||
//Only for serialization/de-serialization
|
||||
@Deprecated
|
||||
public CompactionOperation() {
|
||||
}
|
||||
private String dataFileCommitTime;
|
||||
private long dataFileSize;
|
||||
private List<String> deltaFilePaths;
|
||||
private String dataFilePath;
|
||||
private String fileId;
|
||||
private String partitionPath;
|
||||
private Map<String, Object> metrics;
|
||||
|
||||
public CompactionOperation(HoodieDataFile dataFile, String partitionPath,
|
||||
List<HoodieLogFile> value) {
|
||||
this.dataFilePath = dataFile.getPath();
|
||||
this.fileId = dataFile.getFileId();
|
||||
this.partitionPath = partitionPath;
|
||||
this.dataFileCommitTime = dataFile.getCommitTime();
|
||||
this.dataFileSize = dataFile.getFileStatus().getLen();
|
||||
this.deltaFilePaths = value.stream().map(s -> s.getPath().toString()).collect(
|
||||
Collectors.toList());
|
||||
}
|
||||
//Only for serialization/de-serialization
|
||||
@Deprecated
|
||||
public CompactionOperation() {
|
||||
}
|
||||
|
||||
public String getDataFileCommitTime() {
|
||||
return dataFileCommitTime;
|
||||
}
|
||||
public CompactionOperation(HoodieDataFile dataFile, String partitionPath,
|
||||
List<HoodieLogFile> logFiles, HoodieWriteConfig writeConfig) {
|
||||
this.dataFilePath = dataFile.getPath();
|
||||
this.fileId = dataFile.getFileId();
|
||||
this.partitionPath = partitionPath;
|
||||
this.dataFileCommitTime = dataFile.getCommitTime();
|
||||
this.dataFileSize = dataFile.getFileSize();
|
||||
this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString()).collect(
|
||||
Collectors.toList());
|
||||
this.metrics = writeConfig.getCompactionStrategy()
|
||||
.captureMetrics(dataFile, partitionPath, logFiles);
|
||||
}
|
||||
|
||||
public long getDataFileSize() {
|
||||
return dataFileSize;
|
||||
}
|
||||
public String getDataFileCommitTime() {
|
||||
return dataFileCommitTime;
|
||||
}
|
||||
|
||||
public List<String> getDeltaFilePaths() {
|
||||
return deltaFilePaths;
|
||||
}
|
||||
public long getDataFileSize() {
|
||||
return dataFileSize;
|
||||
}
|
||||
|
||||
public String getDataFilePath() {
|
||||
return dataFilePath;
|
||||
}
|
||||
public List<String> getDeltaFilePaths() {
|
||||
return deltaFilePaths;
|
||||
}
|
||||
|
||||
public String getFileId() {
|
||||
return fileId;
|
||||
}
|
||||
public String getDataFilePath() {
|
||||
return dataFilePath;
|
||||
}
|
||||
|
||||
public String getPartitionPath() {
|
||||
return partitionPath;
|
||||
}
|
||||
public String getFileId() {
|
||||
return fileId;
|
||||
}
|
||||
|
||||
public String getPartitionPath() {
|
||||
return partitionPath;
|
||||
}
|
||||
|
||||
public Map<String, Object> getMetrics() {
|
||||
return metrics;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ public interface HoodieCompactor extends Serializable {
|
||||
* @throws Exception
|
||||
*/
|
||||
HoodieCompactionMetadata compact(JavaSparkContext jsc, final HoodieWriteConfig config,
|
||||
HoodieTable hoodieTable, CompactionFilter compactionFilter) throws Exception;
|
||||
HoodieTable hoodieTable) throws Exception;
|
||||
|
||||
|
||||
// Helper methods
|
||||
|
||||
@@ -64,7 +64,7 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
||||
|
||||
@Override
|
||||
public HoodieCompactionMetadata compact(JavaSparkContext jsc, HoodieWriteConfig config,
|
||||
HoodieTable hoodieTable, CompactionFilter compactionFilter) throws IOException {
|
||||
HoodieTable hoodieTable) throws IOException {
|
||||
Preconditions.checkArgument(
|
||||
hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
|
||||
"HoodieRealtimeTableCompactor can only compact table of type "
|
||||
@@ -86,12 +86,12 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
|
||||
.getFileSystemView()
|
||||
.groupLatestDataFileWithLogFiles(partitionPath).entrySet()
|
||||
.stream()
|
||||
.map(s -> new CompactionOperation(s.getKey(), partitionPath, s.getValue()))
|
||||
.map(s -> new CompactionOperation(s.getKey(), partitionPath, s.getValue(), config))
|
||||
.collect(toList()).iterator()).collect();
|
||||
log.info("Total of " + operations.size() + " compactions are retrieved");
|
||||
|
||||
// Filter the compactions with the passed in filter. This lets us choose most effective compactions only
|
||||
operations = compactionFilter.filter(operations);
|
||||
operations = config.getCompactionStrategy().orderAndFilter(config, operations);
|
||||
if (operations.isEmpty()) {
|
||||
log.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
|
||||
return null;
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.io.compact.strategy;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.table.log.HoodieLogFile;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.compact.CompactionOperation;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
/**
|
||||
* CompactionStrategy which looks at total IO to be done for the compaction (read + write)
|
||||
* and limits the list of compactions to be under a configured limit on the IO
|
||||
*
|
||||
* @see CompactionStrategy
|
||||
*/
|
||||
public class BoundedIOCompactionStrategy implements CompactionStrategy {
|
||||
|
||||
public static final String TOTAL_IO_READ_MB = "TOTAL_IO_READ_MB";
|
||||
public static final String TOTAL_IO_WRITE_MB = "TOTAL_IO_WRITE_MB";
|
||||
public static final String TOTAL_IO_MB = "TOTAL_IO_MB";
|
||||
|
||||
@Override
|
||||
public Map<String, Object> captureMetrics(HoodieDataFile dataFile, String partitionPath,
|
||||
List<HoodieLogFile> logFiles) {
|
||||
Map<String, Object> metrics = Maps.newHashMap();
|
||||
// Total size of all the log files
|
||||
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(
|
||||
Optional::isPresent).map(Optional::get).reduce(
|
||||
(size1, size2) -> size1 + size2).orElse(0L);
|
||||
// Total read will be the base file + all the log files
|
||||
Long totalIORead = FSUtils.getSizeInMB(dataFile.getFileSize() + totalLogFileSize);
|
||||
// Total write will be similar to the size of the base file
|
||||
Long totalIOWrite = FSUtils.getSizeInMB(dataFile.getFileSize());
|
||||
// Total IO will the the IO for read + write
|
||||
Long totalIO = totalIORead + totalIOWrite;
|
||||
// Save these metrics and we will use during the filter
|
||||
metrics.put(TOTAL_IO_READ_MB, totalIORead);
|
||||
metrics.put(TOTAL_IO_WRITE_MB, totalIOWrite);
|
||||
metrics.put(TOTAL_IO_MB, totalIO);
|
||||
return metrics;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig, List<CompactionOperation> operations) {
|
||||
// Iterate through the operations in order and accept operations as long as we are within the IO limit
|
||||
// Preserves the original ordering of compactions
|
||||
List<CompactionOperation> finalOperations = Lists.newArrayList();
|
||||
long targetIORemaining = writeConfig.getTargetIOPerCompactionInMB();
|
||||
for (CompactionOperation op : operations) {
|
||||
long opIo = (Long) op.getMetrics().get(TOTAL_IO_MB);
|
||||
targetIORemaining -= opIo;
|
||||
finalOperations.add(op);
|
||||
if (targetIORemaining <= 0) {
|
||||
return finalOperations;
|
||||
}
|
||||
}
|
||||
return finalOperations;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.io.compact.strategy;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.table.log.HoodieLogFile;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.compact.CompactionOperation;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Strategy for compaction. Pluggable implementations define how compaction should be done.
 * Implementations of this interface can capture the relevant metrics to order and filter
 * the final list of compaction operations to run in a single compaction.
 *
 * Implementations of CompactionStrategy cannot hold any state:
 * a different instance may be passed in on every invocation.
 *
 * @see com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor
 * @see CompactionOperation
 */
public interface CompactionStrategy extends Serializable {

  /**
   * Callback hook when a CompactionOperation is created. Individual strategies can
   * capture the metrics they need to decide on the priority.
   *
   * @param dataFile - Base file to compact
   * @param partitionPath - Partition path
   * @param logFiles - List of log files to compact with the base file
   * @return Map[String, Object] - metrics captured
   */
  Map<String, Object> captureMetrics(HoodieDataFile dataFile, String partitionPath,
      List<HoodieLogFile> logFiles);

  /**
   * Order and Filter the list of compactions. Use the metrics captured with the
   * captureMetrics to order and filter out compactions
   *
   * @param writeConfig - HoodieWriteConfig - config for this compaction is passed in
   * @param operations - list of compactions collected
   * @return list of compactions to perform in this run
   */
  List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
      List<CompactionOperation> operations);
}
|
||||
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.io.compact.strategy;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.table.log.HoodieLogFile;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.compact.CompactionOperation;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* LogFileSizeBasedCompactionStrategy orders the compactions based on the total log files size
|
||||
* and limits the compactions within a configured IO bound
|
||||
*
|
||||
* @see BoundedIOCompactionStrategy
|
||||
* @see CompactionStrategy
|
||||
*/
|
||||
public class LogFileSizeBasedCompactionStrategy extends BoundedIOCompactionStrategy implements
|
||||
Comparator<CompactionOperation> {
|
||||
|
||||
private static final String TOTAL_LOG_FILE_SIZE = "TOTAL_LOG_FILE_SIZE";
|
||||
|
||||
@Override
|
||||
public Map<String, Object> captureMetrics(HoodieDataFile dataFile, String partitionPath,
|
||||
List<HoodieLogFile> logFiles) {
|
||||
|
||||
Map<String, Object> metrics = super.captureMetrics(dataFile, partitionPath, logFiles);
|
||||
// Total size of all the log files
|
||||
Long totalLogFileSize = logFiles.stream().map(HoodieLogFile::getFileSize).filter(
|
||||
Optional::isPresent).map(Optional::get).reduce(
|
||||
(size1, size2) -> size1 + size2).orElse(0L);
|
||||
// save the metrics needed during the order
|
||||
metrics.put(TOTAL_LOG_FILE_SIZE, totalLogFileSize);
|
||||
return metrics;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<CompactionOperation> orderAndFilter(HoodieWriteConfig writeConfig,
|
||||
List<CompactionOperation> operations) {
|
||||
// Order the operations based on the reverse size of the logs and limit them by the IO
|
||||
return super
|
||||
.orderAndFilter(writeConfig, operations.stream().sorted(this).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(CompactionOperation op1, CompactionOperation op2) {
|
||||
Long totalLogSize1 = (Long) op1.getMetrics().get(TOTAL_LOG_FILE_SIZE);
|
||||
Long totalLogSize2 = (Long) op2.getMetrics().get(TOTAL_LOG_FILE_SIZE);
|
||||
// Reverse the comparison order - so that larger log file size is compacted first
|
||||
return totalLogSize2.compareTo(totalLogSize1);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.io.compact.strategy;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.table.log.HoodieLogFile;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.compact.CompactionOperation;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * UnBoundedCompactionStrategy will not change ordering or filter any compaction.
 * It is a pass-through and will compact all the base files which have a log file.
 * This usually means no intelligence on compaction.
 *
 * @see CompactionStrategy
 */
public class UnBoundedCompactionStrategy implements CompactionStrategy {

  @Override
  public Map<String, Object> captureMetrics(HoodieDataFile dataFile, String partitionPath,
      List<HoodieLogFile> logFiles) {
    // No metrics needed - this strategy never orders or filters
    return Maps.newHashMap();
  }

  @Override
  public List<CompactionOperation> orderAndFilter(HoodieWriteConfig config,
      List<CompactionOperation> operations) {
    // Pass-through: every collected operation is compacted, in the given order
    return operations;
  }
}
|
||||
Reference in New Issue
Block a user