1
0

(1) Define CompactionWorkload in avro to allow storing them in instant files.

(2) Split APIs in HoodieRealtimeCompactor to separate generating compaction workload from running compaction
This commit is contained in:
Balaji Varadarajan
2018-05-23 20:49:24 -07:00
committed by vinoth chandar
parent 6d01ae8ca0
commit 1b61f04e05
16 changed files with 444 additions and 129 deletions

View File

@@ -0,0 +1,113 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.model;
import com.uber.hoodie.avro.model.HoodieCompactionOperation;
import com.uber.hoodie.common.util.FSUtils;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
/**
* Encapsulates all the needed information about a compaction and make a decision whether this
* compaction is effective or not
*
*/
public class CompactionOperation implements Serializable {
private String baseInstantTime;
private Optional<String> dataFileCommitTime;
private List<String> deltaFilePaths;
private Optional<String> dataFilePath;
private String fileId;
private String partitionPath;
private Map<String, Double> metrics;
//Only for serialization/de-serialization
@Deprecated
public CompactionOperation() {
}
public CompactionOperation(Optional<HoodieDataFile> dataFile, String partitionPath,
List<HoodieLogFile> logFiles, Map<String, Double> metrics) {
if (dataFile.isPresent()) {
this.baseInstantTime = dataFile.get().getCommitTime();
this.dataFilePath = Optional.of(dataFile.get().getPath());
this.fileId = dataFile.get().getFileId();
this.dataFileCommitTime = Optional.of(dataFile.get().getCommitTime());
} else {
assert logFiles.size() > 0;
this.dataFilePath = Optional.empty();
this.baseInstantTime = FSUtils.getBaseCommitTimeFromLogPath(logFiles.get(0).getPath());
this.fileId = FSUtils.getFileIdFromLogPath(logFiles.get(0).getPath());
this.dataFileCommitTime = Optional.empty();
}
this.partitionPath = partitionPath;
this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString())
.collect(Collectors.toList());
this.metrics = metrics;
}
public String getBaseInstantTime() {
return baseInstantTime;
}
public Optional<String> getDataFileCommitTime() {
return dataFileCommitTime;
}
public List<String> getDeltaFilePaths() {
return deltaFilePaths;
}
public Optional<String> getDataFilePath() {
return dataFilePath;
}
public String getFileId() {
return fileId;
}
public String getPartitionPath() {
return partitionPath;
}
public Map<String, Double> getMetrics() {
return metrics;
}
/**
* Convert Avro generated Compaction operation to POJO for Spark RDD operation
* @param operation Hoodie Compaction Operation
* @return
*/
public static CompactionOperation convertFromAvroRecordInstance(HoodieCompactionOperation operation) {
CompactionOperation op = new CompactionOperation();
op.baseInstantTime = operation.getBaseInstantTime();
op.dataFilePath = Optional.ofNullable(operation.getDataFilePath());
op.deltaFilePaths = new ArrayList<>(operation.getDeltaFilePaths());
op.fileId = operation.getFileId();
op.metrics = new HashMap<>(operation.getMetrics());
op.partitionPath = operation.getPartitionPath();
return op;
}
}

View File

@@ -21,6 +21,7 @@ import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
import com.uber.hoodie.avro.model.HoodieCleanPartitionMetadata;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.avro.model.HoodieRollbackMetadata;
import com.uber.hoodie.avro.model.HoodieRollbackPartitionMetadata;
import com.uber.hoodie.avro.model.HoodieSavepointMetadata;
@@ -142,6 +143,10 @@ public class AvroUtils {
partitionMetadataBuilder.build());
}
public static Optional<byte[]> serializeCompactionWorkload(HoodieCompactionPlan compactionWorkload)
throws IOException {
return serializeAvroMetadata(compactionWorkload, HoodieCompactionPlan.class);
}
public static Optional<byte[]> serializeCleanMetadata(HoodieCleanMetadata metadata)
throws IOException {
@@ -169,6 +174,11 @@ public class AvroUtils {
return Optional.of(baos.toByteArray());
}
public static HoodieCompactionPlan deserializeHoodieCompactionPlan(byte[] bytes)
throws IOException {
return deserializeAvroMetadata(bytes, HoodieCompactionPlan.class);
}
public static HoodieCleanMetadata deserializeHoodieCleanMetadata(byte[] bytes)
throws IOException {
return deserializeAvroMetadata(bytes, HoodieCleanMetadata.class);
@@ -188,5 +198,4 @@ public class AvroUtils {
.checkArgument(fileReader.hasNext(), "Could not deserialize metadata of type " + clazz);
return fileReader.next();
}
}

View File

@@ -0,0 +1,100 @@
/*
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.util;
import com.uber.hoodie.avro.model.HoodieCompactionOperation;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.common.model.CompactionOperation;
import com.uber.hoodie.common.model.FileSlice;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import javafx.util.Pair;
/**
* Helper class to generate compaction workload from FileGroup/FileSlice abstraction
*/
public class CompactionUtils {
/**
* Generate compaction operation from file-slice
*
* @param partitionPath Partition path
* @param fileSlice File Slice
* @param metricsCaptureFunction Metrics Capture function
* @return Compaction Operation
*/
public static HoodieCompactionOperation buildFromFileSlice(String partitionPath, FileSlice fileSlice,
Optional<Function<Pair<String, FileSlice>, Map<String, Long>>> metricsCaptureFunction) {
HoodieCompactionOperation.Builder builder = HoodieCompactionOperation.newBuilder();
builder.setPartitionPath(partitionPath);
builder.setFileId(fileSlice.getFileId());
builder.setBaseInstantTime(fileSlice.getBaseInstantTime());
builder.setDeltaFilePaths(fileSlice.getLogFiles().map(lf -> lf.getPath().toString()).collect(Collectors.toList()));
if (fileSlice.getDataFile().isPresent()) {
builder.setDataFilePath(fileSlice.getDataFile().get().getPath());
}
if (metricsCaptureFunction.isPresent()) {
builder.setMetrics(metricsCaptureFunction.get().apply(new Pair(partitionPath, fileSlice)));
}
return builder.build();
}
/**
* Generate compaction workload from file-slices
*
* @param compactorId Compactor Id to set
* @param partitionFileSlicePairs list of partition file-slice pairs
* @param extraMetadata Extra Metadata
* @param metricsCaptureFunction Metrics Capture function
*/
public static HoodieCompactionPlan buildFromFileSlices(String compactorId,
List<Pair<String, FileSlice>> partitionFileSlicePairs,
Optional<Map<String, String>> extraMetadata,
Optional<Function<Pair<String, FileSlice>, Map<String, Long>>> metricsCaptureFunction) {
HoodieCompactionPlan.Builder builder = HoodieCompactionPlan.newBuilder();
builder.setCompactorId(compactorId);
extraMetadata.ifPresent(m -> builder.setExtraMetadata(m));
builder.setOperations(partitionFileSlicePairs.stream().map(pfPair ->
buildFromFileSlice(pfPair.getKey(), pfPair.getValue(), metricsCaptureFunction)).collect(Collectors.toList()));
return builder.build();
}
/**
* Build Avro generated Compaction operation payload from compaction operation POJO for serialization
*/
public static HoodieCompactionOperation buildHoodieCompactionOperation(CompactionOperation op) {
return HoodieCompactionOperation.newBuilder().setFileId(op.getFileId())
.setBaseInstantTime(op.getBaseInstantTime())
.setPartitionPath(op.getPartitionPath())
.setDataFilePath(op.getDataFilePath().isPresent() ? op.getDataFilePath().get() : null)
.setDeltaFilePaths(op.getDeltaFilePaths())
.setMetrics(op.getMetrics()).build();
}
/**
* Build Compaction operation payload from Avro version for using in Spark executors
*
* @param hc HoodieCompactionOperation
*/
public static CompactionOperation buildCompactionOperation(HoodieCompactionOperation hc) {
return CompactionOperation.convertFromAvroRecordInstance(hc);
}
}