1
0

FileSystem View must treat same fileIds present in different partitions as different file-groups and handle pending compaction correctly

This commit is contained in:
Balaji Varadarajan
2019-02-12 21:29:14 -08:00
committed by vinoth chandar
parent 363df2c12e
commit 3ae6cb4ed5
20 changed files with 388 additions and 120 deletions

View File

@@ -38,8 +38,7 @@ public class CompactionOperation implements Serializable {
private Optional<String> dataFileCommitTime;
private List<String> deltaFilePaths;
private Optional<String> dataFilePath;
private String fileId;
private String partitionPath;
private HoodieFileGroupId id;
private Map<String, Double> metrics;
//Only for serialization/de-serialization
@@ -52,17 +51,16 @@ public class CompactionOperation implements Serializable {
if (dataFile.isPresent()) {
this.baseInstantTime = dataFile.get().getCommitTime();
this.dataFilePath = Optional.of(dataFile.get().getPath());
this.fileId = dataFile.get().getFileId();
this.id = new HoodieFileGroupId(partitionPath, dataFile.get().getFileId());
this.dataFileCommitTime = Optional.of(dataFile.get().getCommitTime());
} else {
assert logFiles.size() > 0;
this.dataFilePath = Optional.absent();
this.baseInstantTime = FSUtils.getBaseCommitTimeFromLogPath(logFiles.get(0).getPath());
this.fileId = FSUtils.getFileIdFromLogPath(logFiles.get(0).getPath());
this.id = new HoodieFileGroupId(partitionPath, FSUtils.getFileIdFromLogPath(logFiles.get(0).getPath()));
this.dataFileCommitTime = Optional.absent();
}
this.partitionPath = partitionPath;
this.deltaFilePaths = logFiles.stream().map(s -> s.getPath().toString())
.collect(Collectors.toList());
this.metrics = metrics;
@@ -85,17 +83,21 @@ public class CompactionOperation implements Serializable {
}
public String getFileId() {
return fileId;
return id.getFileId();
}
public String getPartitionPath() {
return partitionPath;
return id.getPartitionPath();
}
public Map<String, Double> getMetrics() {
return metrics;
}
/**
 * Returns the {@link HoodieFileGroupId} held in this operation's {@code id} field, i.e. the
 * same object that {@code getPartitionPath()} and {@code getFileId()} read from.
 */
public HoodieFileGroupId getFileGroupId() {
return id;
}
/**
* Convert Avro generated Compaction operation to POJO for Spark RDD operation
* @param operation Hoodie Compaction Operation
@@ -106,9 +108,8 @@ public class CompactionOperation implements Serializable {
op.baseInstantTime = operation.getBaseInstantTime();
op.dataFilePath = Optional.fromNullable(operation.getDataFilePath());
op.deltaFilePaths = new ArrayList<>(operation.getDeltaFilePaths());
op.fileId = operation.getFileId();
op.id = new HoodieFileGroupId(operation.getPartitionPath(), operation.getFileId());
op.metrics = operation.getMetrics() == null ? new HashMap<>() : new HashMap<>(operation.getMetrics());
op.partitionPath = operation.getPartitionPath();
return op;
}
}

View File

@@ -30,9 +30,9 @@ import java.util.stream.Stream;
public class FileSlice implements Serializable {
/**
* id of the slice
* File Group Id of the Slice
*/
private String fileId;
private HoodieFileGroupId fileGroupId;
/**
* Point in the timeline, at which the slice was created
@@ -50,8 +50,12 @@ public class FileSlice implements Serializable {
*/
private final TreeSet<HoodieLogFile> logFiles;
public FileSlice(String baseInstantTime, String fileId) {
this.fileId = fileId;
/**
 * Convenience constructor: wraps {@code partitionPath} and {@code fileId} in a new
 * {@link HoodieFileGroupId} and delegates to the constructor that accepts one.
 *
 * @param partitionPath partition path component of the file group id
 * @param baseInstantTime base instant time of the slice
 * @param fileId file id component of the file group id
 */
public FileSlice(String partitionPath, String baseInstantTime, String fileId) {
this(new HoodieFileGroupId(partitionPath, fileId), baseInstantTime);
}
public FileSlice(HoodieFileGroupId fileGroupId, String baseInstantTime) {
this.fileGroupId = fileGroupId;
this.baseInstantTime = baseInstantTime;
this.dataFile = null;
this.logFiles = new TreeSet<>(HoodieLogFile.getBaseInstantAndLogVersionComparator());
@@ -73,8 +77,16 @@ public class FileSlice implements Serializable {
return baseInstantTime;
}
/**
 * Returns the partition path component of this slice's file group id.
 */
public String getPartitionPath() {
return fileGroupId.getPartitionPath();
}
public String getFileId() {
return fileId;
return fileGroupId.getFileId();
}
/**
 * Returns the file group id this slice was constructed with.
 */
public HoodieFileGroupId getFileGroupId() {
return fileGroupId;
}
public Optional<HoodieDataFile> getDataFile() {
@@ -84,6 +96,7 @@ public class FileSlice implements Serializable {
@Override
public String toString() {
final StringBuilder sb = new StringBuilder("FileSlice {");
sb.append("fileGroupId=").append(fileGroupId);
sb.append("baseCommitTime=").append(baseInstantTime);
sb.append(", dataFile='").append(dataFile).append('\'');
sb.append(", logFiles='").append(logFiles).append('\'');

View File

@@ -34,14 +34,9 @@ public class HoodieFileGroup implements Serializable {
}
/**
* Partition containing the file group.
* file group id
*/
private final String partitionPath;
/**
* uniquely identifies the file group
*/
private final String id;
private final HoodieFileGroupId fileGroupId;
/**
* Slices of files in this group, sorted with greater commit first.
@@ -59,8 +54,11 @@ public class HoodieFileGroup implements Serializable {
private final Optional<HoodieInstant> lastInstant;
public HoodieFileGroup(String partitionPath, String id, HoodieTimeline timeline) {
this.partitionPath = partitionPath;
this.id = id;
this(new HoodieFileGroupId(partitionPath, id), timeline);
}
public HoodieFileGroup(HoodieFileGroupId fileGroupId, HoodieTimeline timeline) {
this.fileGroupId = fileGroupId;
this.fileSlices = new TreeMap<>(HoodieFileGroup.getReverseCommitTimeComparator());
this.timeline = timeline;
this.lastInstant = timeline.lastInstant();
@@ -72,7 +70,7 @@ public class HoodieFileGroup implements Serializable {
*/
public void addNewFileSliceAtInstant(String baseInstantTime) {
if (!fileSlices.containsKey(baseInstantTime)) {
fileSlices.put(baseInstantTime, new FileSlice(baseInstantTime, id));
fileSlices.put(baseInstantTime, new FileSlice(fileGroupId, baseInstantTime));
}
}
@@ -81,7 +79,7 @@ public class HoodieFileGroup implements Serializable {
*/
public void addDataFile(HoodieDataFile dataFile) {
if (!fileSlices.containsKey(dataFile.getCommitTime())) {
fileSlices.put(dataFile.getCommitTime(), new FileSlice(dataFile.getCommitTime(), id));
fileSlices.put(dataFile.getCommitTime(), new FileSlice(fileGroupId, dataFile.getCommitTime()));
}
fileSlices.get(dataFile.getCommitTime()).setDataFile(dataFile);
}
@@ -91,17 +89,17 @@ public class HoodieFileGroup implements Serializable {
*/
public void addLogFile(HoodieLogFile logFile) {
if (!fileSlices.containsKey(logFile.getBaseCommitTime())) {
fileSlices.put(logFile.getBaseCommitTime(), new FileSlice(logFile.getBaseCommitTime(), id));
fileSlices.put(logFile.getBaseCommitTime(), new FileSlice(fileGroupId, logFile.getBaseCommitTime()));
}
fileSlices.get(logFile.getBaseCommitTime()).addLogFile(logFile);
}
public String getId() {
return id;
/**
 * Returns the partition path component of this file group's id.
 */
public String getPartitionPath() {
return fileGroupId.getPartitionPath();
}
public String getPartitionPath() {
return partitionPath;
/**
 * Returns the file group id this file group was constructed with.
 */
public HoodieFileGroupId getFileGroupId() {
return fileGroupId;
}
/**
@@ -197,7 +195,7 @@ public class HoodieFileGroup implements Serializable {
@Override
public String toString() {
final StringBuilder sb = new StringBuilder("HoodieFileGroup {");
sb.append("id=").append(id);
sb.append("id=").append(fileGroupId);
sb.append(", fileSlices='").append(fileSlices).append('\'');
sb.append('}');
return sb.toString();

View File

@@ -0,0 +1,69 @@
/*
* Copyright (c) 2019 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.model;
import java.io.Serializable;
import java.util.Objects;
/**
* Unique ID to identify a file-group in a data-set
*/
public class HoodieFileGroupId implements Serializable {
private final String partitionPath;
private final String fileId;
public HoodieFileGroupId(String partitionPath, String fileId) {
this.partitionPath = partitionPath;
this.fileId = fileId;
}
public String getPartitionPath() {
return partitionPath;
}
public String getFileId() {
return fileId;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
HoodieFileGroupId that = (HoodieFileGroupId) o;
return Objects.equals(partitionPath, that.partitionPath)
&& Objects.equals(fileId, that.fileId);
}
@Override
public int hashCode() {
return Objects.hash(partitionPath, fileId);
}
@Override
public String toString() {
return "HoodieFileGroupId{"
+ "partitionPath='" + partitionPath + '\''
+ ", fileId='" + fileId + '\''
+ '}';
}
}

View File

@@ -21,6 +21,7 @@ import com.uber.hoodie.common.model.CompactionOperation;
import com.uber.hoodie.common.model.FileSlice;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieFileGroup;
import com.uber.hoodie.common.model.HoodieFileGroupId;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
@@ -70,12 +71,12 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
// mapping from partition paths to file groups contained within them
protected HashMap<String, List<HoodieFileGroup>> partitionToFileGroupsMap;
// mapping from file group id (partition path + file id) to the file group.
protected HashMap<String, HoodieFileGroup> fileGroupMap;
protected HashMap<HoodieFileGroupId, HoodieFileGroup> fileGroupMap;
/**
* File Id to pending compaction instant time
* PartitionPath + File-Id to pending compaction instant time
*/
private final Map<String, Pair<String, CompactionOperation>> fileIdToPendingCompaction;
private final Map<HoodieFileGroupId, Pair<String, CompactionOperation>> fgIdToPendingCompaction;
/**
* Create a file system view, as of the given timeline
@@ -90,7 +91,7 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
// Build file group id (partition path + file id) to Pending Compaction Instants
List<HoodieInstant> pendingCompactionInstants =
metaClient.getActiveTimeline().filterPendingCompactionTimeline().getInstants().collect(Collectors.toList());
this.fileIdToPendingCompaction = ImmutableMap.copyOf(
this.fgIdToPendingCompaction = ImmutableMap.copyOf(
CompactionUtils.getAllPendingCompactionOperations(metaClient).entrySet().stream()
.map(entry -> Pair.of(entry.getKey(), Pair.of(entry.getValue().getKey(),
CompactionOperation.convertFromAvroRecordInstance(entry.getValue().getValue()))))
@@ -123,6 +124,10 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
out.defaultWriteObject();
}
/**
 * Resolves the partition path of a file relative to the base path reported by
 * {@code metaClient}. The directory that directly contains the file is used as the partition
 * directory.
 *
 * @param fileStatus status of the file whose partition path is wanted
 * @return the result of {@code FSUtils.getRelativePartitionPath} for the file's parent directory
 */
private String getPartitionPathFromFileStatus(FileStatus fileStatus) {
  final Path basePath = new Path(metaClient.getBasePath());
  final Path partitionDir = fileStatus.getPath().getParent();
  return FSUtils.getRelativePartitionPath(basePath, partitionDir);
}
/**
* Adds the provided statuses into the file system view, and also caches it inside this object.
*/
@@ -130,9 +135,7 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
Map<Pair<String, String>, List<HoodieDataFile>> dataFiles = convertFileStatusesToDataFiles(
statuses)
.collect(Collectors.groupingBy((dataFile) -> {
String partitionPathStr = FSUtils.getRelativePartitionPath(
new Path(metaClient.getBasePath()),
dataFile.getFileStatus().getPath().getParent());
String partitionPathStr = getPartitionPathFromFileStatus(dataFile.getFileStatus());
return Pair.of(partitionPathStr, dataFile.getFileId());
}));
Map<Pair<String, String>, List<HoodieLogFile>> logFiles = convertFileStatusesToLogFiles(
@@ -157,17 +160,18 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
if (logFiles.containsKey(pair)) {
logFiles.get(pair).forEach(group::addLogFile);
}
if (fileIdToPendingCompaction.containsKey(fileId)) {
HoodieFileGroupId fgId = group.getFileGroupId();
if (fgIdToPendingCompaction.containsKey(fgId)) {
// If there is no delta-commit after compaction request, this step would ensure a new file-slice appears
// so that any new ingestion uses the correct base-instant
group.addNewFileSliceAtInstant(fileIdToPendingCompaction.get(fileId).getKey());
group.addNewFileSliceAtInstant(fgIdToPendingCompaction.get(fgId).getKey());
}
fileGroups.add(group);
});
// add to the cache.
fileGroups.forEach(group -> {
fileGroupMap.put(group.getId(), group);
fileGroupMap.put(group.getFileGroupId(), group);
if (!partitionToFileGroupsMap.containsKey(group.getPartitionPath())) {
partitionToFileGroupsMap.put(group.getPartitionPath(), new ArrayList<>());
}
@@ -198,7 +202,9 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
* @param dataFile Data File
*/
private boolean isDataFileDueToPendingCompaction(HoodieDataFile dataFile) {
Pair<String, CompactionOperation> compactionWithInstantTime = fileIdToPendingCompaction.get(dataFile.getFileId());
final String partitionPath = getPartitionPathFromFileStatus(dataFile.getFileStatus());
HoodieFileGroupId fgId = new HoodieFileGroupId(partitionPath, dataFile.getFileId());
Pair<String, CompactionOperation> compactionWithInstantTime = fgIdToPendingCompaction.get(fgId);
if ((null != compactionWithInstantTime) && (null != compactionWithInstantTime.getLeft())
&& dataFile.getCommitTime().equals(compactionWithInstantTime.getKey())) {
return true;
@@ -210,7 +216,8 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
public Stream<HoodieDataFile> getLatestDataFiles(final String partitionPath) {
return getAllFileGroups(partitionPath)
.map(fileGroup -> {
return fileGroup.getAllDataFiles().filter(df -> !isDataFileDueToPendingCompaction(df)).findFirst();
return fileGroup.getAllDataFiles()
.filter(df -> !isDataFileDueToPendingCompaction(df)).findFirst();
})
.filter(Optional::isPresent)
.map(Optional::get);
@@ -278,7 +285,7 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
.map(HoodieFileGroup::getLatestFileSlice)
.filter(Optional::isPresent)
.map(Optional::get)
.map(this::filterDataFileAfterPendingCompaction);
.map(fs -> filterDataFileAfterPendingCompaction(fs));
}
@Override
@@ -288,7 +295,7 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
FileSlice fileSlice = fileGroup.getLatestFileSlice().get();
// if the file-group is under compaction, pick the latest before compaction instant time.
if (isFileSliceAfterPendingCompaction(fileSlice)) {
String compactionInstantTime = fileIdToPendingCompaction.get(fileSlice.getFileId()).getLeft();
String compactionInstantTime = fgIdToPendingCompaction.get(fileSlice.getFileGroupId()).getLeft();
return fileGroup.getLatestFileSliceBefore(compactionInstantTime);
}
return Optional.of(fileSlice);
@@ -303,7 +310,8 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
* @return
*/
private boolean isFileSliceAfterPendingCompaction(FileSlice fileSlice) {
Pair<String, CompactionOperation> compactionWithInstantTime = fileIdToPendingCompaction.get(fileSlice.getFileId());
Pair<String, CompactionOperation> compactionWithInstantTime =
fgIdToPendingCompaction.get(fileSlice.getFileGroupId());
return (null != compactionWithInstantTime)
&& fileSlice.getBaseInstantTime().equals(compactionWithInstantTime.getKey());
}
@@ -318,7 +326,8 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
if (isFileSliceAfterPendingCompaction(fileSlice)) {
// Data file is filtered out of the file-slice as the corresponding compaction
// instant not completed yet.
FileSlice transformed = new FileSlice(fileSlice.getBaseInstantTime(), fileSlice.getFileId());
FileSlice transformed = new FileSlice(fileSlice.getPartitionPath(),
fileSlice.getBaseInstantTime(), fileSlice.getFileId());
fileSlice.getLogFiles().forEach(transformed::addLogFile);
return transformed;
}
@@ -332,7 +341,7 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
.map(fileGroup -> fileGroup.getLatestFileSliceBeforeOrOn(maxCommitTime))
.filter(Optional::isPresent)
.map(Optional::get)
.map(this::filterDataFileAfterPendingCompaction);
.map(fs -> filterDataFileAfterPendingCompaction(fs));
}
/**
@@ -342,7 +351,8 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
* @param penultimateSlice Penultimate file slice for a file-group in commit timeline order
*/
private static FileSlice mergeCompactionPendingFileSlices(FileSlice lastSlice, FileSlice penultimateSlice) {
FileSlice merged = new FileSlice(penultimateSlice.getBaseInstantTime(), penultimateSlice.getFileId());
FileSlice merged = new FileSlice(penultimateSlice.getPartitionPath(),
penultimateSlice.getBaseInstantTime(), penultimateSlice.getFileId());
if (penultimateSlice.getDataFile().isPresent()) {
merged.setDataFile(penultimateSlice.getDataFile().get());
}
@@ -361,8 +371,9 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
*/
private FileSlice getMergedFileSlice(HoodieFileGroup fileGroup, FileSlice fileSlice) {
// if the file-group is under compaction, pick the latest before compaction instant time.
if (fileIdToPendingCompaction.containsKey(fileSlice.getFileId())) {
String compactionInstantTime = fileIdToPendingCompaction.get(fileSlice.getFileId()).getKey();
HoodieFileGroupId fgId = fileSlice.getFileGroupId();
if (fgIdToPendingCompaction.containsKey(fgId)) {
String compactionInstantTime = fgIdToPendingCompaction.get(fgId).getKey();
if (fileSlice.getBaseInstantTime().equals(compactionInstantTime)) {
Optional<FileSlice> prevFileSlice = fileGroup.getLatestFileSliceBefore(compactionInstantTime);
if (prevFileSlice.isPresent()) {
@@ -426,8 +437,8 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
}
}
public Map<String, Pair<String, CompactionOperation>> getFileIdToPendingCompaction() {
return fileIdToPendingCompaction;
public Map<HoodieFileGroupId, Pair<String, CompactionOperation>> getFgIdToPendingCompaction() {
return fgIdToPendingCompaction;
}
public Stream<HoodieFileGroup> getAllFileGroups() {

View File

@@ -20,6 +20,7 @@ import com.uber.hoodie.avro.model.HoodieCompactionOperation;
import com.uber.hoodie.avro.model.HoodieCompactionPlan;
import com.uber.hoodie.common.model.CompactionOperation;
import com.uber.hoodie.common.model.FileSlice;
import com.uber.hoodie.common.model.HoodieFileGroupId;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
@@ -134,36 +135,38 @@ public class CompactionUtils {
}
/**
* Get all file-ids with pending Compaction operations and their target compaction instant time
* Get all PartitionPath + file-ids with pending Compaction operations and their target compaction instant time
*
* @param metaClient Hoodie Table Meta Client
*/
public static Map<String, Pair<String, HoodieCompactionOperation>> getAllPendingCompactionOperations(
public static Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> getAllPendingCompactionOperations(
HoodieTableMetaClient metaClient) {
List<Pair<HoodieInstant, HoodieCompactionPlan>> pendingCompactionPlanWithInstants =
getAllPendingCompactionPlans(metaClient);
Map<String, Pair<String, HoodieCompactionOperation>> fileIdToPendingCompactionWithInstantMap = new HashMap<>();
Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> fgIdToPendingCompactionWithInstantMap =
new HashMap<>();
pendingCompactionPlanWithInstants.stream().flatMap(instantPlanPair -> {
HoodieInstant instant = instantPlanPair.getKey();
HoodieCompactionPlan compactionPlan = instantPlanPair.getValue();
List<HoodieCompactionOperation> ops = compactionPlan.getOperations();
if (null != ops) {
return ops.stream().map(op -> {
return Pair.of(op.getFileId(), Pair.of(instant.getTimestamp(), op));
return Pair.of(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()),
Pair.of(instant.getTimestamp(), op));
});
} else {
return Stream.empty();
}
}).forEach(pair -> {
// Defensive check to ensure a single file group (partition path + file id) does not have more than one pending compaction
if (fileIdToPendingCompactionWithInstantMap.containsKey(pair.getKey())) {
if (fgIdToPendingCompactionWithInstantMap.containsKey(pair.getKey())) {
String msg = "Hoodie File Id (" + pair.getKey() + ") has more thant 1 pending compactions. Instants: "
+ pair.getValue() + ", " + fileIdToPendingCompactionWithInstantMap.get(pair.getKey());
+ pair.getValue() + ", " + fgIdToPendingCompactionWithInstantMap.get(pair.getKey());
throw new IllegalStateException(msg);
}
fileIdToPendingCompactionWithInstantMap.put(pair.getKey(), pair.getValue());
fgIdToPendingCompactionWithInstantMap.put(pair.getKey(), pair.getValue());
});
return fileIdToPendingCompactionWithInstantMap;
return fgIdToPendingCompactionWithInstantMap;
}
}