
Implement Merge on Read Storage (#76)

1. Create HoodieTable abstraction for commits and fileSystemView
2. HoodieMergeOnReadTable created
3. Views are now always obtained from the table, which returns the correct view for its table type (a sketch of this abstraction follows the commit metadata below)
This commit is contained in:
prazanna
2017-02-21 15:24:00 -08:00
committed by Prasanna Rajaperumal
parent 11d2fd3428
commit eb46e7c72b
47 changed files with 1113 additions and 421 deletions
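The HoodieTable abstraction referenced above is not excerpted in the diffs below, so here is a minimal orientation sketch of the idea in the commit message: a factory hands back a table for the configured type, and views are always obtained through the table. The factory and method names (getHoodieTable, getFileSystemView, HoodieCopyOnWriteTable) are assumptions for illustration, not the actual API introduced by this commit.

import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.exception.HoodieNotSupportedException;

// Hedged sketch of the HoodieTable abstraction; names are assumptions.
public abstract class HoodieTable {
    protected final HoodieTableMetaClient metaClient;

    protected HoodieTable(HoodieTableMetaClient metaClient) {
        this.metaClient = metaClient;
    }

    // Callers always ask the table for its view; each table type supplies
    // the view backed by the timeline appropriate for it.
    public abstract TableFileSystemView getFileSystemView();

    public static HoodieTable getHoodieTable(HoodieTableMetaClient metaClient) {
        switch (metaClient.getTableType()) {
            case COPY_ON_WRITE:
                return new HoodieCopyOnWriteTable(metaClient);
            case MERGE_ON_READ:
                return new HoodieMergeOnReadTable(metaClient);
            default:
                throw new HoodieNotSupportedException(
                    "Unknown table type " + metaClient.getTableType());
        }
    }
}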

View File

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.model;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
/**
* Statistics about a single Hoodie delta log operation.
*/
@JsonIgnoreProperties(ignoreUnknown = true)
public class HoodieDeltaWriteStat extends HoodieWriteStat {
    // Version of the log file this delta write was appended to
    private int logVersion;
    // Byte offset within the log file where this write begins
    private long logOffset;

    public void setLogVersion(int logVersion) {
        this.logVersion = logVersion;
    }

    public int getLogVersion() {
        return logVersion;
    }

    public void setLogOffset(long logOffset) {
        this.logOffset = logOffset;
    }

    public long getLogOffset() {
        return logOffset;
    }
}
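For orientation, a write path might record where an appended batch landed roughly as below. This is a hedged sketch: the logFile accessor and the sizeBeforeAppend variable are illustrative, not part of this commit.

// Sketch: recording delta-append placement (illustrative names)
HoodieDeltaWriteStat stat = new HoodieDeltaWriteStat();
stat.setLogVersion(logFile.getLogVersion());  // which log file version was written to
stat.setLogOffset(sizeBeforeAppend);          // byte offset where the append started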

View File

@@ -39,16 +39,20 @@ import java.util.stream.Stream;
*/
public interface HoodieTimeline extends Serializable {
String COMMIT_ACTION = "commit";
String DELTA_COMMIT_ACTION = "deltacommit";
String CLEAN_ACTION = "clean";
String SAVEPOINT_ACTION = "savepoint";
String COMPACTION_ACTION = "compaction";
String INFLIGHT_EXTENSION = ".inflight";
String COMMIT_EXTENSION = "." + COMMIT_ACTION;
String DELTA_COMMIT_EXTENSION = "." + DELTA_COMMIT_ACTION;
String CLEAN_EXTENSION = "." + CLEAN_ACTION;
String SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION;
String COMPACTION_EXTENSION = "." + COMPACTION_ACTION;
// This preserves backwards compatibility for in-flight commit filenames
String INFLIGHT_COMMIT_EXTENSION = INFLIGHT_EXTENSION;
String INFLIGHT_DELTA_COMMIT_EXTENSION = "." + DELTA_COMMIT_ACTION + INFLIGHT_EXTENSION;
String INFLIGHT_CLEAN_EXTENSION = "." + CLEAN_ACTION + INFLIGHT_EXTENSION;
String INFLIGHT_SAVEPOINT_EXTENSION = "." + SAVEPOINT_ACTION + INFLIGHT_EXTENSION;
String INFLIGHT_COMPACTION_EXTENSION = "." + COMPACTION_ACTION + INFLIGHT_EXTENSION;
@@ -203,6 +207,14 @@ public interface HoodieTimeline extends Serializable {
return commitTime + HoodieTimeline.COMPACTION_EXTENSION;
}
static String makeInflightDeltaFileName(String commitTime) {
return commitTime + HoodieTimeline.INFLIGHT_DELTA_COMMIT_EXTENSION;
}
static String makeDeltaFileName(String commitTime) {
return commitTime + HoodieTimeline.DELTA_COMMIT_EXTENSION;
}
static String getCommitFromCommitFile(String commitFileName) {
return commitFileName.split("\\.")[0];
}
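Worked out against the constants above, the new helpers yield, for a commit time of 20170221152400:

HoodieTimeline.makeDeltaFileName("20170221152400");
// -> "20170221152400.deltacommit"
HoodieTimeline.makeInflightDeltaFileName("20170221152400");
// -> "20170221152400.deltacommit.inflight"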

View File

@@ -18,9 +18,13 @@ package com.uber.hoodie.common.table;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;
/**
@@ -29,6 +33,7 @@ import java.util.stream.Stream;
* <p>
* ReadOptimizedView - Lets queries run only on organized columnar data files at the expense of latency
* WriteOptimizedView - Lets queries run on columnar data as well as delta files (sequential) at the expense of query execution time
*
* @since 0.3.0
*/
public interface TableFileSystemView {
@@ -90,4 +95,14 @@ public interface TableFileSystemView {
* @return
*/
Stream<HoodieDataFile> getLatestVersions(FileStatus[] fileStatuses);
/**
* Groups the latest data file in each file group with its corresponding delta (log) files
* @param partitionPath partition to scan
* @return map from each latest data file to its list of log files
* @throws IOException
*/
Map<HoodieDataFile, List<HoodieLogFile>> groupLatestDataFileWithLogFiles(String partitionPath) throws IOException;
}
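On the caller side, the new method pairs each latest base file with its delta files. A minimal usage sketch, with the view construction and partition path assumed:

// Sketch: consuming the new grouping API on a MERGE_ON_READ table
Map<HoodieDataFile, List<HoodieLogFile>> grouped =
    view.groupLatestDataFileWithLogFiles("2017/02/21");
grouped.forEach((dataFile, logFiles) ->
    // each latest columnar file plus the log files to merge at read time
    System.out.println(dataFile.getFileId() + " -> " + logFiles.size() + " log files"));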

View File

@@ -20,6 +20,7 @@ import com.uber.hoodie.common.table.log.avro.AvroLogAppender;
import com.uber.hoodie.common.table.log.avro.RollingAvroLogAppender;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
/**
@@ -36,7 +37,7 @@ public interface HoodieLogAppender<R> {
* @param records
* @throws IOException
*/
void append(List<R> records) throws IOException, InterruptedException;
void append(Iterator<R> records) throws IOException, InterruptedException;
/**
* Syncs the log manually if auto-flush is not set in HoodieLogAppendConfig. If auto-flush is set
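The signature change from List to Iterator lets callers stream records into the log without materializing them first. A sketch, with the appender and record source assumed to be set up elsewhere:

// Records can now be appended lazily from any Iterator source
Iterator<IndexedRecord> records = recordBuffer.iterator();
appender.append(records);  // may throw IOException / InterruptedException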

View File

@@ -37,6 +37,7 @@ import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
/**
@@ -99,8 +100,8 @@ public class AvroLogAppender implements HoodieLogAppender<IndexedRecord> {
}
}
public void append(List<IndexedRecord> records) throws IOException {
records.forEach(r -> {
public void append(Iterator<IndexedRecord> records) throws IOException {
records.forEachRemaining(r -> {
try {
writer.append(r);
} catch (IOException e) {

View File

@@ -25,6 +25,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
/**
@@ -64,8 +65,8 @@ public class RollingAvroLogAppender implements HoodieLogAppender<IndexedRecord>
return logWriter.getCurrentSize();
}
public void append(List<IndexedRecord> records) throws IOException, InterruptedException {
LOG.info("Appending " + records.size() + " records to " + config.getLogFile());
public void append(Iterator<IndexedRecord> records) throws IOException, InterruptedException {
LOG.info("Appending records to " + config.getLogFile());
rollOverIfNeeded();
Preconditions.checkArgument(logWriter != null);
logWriter.append(records);

View File

@@ -16,6 +16,7 @@
package com.uber.hoodie.common.table.timeline;
import com.google.common.collect.Sets;
import com.google.common.io.Closeables;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
@@ -85,8 +86,9 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
public HoodieActiveTimeline(FileSystem fs, String metaPath) {
this(fs, metaPath,
new String[] {COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, SAVEPOINT_EXTENSION, COMPACTION_EXTENSION,
INFLIGHT_SAVEPOINT_EXTENSION, CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION, COMPACTION_EXTENSION});
new String[] {COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, DELTA_COMMIT_EXTENSION,
INFLIGHT_DELTA_COMMIT_EXTENSION, COMPACTION_EXTENSION, INFLIGHT_SAVEPOINT_EXTENSION,
CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION, COMPACTION_EXTENSION});
}
/**
@@ -113,7 +115,16 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
* @return
*/
public HoodieTimeline getCommitTimeline() {
return new HoodieDefaultTimeline(filterInstantsByAction(COMMIT_ACTION),
return getTimelineOfActions(Sets.newHashSet(COMMIT_ACTION, COMPACTION_ACTION));
}
/**
* Get only the delta commits (inflight and completed) in the active timeline
*
* @return
*/
public HoodieTimeline getDeltaCommitTimeline() {
return new HoodieDefaultTimeline(filterInstantsByAction(DELTA_COMMIT_ACTION),
(Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails);
}
@@ -138,6 +149,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
(Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails);
}
/**
* Get only the cleaner action (inflight and completed) in the active timeline
*
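With this split, callers can address the two instant streams separately. A brief sketch of the distinction (variable names are illustrative):

HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
// commit + compaction instants (the merged timeline built above)
HoodieTimeline commits = timeline.getCommitTimeline();
// deltacommit instants only
HoodieTimeline deltaCommits = timeline.getDeltaCommitTimeline();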

View File

@@ -97,6 +97,10 @@ public class HoodieInstant implements Serializable {
return isInflight ?
HoodieTimeline.makeInflightCompactionFileName(timestamp) :
HoodieTimeline.makeCompactionFileName(timestamp);
} else if (HoodieTimeline.DELTA_COMMIT_ACTION.equals(action)) {
return isInflight ?
HoodieTimeline.makeInflightDeltaFileName(timestamp) :
HoodieTimeline.makeDeltaFileName(timestamp);
}
throw new IllegalArgumentException("Cannot get file name for unknown action " + action);
}

View File

@@ -16,12 +16,16 @@
package com.uber.hoodie.common.table.view;
import com.google.common.collect.Maps;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIOException;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileStatus;
@@ -47,19 +51,18 @@ import java.util.stream.Stream;
* listDataFilesInPartition which includes files to be included in the view
*
* @see TableFileSystemView
* @see ReadOptimizedTableView
* @since 0.3.0
*/
public abstract class AbstractTableFileSystemView implements TableFileSystemView, Serializable {
public class HoodieTableFileSystemView implements TableFileSystemView, Serializable {
protected HoodieTableMetaClient metaClient;
protected transient FileSystem fs;
// These are the commits that will be visible for all views extending this view
protected HoodieTimeline visibleActiveCommitTimeline;
public AbstractTableFileSystemView(FileSystem fs, HoodieTableMetaClient metaClient,
public HoodieTableFileSystemView(HoodieTableMetaClient metaClient,
HoodieTimeline visibleActiveCommitTimeline) {
this.metaClient = metaClient;
this.fs = fs;
this.fs = metaClient.getFs();
this.visibleActiveCommitTimeline = visibleActiveCommitTimeline;
}
@@ -183,6 +186,37 @@ public abstract class AbstractTableFileSystemView implements TableFileSystemView
}
}
public Map<HoodieDataFile, List<HoodieLogFile>> groupLatestDataFileWithLogFiles(
String partitionPath) throws IOException {
if (metaClient.getTableType() != HoodieTableType.MERGE_ON_READ) {
throw new HoodieException("Unsupported table type: " + metaClient.getTableType());
}
// All the files in the partition
FileStatus[] files = fs.listStatus(new Path(metaClient.getBasePath(), partitionPath));
// All the log files filtered from the above list, sorted by version numbers
List<HoodieLogFile> allLogFiles = Arrays.stream(files).filter(s -> s.getPath().getName()
.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()))
.map(HoodieLogFile::new).collect(Collectors.collectingAndThen(Collectors.toList(),
l -> l.stream().sorted(HoodieLogFile.getLogVersionComparator())
.collect(Collectors.toList())));
// Filter the delta files by the commit time of the latest base file and collect as a list
Optional<HoodieInstant> lastTimestamp = metaClient.getActiveTimeline().lastInstant();
if (!lastTimestamp.isPresent()) {
return Maps.newHashMap();
}
return getLatestVersionInPartition(partitionPath, lastTimestamp.get().getTimestamp()).map(
hoodieDataFile -> Pair.of(hoodieDataFile, allLogFiles.stream().filter(
s -> s.getFileId().equals(hoodieDataFile.getFileId()) && s.getBaseCommitTime()
.equals(hoodieDataFile.getCommitTime())).collect(Collectors.toList()))).collect(
Collectors.toMap(
(Function<Pair<HoodieDataFile, List<HoodieLogFile>>, HoodieDataFile>) Pair::getKey,
(Function<Pair<HoodieDataFile, List<HoodieLogFile>>, List<HoodieLogFile>>) Pair::getRight));
}
protected Stream<List<HoodieDataFile>> getFilesByFileId(FileStatus[] files,
String maxCommitTime) throws IOException {
return groupFilesByFileId(files, maxCommitTime).values().stream();
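Restated, the filter above keeps a log file only when both its fileId and its base commit time match the latest data file, which is what ties a delta file to exactly one base file. A sketch of the predicate:

// The matching rule applied to each candidate log file (restated)
boolean belongsToSlice(HoodieLogFile log, HoodieDataFile base) {
    return log.getFileId().equals(base.getFileId())
        && log.getBaseCommitTime().equals(base.getCommitTime());
}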

View File

@@ -1,37 +0,0 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.table.view;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.exception.HoodieIOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
/**
* ReadOptimized view which includes only the ROStorageformat files
*/
public class ReadOptimizedTableView extends AbstractTableFileSystemView {
public ReadOptimizedTableView(FileSystem fs, HoodieTableMetaClient metaClient) {
// Get the active timeline and filter only completed commits
super(fs, metaClient,
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants());
}
}

View File

@@ -1,80 +0,0 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.table.view;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.log.HoodieLogFile;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* Realtime Table View which includes both ROStorageformat files and RTStorageFormat files
*/
public class RealtimeTableView extends AbstractTableFileSystemView {
public RealtimeTableView(FileSystem fs, HoodieTableMetaClient metaClient) {
// For realtime table view, visibleActiveCommitTimeline is a merged timeline of all commits and compactions
super(fs, metaClient, metaClient.getActiveTimeline().getTimelineOfActions(
Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,
HoodieActiveTimeline.COMPACTION_ACTION)).filterCompletedInstants());
Preconditions.checkArgument(metaClient.getTableType() == HoodieTableType.MERGE_ON_READ,
"Realtime view can only be constructed on Hoodie tables with MERGE_ON_READ storage type");
}
public Map<HoodieDataFile, List<HoodieLogFile>> groupLatestDataFileWithLogFiles(FileSystem fs,
String partitionPath) throws IOException {
// All the files in the partition
FileStatus[] files = fs.listStatus(new Path(metaClient.getBasePath(), partitionPath));
// All the log files filtered from the above list, sorted by version numbers
List<HoodieLogFile> allLogFiles = Arrays.stream(files).filter(s -> s.getPath().getName()
.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()))
.map(HoodieLogFile::new).collect(Collectors.collectingAndThen(Collectors.toList(),
l -> l.stream().sorted(HoodieLogFile.getLogVersionComparator())
.collect(Collectors.toList())));
// Filter the delta files by the commit time of the latest base file and collect as a list
Optional<HoodieInstant> lastTimestamp = metaClient.getActiveTimeline().lastInstant();
if(!lastTimestamp.isPresent()) {
return Maps.newHashMap();
}
return getLatestVersionInPartition(partitionPath, lastTimestamp.get().getTimestamp()).map(
hoodieDataFile -> Pair.of(hoodieDataFile, allLogFiles.stream().filter(
s -> s.getFileId().equals(hoodieDataFile.getFileId()) && s.getBaseCommitTime()
.equals(hoodieDataFile.getCommitTime())).collect(Collectors.toList()))).collect(
Collectors.toMap(
(Function<Pair<HoodieDataFile, List<HoodieLogFile>>, HoodieDataFile>) Pair::getKey,
(Function<Pair<HoodieDataFile, List<HoodieLogFile>>, List<HoodieLogFile>>) Pair::getRight));
}
}

View File

@@ -0,0 +1,23 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.exception;
/**
 * Exception thrown when a requested operation is not supported.
 */
public class HoodieNotSupportedException extends HoodieException {
    public HoodieNotSupportedException(String errorMsg) {
        super(errorMsg);
    }
}
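A plausible call site, given the rest of this commit (the guarded operation is an assumption, not taken from the diff):

// Hypothetical usage: rejecting an operation a table type cannot handle yet
if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) {
    throw new HoodieNotSupportedException(
        "Operation not supported yet for MERGE_ON_READ tables");
}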