1
0

Fixes needed to run merge-on-read testing on production scale data

This commit is contained in:
Prasanna Rajaperumal
2017-03-31 01:02:02 -07:00
committed by prazanna
parent 57ab7a2405
commit aee136777b
26 changed files with 659 additions and 199 deletions

View File

@@ -0,0 +1,108 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.model;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.uber.hoodie.common.util.FSUtils;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
@JsonIgnoreProperties(ignoreUnknown = true)
public class CompactionWriteStat implements Serializable {

  /** Write statistics of the base file produced by this compaction. */
  private final HoodieWriteStat writeStat;
  // Made final for consistency with the other fields — this is an immutable value class.
  private final String partitionPath;
  /** Total number of records read from the log files that were compacted. */
  private final long totalLogRecords;
  /** Total number of log files merged by this compaction. */
  private final long totalLogFiles;
  /** Total number of records that required an update during compaction. */
  private final long totalRecordsToBeUpdate;

  /**
   * Captures per-partition statistics for a single compaction run.
   *
   * @param writeStat            write stats of the compacted base file
   * @param partitionPath        partition the compaction ran against
   * @param totalLogFiles        number of log files merged
   * @param totalLogRecords      number of records read from the log files
   * @param totalRecordsToUpdate number of records that required updating
   */
  public CompactionWriteStat(HoodieWriteStat writeStat, String partitionPath, long totalLogFiles, long totalLogRecords,
      long totalRecordsToUpdate) {
    this.writeStat = writeStat;
    this.partitionPath = partitionPath;
    this.totalLogFiles = totalLogFiles;
    this.totalLogRecords = totalLogRecords;
    this.totalRecordsToBeUpdate = totalRecordsToUpdate;
  }

  public long getTotalLogRecords() {
    return totalLogRecords;
  }

  public long getTotalLogFiles() {
    return totalLogFiles;
  }

  public long getTotalRecordsToBeUpdate() {
    return totalRecordsToBeUpdate;
  }

  public HoodieWriteStat getHoodieWriteStat() {
    return writeStat;
  }

  public String getPartitionPath() {
    return partitionPath;
  }

  /** @return a fresh builder for fluent construction. */
  public static Builder newBuilder() {
    return new Builder();
  }

  /** Fluent builder for {@link CompactionWriteStat}. */
  public static class Builder {
    private HoodieWriteStat writeStat;
    private long totalLogRecords;
    private long totalRecordsToUpdate;
    private long totalLogFiles;
    private String partitionPath;

    public Builder withHoodieWriteStat(HoodieWriteStat writeStat) {
      this.writeStat = writeStat;
      return this;
    }

    public Builder setTotalLogRecords(long records) {
      this.totalLogRecords = records;
      return this;
    }

    public Builder setTotalLogFiles(long totalLogFiles) {
      this.totalLogFiles = totalLogFiles;
      return this;
    }

    public Builder setTotalRecordsToUpdate(long records) {
      this.totalRecordsToUpdate = records;
      return this;
    }

    public Builder onPartition(String path) {
      this.partitionPath = path;
      return this;
    }

    public CompactionWriteStat build() {
      return new CompactionWriteStat(writeStat, partitionPath, totalLogFiles, totalLogRecords,
          totalRecordsToUpdate);
    }
  }
}

View File

@@ -18,10 +18,12 @@ package com.uber.hoodie.common.model;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.databind.DeserializationFeature;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.codehaus.jackson.annotate.JsonAutoDetect;
import org.codehaus.jackson.annotate.JsonMethod;
import org.codehaus.jackson.map.DeserializationConfig.Feature;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
@@ -38,7 +40,7 @@ import java.util.Map;
@JsonIgnoreProperties(ignoreUnknown = true)
public class HoodieCommitMetadata implements Serializable {
private static volatile Logger log = LogManager.getLogger(HoodieCommitMetadata.class);
private HashMap<String, List<HoodieWriteStat>> partitionToWriteStats;
protected HashMap<String, List<HoodieWriteStat>> partitionToWriteStats;
private HashMap<String, String> extraMetadataMap;
@@ -98,6 +100,7 @@ public class HoodieCommitMetadata implements Serializable {
return new HoodieCommitMetadata();
}
ObjectMapper mapper = new ObjectMapper();
mapper.configure(Feature.FAIL_ON_UNKNOWN_PROPERTIES, false);
mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY);
return mapper.readValue(jsonStr, HoodieCommitMetadata.class);
}

View File

@@ -0,0 +1,87 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.common.model;
import com.google.common.collect.Maps;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BinaryOperator;
import java.util.function.Supplier;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.codehaus.jackson.annotate.JsonAutoDetect;
import org.codehaus.jackson.annotate.JsonMethod;
import org.codehaus.jackson.map.DeserializationConfig.Feature;
import org.codehaus.jackson.map.ObjectMapper;
/**
* Placeholder for the compaction-specific metadata; reuses all the details of a normal HoodieCommitMetadata
*/
public class HoodieCompactionMetadata extends HoodieCommitMetadata {

  private static volatile Logger log = LogManager.getLogger(HoodieCompactionMetadata.class);

  /** Compaction write stats grouped by partition path, in addition to the base commit stats. */
  protected HashMap<String, List<CompactionWriteStat>> partitionToCompactionWriteStats;

  public HoodieCompactionMetadata() {
    partitionToCompactionWriteStats = new HashMap<>();
  }

  /**
   * Records a compaction write stat against its partition, and also records the wrapped
   * {@link HoodieWriteStat} in the parent commit metadata.
   */
  public void addWriteStat(String partitionPath, CompactionWriteStat stat) {
    addWriteStat(partitionPath, stat.getHoodieWriteStat());
    // computeIfAbsent replaces the containsKey/put dance and avoids a double lookup.
    partitionToCompactionWriteStats.computeIfAbsent(partitionPath, k -> new ArrayList<>()).add(stat);
  }

  /**
   * @return the stats recorded for the given partition, or {@code null} if none were recorded.
   *         NOTE(review): callers must null-check; kept as-is for backward compatibility.
   */
  public List<CompactionWriteStat> getCompactionWriteStats(String partitionPath) {
    return partitionToCompactionWriteStats.get(partitionPath);
  }

  public Map<String, List<CompactionWriteStat>> getPartitionToCompactionWriteStats() {
    return partitionToCompactionWriteStats;
  }

  /**
   * Serializes this metadata to pretty-printed JSON. Stats recorded under a null partition
   * path are dropped (they cannot be represented as a JSON key).
   */
  public String toJsonString() throws IOException {
    if (partitionToCompactionWriteStats.containsKey(null)) {
      // WARN, not INFO: we are silently discarding stats here, which deserves attention.
      log.warn("partition path is null for " + partitionToCompactionWriteStats.get(null));
      partitionToCompactionWriteStats.remove(null);
    }
    ObjectMapper mapper = new ObjectMapper();
    mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY);
    return mapper.defaultPrettyPrintingWriter().writeValueAsString(this);
  }

  /**
   * Deserializes metadata from its JSON form.
   *
   * @return an empty instance when the input is null/empty (empty commit file, or something
   *         bad happened while writing it)
   */
  public static HoodieCompactionMetadata fromJsonString(String jsonStr) throws IOException {
    if (jsonStr == null || jsonStr.isEmpty()) {
      return new HoodieCompactionMetadata();
    }
    ObjectMapper mapper = new ObjectMapper();
    // Be lenient to unknown fields so older/newer writers remain readable.
    mapper.configure(Feature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY);
    return mapper.readValue(jsonStr, HoodieCompactionMetadata.class);
  }

  public static HoodieCompactionMetadata fromBytes(byte[] bytes) throws IOException {
    // StandardCharsets avoids the Charset lookup and a possible typo in the charset name.
    return fromJsonString(new String(bytes, StandardCharsets.UTF_8));
  }
}

View File

@@ -25,6 +25,7 @@ import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.mapred.FsInput;
import org.apache.hadoop.fs.AvroFSInput;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileContext;
@@ -67,7 +68,7 @@ public class AvroLogAppender implements HoodieLogAppender<IndexedRecord> {
//TODO - check for log corruption and roll over if needed
log.info(config.getLogFile() + " exists. Appending to existing file");
// this log path exists, we will append to it
fs = FileSystem.get(fs.getConf());
// fs = FileSystem.get(fs.getConf());
try {
this.output = fs.append(path, config.getBufferSize());
} catch (RemoteException e) {
@@ -85,8 +86,9 @@ public class AvroLogAppender implements HoodieLogAppender<IndexedRecord> {
}
}
}
this.writer
.appendTo(new AvroFSInput(FileContext.getFileContext(fs.getConf()), path), output);
.appendTo(new FsInput(path, fs.getConf()), output);
// we always want to flush to disk every time an Avro block is written
this.writer.setFlushOnEveryBlock(true);
} else {

View File

@@ -16,7 +16,10 @@
package com.uber.hoodie.common.table.view;
import static java.util.stream.Collectors.toList;
import com.google.common.collect.Maps;
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
@@ -27,6 +30,7 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIOException;
import java.util.function.BinaryOperator;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -199,23 +203,18 @@ public class HoodieTableFileSystemView implements TableFileSystemView, Serializa
// All the log files filtered from the above list, sorted by version numbers
List<HoodieLogFile> allLogFiles = Arrays.stream(files).filter(s -> s.getPath().getName()
.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()))
.map(HoodieLogFile::new).collect(Collectors.collectingAndThen(Collectors.toList(),
.map(HoodieLogFile::new).collect(Collectors.collectingAndThen(toList(),
l -> l.stream().sorted(HoodieLogFile.getLogVersionComparator())
.collect(Collectors.toList())));
.collect(toList())));
// Filter the delta files by the commit time of the latest base file and collect as a list
Optional<HoodieInstant> lastTimestamp = metaClient.getActiveTimeline().lastInstant();
if (!lastTimestamp.isPresent()) {
return Maps.newHashMap();
}
return getLatestVersionInPartition(partitionPath, lastTimestamp.get().getTimestamp()).map(
return lastTimestamp.map(hoodieInstant -> getLatestVersionInPartition(partitionPath,
hoodieInstant.getTimestamp()).map(
hoodieDataFile -> Pair.of(hoodieDataFile, allLogFiles.stream().filter(
s -> s.getFileId().equals(hoodieDataFile.getFileId()) && s.getBaseCommitTime()
.equals(hoodieDataFile.getCommitTime())).collect(Collectors.toList()))).collect(
Collectors.toMap(
(Function<Pair<HoodieDataFile, List<HoodieLogFile>>, HoodieDataFile>) Pair::getKey,
(Function<Pair<HoodieDataFile, List<HoodieLogFile>>, List<HoodieLogFile>>) Pair::getRight));
Collectors.toMap(Pair::getKey, Pair::getRight))).orElseGet(Maps::newHashMap);
}
@@ -248,9 +247,9 @@ public class HoodieTableFileSystemView implements TableFileSystemView, Serializa
}
private Collector<HoodieDataFile, ?, List<HoodieDataFile>> toSortedFileStatus() {
return Collectors.collectingAndThen(Collectors.toList(),
return Collectors.collectingAndThen(toList(),
l -> l.stream().sorted(HoodieDataFile.getCommitTimeComparator())
.collect(Collectors.toList()));
.collect(toList()));
}

View File

@@ -47,6 +47,7 @@ import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.Decoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.mapred.FsInput;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.specific.SpecificRecordBase;
@@ -67,33 +68,39 @@ public class AvroUtils {
public static List<HoodieRecord<HoodieAvroPayload>> loadFromFiles(FileSystem fs,
List<String> deltaFilePaths, Schema expectedSchema) {
List<HoodieRecord<HoodieAvroPayload>> loadedRecords = Lists.newArrayList();
deltaFilePaths.forEach(s -> {
Path path = new Path(s);
try {
SeekableInput input =
new AvroFSInput(FileContext.getFileContext(fs.getConf()), path);
GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>();
// Set the expected schema to be the current schema to account for schema evolution
reader.setExpected(expectedSchema);
FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
for (GenericRecord deltaRecord : fileReader) {
String key = deltaRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
String partitionPath =
deltaRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
loadedRecords.add(new HoodieRecord<>(new HoodieKey(key, partitionPath),
new HoodieAvroPayload(Optional.of(deltaRecord))));
}
fileReader.close(); // also closes underlying FsInput
} catch (IOException e) {
throw new HoodieIOException("Could not read avro records from path " + s, e);
}
List<HoodieRecord<HoodieAvroPayload>> records = loadFromFile(fs, s, expectedSchema);
loadedRecords.addAll(records);
});
return loadedRecords;
}
/**
 * Loads all records from a single Avro delta file into HoodieRecords keyed by the
 * record-key / partition-path metadata fields embedded in each record.
 *
 * @param fs             filesystem to read from
 * @param deltaFilePath  path of the Avro delta file
 * @param expectedSchema schema to project each record onto (accounts for schema evolution)
 * @return the loaded records (empty list for an empty file)
 * @throws HoodieIOException if the file cannot be read
 */
public static List<HoodieRecord<HoodieAvroPayload>> loadFromFile(FileSystem fs,
    String deltaFilePath, Schema expectedSchema) {
  List<HoodieRecord<HoodieAvroPayload>> loadedRecords = Lists.newArrayList();
  Path path = new Path(deltaFilePath);
  try {
    SeekableInput input = new FsInput(path, fs.getConf());
    GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>();
    // Set the expected schema to be the current schema to account for schema evolution
    reader.setExpected(expectedSchema);
    // try-with-resources guarantees the reader (and its underlying FsInput) is closed even
    // when iteration throws mid-file — the original leaked the handle on that path.
    try (FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader)) {
      for (GenericRecord deltaRecord : fileReader) {
        String key = deltaRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
        String partitionPath =
            deltaRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
        loadedRecords.add(new HoodieRecord<>(new HoodieKey(key, partitionPath),
            new HoodieAvroPayload(Optional.of(deltaRecord))));
      }
    }
  } catch (IOException e) {
    throw new HoodieIOException("Could not read avro records from path " + deltaFilePath,
        e);
  }
  return loadedRecords;
}
public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime,
Optional<Long> durationInMs, List<HoodieCleanStat> cleanStats) {

View File

@@ -210,6 +210,11 @@ public class FSUtils {
return String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version);
}
/**
 * Builds a glob mask matching every version of a log file for the given file id and
 * base commit time, e.g. {@code <fileId>_<commitTime><ext>*}.
 */
public static String maskWithoutLogVersion(String commitTime, String fileId, String logFileExtension) {
  // Plain concatenation — equivalent to String.format("%s_%s%s*", fileId, commitTime, ext).
  return fileId + "_" + commitTime + logFileExtension + "*";
}
/**
* Get the latest log file written from the list of log files passed in
*