Fixes needed to run merge-on-read testing on production scale data
This commit is contained in:
committed by
prazanna
parent
57ab7a2405
commit
aee136777b
@@ -0,0 +1,108 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.common.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
@JsonIgnoreProperties(ignoreUnknown = true)
|
||||
public class CompactionWriteStat implements Serializable {
|
||||
|
||||
private final HoodieWriteStat writeStat;
|
||||
private String partitionPath;
|
||||
private final long totalLogRecords;
|
||||
private final long totalLogFiles;
|
||||
private final long totalRecordsToBeUpdate;
|
||||
|
||||
public CompactionWriteStat(HoodieWriteStat writeStat, String partitionPath, long totalLogFiles, long totalLogRecords,
|
||||
long totalRecordsToUpdate) {
|
||||
this.writeStat = writeStat;
|
||||
this.partitionPath = partitionPath;
|
||||
this.totalLogFiles = totalLogFiles;
|
||||
this.totalLogRecords = totalLogRecords;
|
||||
this.totalRecordsToBeUpdate = totalRecordsToUpdate;
|
||||
}
|
||||
|
||||
public long getTotalLogRecords() {
|
||||
return totalLogRecords;
|
||||
}
|
||||
|
||||
public long getTotalLogFiles() {
|
||||
return totalLogFiles;
|
||||
}
|
||||
|
||||
public long getTotalRecordsToBeUpdate() {
|
||||
return totalRecordsToBeUpdate;
|
||||
}
|
||||
public HoodieWriteStat getHoodieWriteStat() {
|
||||
return writeStat;
|
||||
}
|
||||
|
||||
public String getPartitionPath() {
|
||||
return partitionPath;
|
||||
}
|
||||
|
||||
public static Builder newBuilder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private HoodieWriteStat writeStat;
|
||||
private long totalLogRecords;
|
||||
private long totalRecordsToUpdate;
|
||||
private long totalLogFiles;
|
||||
private String partitionPath;
|
||||
|
||||
|
||||
public Builder withHoodieWriteStat(HoodieWriteStat writeStat) {
|
||||
this.writeStat = writeStat;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setTotalLogRecords(long records) {
|
||||
this.totalLogRecords = records;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setTotalLogFiles(long totalLogFiles) {
|
||||
this.totalLogFiles = totalLogFiles;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder setTotalRecordsToUpdate(long records) {
|
||||
this.totalRecordsToUpdate = records;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder onPartition(String path) {
|
||||
this.partitionPath = path;
|
||||
return this;
|
||||
}
|
||||
|
||||
public CompactionWriteStat build() {
|
||||
return new CompactionWriteStat(writeStat, partitionPath, totalLogFiles, totalLogRecords,
|
||||
totalRecordsToUpdate);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -18,10 +18,12 @@ package com.uber.hoodie.common.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.codehaus.jackson.annotate.JsonAutoDetect;
|
||||
import org.codehaus.jackson.annotate.JsonMethod;
|
||||
import org.codehaus.jackson.map.DeserializationConfig.Feature;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
@@ -38,7 +40,7 @@ import java.util.Map;
|
||||
@JsonIgnoreProperties(ignoreUnknown = true)
|
||||
public class HoodieCommitMetadata implements Serializable {
|
||||
private static volatile Logger log = LogManager.getLogger(HoodieCommitMetadata.class);
|
||||
private HashMap<String, List<HoodieWriteStat>> partitionToWriteStats;
|
||||
protected HashMap<String, List<HoodieWriteStat>> partitionToWriteStats;
|
||||
|
||||
private HashMap<String, String> extraMetadataMap;
|
||||
|
||||
@@ -98,6 +100,7 @@ public class HoodieCommitMetadata implements Serializable {
|
||||
return new HoodieCommitMetadata();
|
||||
}
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(Feature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY);
|
||||
return mapper.readValue(jsonStr, HoodieCommitMetadata.class);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.common.model;
|
||||
|
||||
import com.google.common.collect.Maps;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BinaryOperator;
import java.util.function.Supplier;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.codehaus.jackson.annotate.JsonAutoDetect;
import org.codehaus.jackson.annotate.JsonMethod;
import org.codehaus.jackson.map.DeserializationConfig.Feature;
import org.codehaus.jackson.map.ObjectMapper;
||||
|
||||
/**
|
||||
* Place holder for the compaction specific meta-data, uses all the details used in a normal HoodieCommitMetadata
|
||||
*/
|
||||
public class HoodieCompactionMetadata extends HoodieCommitMetadata {
|
||||
private static volatile Logger log = LogManager.getLogger(HoodieCompactionMetadata.class);
|
||||
protected HashMap<String, List<CompactionWriteStat>> partitionToCompactionWriteStats;
|
||||
|
||||
public HoodieCompactionMetadata() {
|
||||
partitionToCompactionWriteStats = new HashMap<>();
|
||||
}
|
||||
|
||||
public void addWriteStat(String partitionPath, CompactionWriteStat stat) {
|
||||
addWriteStat(partitionPath, stat.getHoodieWriteStat());
|
||||
if (!partitionToCompactionWriteStats.containsKey(partitionPath)) {
|
||||
partitionToCompactionWriteStats.put(partitionPath, new ArrayList<>());
|
||||
}
|
||||
partitionToCompactionWriteStats.get(partitionPath).add(stat);
|
||||
}
|
||||
|
||||
public List<CompactionWriteStat> getCompactionWriteStats(String partitionPath) {
|
||||
return partitionToCompactionWriteStats.get(partitionPath);
|
||||
}
|
||||
|
||||
public Map<String, List<CompactionWriteStat>> getPartitionToCompactionWriteStats() {
|
||||
return partitionToCompactionWriteStats;
|
||||
}
|
||||
|
||||
public String toJsonString() throws IOException {
|
||||
if(partitionToCompactionWriteStats.containsKey(null)) {
|
||||
log.info("partition path is null for " + partitionToCompactionWriteStats.get(null));
|
||||
partitionToCompactionWriteStats.remove(null);
|
||||
}
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY);
|
||||
return mapper.defaultPrettyPrintingWriter().writeValueAsString(this);
|
||||
}
|
||||
|
||||
public static HoodieCompactionMetadata fromJsonString(String jsonStr) throws IOException {
|
||||
if (jsonStr == null || jsonStr.isEmpty()) {
|
||||
// For empty commit file (no data or somethings bad happen).
|
||||
return new HoodieCompactionMetadata();
|
||||
}
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
mapper.configure(Feature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
mapper.setVisibility(JsonMethod.FIELD, JsonAutoDetect.Visibility.ANY);
|
||||
return mapper.readValue(jsonStr, HoodieCompactionMetadata.class);
|
||||
}
|
||||
|
||||
public static HoodieCompactionMetadata fromBytes(byte[] bytes) throws IOException {
|
||||
return fromJsonString(new String(bytes, Charset.forName("utf-8")));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -25,6 +25,7 @@ import org.apache.avro.file.DataFileWriter;
|
||||
import org.apache.avro.generic.GenericDatumWriter;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.avro.mapred.FsInput;
|
||||
import org.apache.hadoop.fs.AvroFSInput;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileContext;
|
||||
@@ -67,7 +68,7 @@ public class AvroLogAppender implements HoodieLogAppender<IndexedRecord> {
|
||||
//TODO - check for log corruption and roll over if needed
|
||||
log.info(config.getLogFile() + " exists. Appending to existing file");
|
||||
// this log path exists, we will append to it
|
||||
fs = FileSystem.get(fs.getConf());
|
||||
// fs = FileSystem.get(fs.getConf());
|
||||
try {
|
||||
this.output = fs.append(path, config.getBufferSize());
|
||||
} catch (RemoteException e) {
|
||||
@@ -85,8 +86,9 @@ public class AvroLogAppender implements HoodieLogAppender<IndexedRecord> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.writer
|
||||
.appendTo(new AvroFSInput(FileContext.getFileContext(fs.getConf()), path), output);
|
||||
.appendTo(new FsInput(path, fs.getConf()), output);
|
||||
// we always want to flush to disk every time an Avro block is written
|
||||
this.writer.setFlushOnEveryBlock(true);
|
||||
} else {
|
||||
|
||||
@@ -16,7 +16,10 @@
|
||||
|
||||
package com.uber.hoodie.common.table.view;
|
||||
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
import com.uber.hoodie.common.model.HoodieCompactionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.model.HoodieTableType;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
@@ -27,6 +30,7 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.exception.HoodieException;
|
||||
import com.uber.hoodie.exception.HoodieIOException;
|
||||
import java.util.function.BinaryOperator;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
@@ -199,23 +203,18 @@ public class HoodieTableFileSystemView implements TableFileSystemView, Serializa
|
||||
// All the log files filtered from the above list, sorted by version numbers
|
||||
List<HoodieLogFile> allLogFiles = Arrays.stream(files).filter(s -> s.getPath().getName()
|
||||
.contains(metaClient.getTableConfig().getRTFileFormat().getFileExtension()))
|
||||
.map(HoodieLogFile::new).collect(Collectors.collectingAndThen(Collectors.toList(),
|
||||
.map(HoodieLogFile::new).collect(Collectors.collectingAndThen(toList(),
|
||||
l -> l.stream().sorted(HoodieLogFile.getLogVersionComparator())
|
||||
.collect(Collectors.toList())));
|
||||
.collect(toList())));
|
||||
|
||||
// Filter the delta files by the commit time of the latest base file and collect as a list
|
||||
Optional<HoodieInstant> lastTimestamp = metaClient.getActiveTimeline().lastInstant();
|
||||
if (!lastTimestamp.isPresent()) {
|
||||
return Maps.newHashMap();
|
||||
}
|
||||
|
||||
return getLatestVersionInPartition(partitionPath, lastTimestamp.get().getTimestamp()).map(
|
||||
return lastTimestamp.map(hoodieInstant -> getLatestVersionInPartition(partitionPath,
|
||||
hoodieInstant.getTimestamp()).map(
|
||||
hoodieDataFile -> Pair.of(hoodieDataFile, allLogFiles.stream().filter(
|
||||
s -> s.getFileId().equals(hoodieDataFile.getFileId()) && s.getBaseCommitTime()
|
||||
.equals(hoodieDataFile.getCommitTime())).collect(Collectors.toList()))).collect(
|
||||
Collectors.toMap(
|
||||
(Function<Pair<HoodieDataFile, List<HoodieLogFile>>, HoodieDataFile>) Pair::getKey,
|
||||
(Function<Pair<HoodieDataFile, List<HoodieLogFile>>, List<HoodieLogFile>>) Pair::getRight));
|
||||
Collectors.toMap(Pair::getKey, Pair::getRight))).orElseGet(Maps::newHashMap);
|
||||
}
|
||||
|
||||
|
||||
@@ -248,9 +247,9 @@ public class HoodieTableFileSystemView implements TableFileSystemView, Serializa
|
||||
}
|
||||
|
||||
private Collector<HoodieDataFile, ?, List<HoodieDataFile>> toSortedFileStatus() {
|
||||
return Collectors.collectingAndThen(Collectors.toList(),
|
||||
return Collectors.collectingAndThen(toList(),
|
||||
l -> l.stream().sorted(HoodieDataFile.getCommitTimeComparator())
|
||||
.collect(Collectors.toList()));
|
||||
.collect(toList()));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -47,6 +47,7 @@ import org.apache.avro.io.DatumWriter;
|
||||
import org.apache.avro.io.Decoder;
|
||||
import org.apache.avro.io.DecoderFactory;
|
||||
import org.apache.avro.io.EncoderFactory;
|
||||
import org.apache.avro.mapred.FsInput;
|
||||
import org.apache.avro.specific.SpecificDatumReader;
|
||||
import org.apache.avro.specific.SpecificDatumWriter;
|
||||
import org.apache.avro.specific.SpecificRecordBase;
|
||||
@@ -67,33 +68,39 @@ public class AvroUtils {
|
||||
|
||||
public static List<HoodieRecord<HoodieAvroPayload>> loadFromFiles(FileSystem fs,
|
||||
List<String> deltaFilePaths, Schema expectedSchema) {
|
||||
|
||||
List<HoodieRecord<HoodieAvroPayload>> loadedRecords = Lists.newArrayList();
|
||||
deltaFilePaths.forEach(s -> {
|
||||
Path path = new Path(s);
|
||||
try {
|
||||
SeekableInput input =
|
||||
new AvroFSInput(FileContext.getFileContext(fs.getConf()), path);
|
||||
GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>();
|
||||
// Set the expected schema to be the current schema to account for schema evolution
|
||||
reader.setExpected(expectedSchema);
|
||||
|
||||
FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
|
||||
for (GenericRecord deltaRecord : fileReader) {
|
||||
String key = deltaRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||
String partitionPath =
|
||||
deltaRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
|
||||
loadedRecords.add(new HoodieRecord<>(new HoodieKey(key, partitionPath),
|
||||
new HoodieAvroPayload(Optional.of(deltaRecord))));
|
||||
}
|
||||
fileReader.close(); // also closes underlying FsInput
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Could not read avro records from path " + s, e);
|
||||
}
|
||||
List<HoodieRecord<HoodieAvroPayload>> records = loadFromFile(fs, s, expectedSchema);
|
||||
loadedRecords.addAll(records);
|
||||
});
|
||||
return loadedRecords;
|
||||
}
|
||||
|
||||
public static List<HoodieRecord<HoodieAvroPayload>> loadFromFile(FileSystem fs,
|
||||
String deltaFilePath, Schema expectedSchema) {
|
||||
List<HoodieRecord<HoodieAvroPayload>> loadedRecords = Lists.newArrayList();
|
||||
Path path = new Path(deltaFilePath);
|
||||
try {
|
||||
SeekableInput input = new FsInput(path, fs.getConf());
|
||||
GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>();
|
||||
// Set the expected schema to be the current schema to account for schema evolution
|
||||
reader.setExpected(expectedSchema);
|
||||
|
||||
FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
|
||||
for (GenericRecord deltaRecord : fileReader) {
|
||||
String key = deltaRecord.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||
String partitionPath =
|
||||
deltaRecord.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
|
||||
loadedRecords.add(new HoodieRecord<>(new HoodieKey(key, partitionPath),
|
||||
new HoodieAvroPayload(Optional.of(deltaRecord))));
|
||||
}
|
||||
fileReader.close(); // also closes underlying FsInput
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("Could not read avro records from path " + deltaFilePath,
|
||||
e);
|
||||
}
|
||||
return loadedRecords;
|
||||
}
|
||||
|
||||
public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime,
|
||||
Optional<Long> durationInMs, List<HoodieCleanStat> cleanStats) {
|
||||
|
||||
@@ -210,6 +210,11 @@ public class FSUtils {
|
||||
return String.format("%s_%s%s.%d", fileId, baseCommitTime, logFileExtension, version);
|
||||
}
|
||||
|
||||
public static String maskWithoutLogVersion(String commitTime, String fileId, String logFileExtension) {
|
||||
return String.format("%s_%s%s*", fileId, commitTime, logFileExtension);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the latest log file written from the list of log files passed in
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user