1
0

[HUDI-3365] Make sure Metadata Table records are updated appropriately on HDFS (#4739)

- This change makes sure MT records are updated appropriately on HDFS: previously, after Log File append operations, MT records were updated with just the size of the deltas being appended to the original files. This has been found to be the cause of issues in the case of Rollbacks, which were instead updating the MT with records bearing the full file-size.

- To make sure that we hedge against similar issues going forward, this PR eliminates this discrepancy and streamlines the flow so that the MT always ingests records bearing full file-sizes.
This commit is contained in:
Alexey Kudinkin
2022-03-07 12:38:27 -08:00
committed by GitHub
parent f0bcee3c01
commit a66fd40692
18 changed files with 415 additions and 255 deletions

View File

@@ -77,8 +77,8 @@ public class CollectionUtils {
* NOTE: That values associated with overlapping keys from the second map, will override
* values from the first one
*/
public static <K, V> Map<K, V> combine(Map<K, V> one, Map<K, V> another) {
Map<K, V> combined = new HashMap<>(one.size() + another.size());
public static <K, V> HashMap<K, V> combine(Map<K, V> one, Map<K, V> another) {
HashMap<K, V> combined = new HashMap<>(one.size() + another.size());
combined.putAll(one);
combined.putAll(another);
return combined;

View File

@@ -52,6 +52,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -239,10 +240,22 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
Option<Map<String, Long>> filesAdded,
Option<List<String>> filesDeleted) {
Map<String, HoodieMetadataFileInfo> fileInfo = new HashMap<>();
filesAdded.ifPresent(
m -> m.forEach((filename, size) -> fileInfo.put(filename, new HoodieMetadataFileInfo(size, false))));
filesDeleted.ifPresent(
m -> m.forEach(filename -> fileInfo.put(filename, new HoodieMetadataFileInfo(0L, true))));
filesAdded.ifPresent(filesMap ->
fileInfo.putAll(
filesMap.entrySet().stream().collect(
Collectors.toMap(Map.Entry::getKey, (entry) -> {
long fileSize = entry.getValue();
// Assert that the file-size of the file being added is positive, since Hudi
// should not be creating empty files
checkState(fileSize > 0);
return new HoodieMetadataFileInfo(fileSize, false);
})))
);
filesDeleted.ifPresent(filesList ->
fileInfo.putAll(
filesList.stream().collect(
Collectors.toMap(Function.identity(), (ignored) -> new HoodieMetadataFileInfo(0L, true))))
);
HoodieKey key = new HoodieKey(partition, MetadataPartitionType.FILES.getPartitionPath());
HoodieMetadataPayload payload = new HoodieMetadataPayload(key.getRecordKey(), METADATA_TYPE_FILE_LIST, fileInfo);
@@ -288,7 +301,7 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
switch (type) {
case METADATA_TYPE_PARTITION_LIST:
case METADATA_TYPE_FILE_LIST:
Map<String, HoodieMetadataFileInfo> combinedFileInfo = combineFilesystemMetadata(previousRecord);
Map<String, HoodieMetadataFileInfo> combinedFileInfo = combineFileSystemMetadata(previousRecord);
return new HoodieMetadataPayload(key, type, combinedFileInfo);
case METADATA_TYPE_BLOOM_FILTER:
HoodieMetadataBloomFilter combineBloomFilterMetadata = combineBloomFilterMetadata(previousRecord);
@@ -392,28 +405,53 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
return filesystemMetadata.entrySet().stream().filter(e -> e.getValue().getIsDeleted() == isDeleted);
}
private Map<String, HoodieMetadataFileInfo> combineFilesystemMetadata(HoodieMetadataPayload previousRecord) {
private Map<String, HoodieMetadataFileInfo> combineFileSystemMetadata(HoodieMetadataPayload previousRecord) {
Map<String, HoodieMetadataFileInfo> combinedFileInfo = new HashMap<>();
// First, add all files listed in the previous record
if (previousRecord.filesystemMetadata != null) {
combinedFileInfo.putAll(previousRecord.filesystemMetadata);
}
// Second, merge in the files listed in the new record
if (filesystemMetadata != null) {
filesystemMetadata.forEach((filename, fileInfo) -> {
// If the filename wasn't present then we carry it forward
if (!combinedFileInfo.containsKey(filename)) {
combinedFileInfo.put(filename, fileInfo);
} else {
if (fileInfo.getIsDeleted()) {
// file deletion
combinedFileInfo.remove(filename);
} else {
// file appends.
combinedFileInfo.merge(filename, fileInfo, (oldFileInfo, newFileInfo) -> {
return new HoodieMetadataFileInfo(oldFileInfo.getSize() + newFileInfo.getSize(), false);
});
}
}
validatePayload(type, filesystemMetadata);
filesystemMetadata.forEach((key, fileInfo) -> {
combinedFileInfo.merge(key, fileInfo,
// Combine previous record w/ the new one, new records taking precedence over
// the old one
//
// NOTE: That if previous listing contains the file that is being deleted by the tombstone
// record (`IsDeleted` = true) in the new one, we simply delete the file from the resulting
// listing as well as drop the tombstone itself.
// However, if file is not present in the previous record we have to persist tombstone
// record in the listing to make sure we carry forward information that this file
// was deleted. This special case could occur since the merging flow is 2-stage:
// - First we merge records from all of the delta log-files
// - Then we merge records from base-files with the delta ones (coming as a result
// of the previous step)
(oldFileInfo, newFileInfo) ->
// NOTE: We can't assume that MT update records will be ordered the same way as actual
// FS operations (since they are not atomic), therefore MT record merging should be a
// _commutative_ & _associative_ operation (ie one that would work even in case records
// will get re-ordered), which is
// - Possible for file-sizes (since file-sizes only ever grow, we can simply
// take max of the old and new records)
// - Not possible for is-deleted flags*
//
// *However, we're assuming that the case of concurrent write and deletion of the same
// file is _impossible_ -- it would only be possible with concurrent upsert and
// rollback operation (affecting the same log-file), which is implausible, b/c either
// of the following have to be true:
// - We're appending to a failed log-file (then the other writer is trying to
// roll it back concurrently, before its own write)
// - Rollback (of completed instant) is running concurrently with append (meaning
// that restore is running concurrently with a write, which is also not supported
// currently)
newFileInfo.getIsDeleted()
? null
: new HoodieMetadataFileInfo(Math.max(newFileInfo.getSize(), oldFileInfo.getSize()), false));
});
}
@@ -509,6 +547,14 @@ public class HoodieMetadataPayload implements HoodieRecordPayload<HoodieMetadata
return sb.toString();
}
/**
 * Validates that a file-listing payload is internally consistent before it is persisted
 * into the Metadata Table.
 *
 * <p>For {@code METADATA_TYPE_FILE_LIST} payloads, every entry must either be a deletion
 * tombstone ({@code getIsDeleted() == true}) or carry a strictly positive file-size, since
 * Hudi should never be creating empty files. Other payload types are not validated here.
 *
 * @param type               payload type discriminator (one of the {@code METADATA_TYPE_*} constants)
 * @param filesystemMetadata mapping of file-name to its corresponding file-info record
 * @throws IllegalStateException if a non-deleted entry has a non-positive file-size
 */
private static void validatePayload(int type, Map<String, HoodieMetadataFileInfo> filesystemMetadata) {
if (type == METADATA_TYPE_FILE_LIST) {
// NOTE: Only the file-info values are inspected; file-names are irrelevant to the check
filesystemMetadata.values().forEach(fileInfo ->
checkState(fileInfo.getIsDeleted() || fileInfo.getSize() > 0, "Existing files should have size > 0"));
}
}
private static <T> T getNestedFieldValue(GenericRecord record, String fieldName) {
// NOTE: This routine is more lightweight than {@code HoodieAvroUtils.getNestedFieldVal}
if (record.getSchema().getField(fieldName) == null) {

View File

@@ -40,6 +40,7 @@ import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.CollectionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ParquetUtils;
import org.apache.hudi.common.util.ValidationUtils;
@@ -147,40 +148,58 @@ public class HoodieTableMetadataUtil {
*/
public static List<HoodieRecord> convertMetadataToFilesPartitionRecords(HoodieCommitMetadata commitMetadata,
String instantTime) {
List<HoodieRecord> records = new LinkedList<>();
List<String> allPartitions = new LinkedList<>();
commitMetadata.getPartitionToWriteStats().forEach((partitionStatName, writeStats) -> {
final String partition = partitionStatName.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionStatName;
allPartitions.add(partition);
List<HoodieRecord> records = new ArrayList<>(commitMetadata.getPartitionToWriteStats().size());
Map<String, Long> newFiles = new HashMap<>(writeStats.size());
writeStats.forEach(hoodieWriteStat -> {
String pathWithPartition = hoodieWriteStat.getPath();
if (pathWithPartition == null) {
// Empty partition
LOG.warn("Unable to find path in write stat to update metadata table " + hoodieWriteStat);
return;
}
// Add record bearing partitions list
ArrayList<String> partitionsList = new ArrayList<>(commitMetadata.getPartitionToWriteStats().keySet());
int offset = partition.equals(NON_PARTITIONED_NAME) ? (pathWithPartition.startsWith("/") ? 1 : 0) : partition.length() + 1;
String filename = pathWithPartition.substring(offset);
long totalWriteBytes = newFiles.containsKey(filename)
? newFiles.get(filename) + hoodieWriteStat.getTotalWriteBytes()
: hoodieWriteStat.getTotalWriteBytes();
newFiles.put(filename, totalWriteBytes);
});
// New files added to a partition
HoodieRecord record = HoodieMetadataPayload.createPartitionFilesRecord(
partition, Option.of(newFiles), Option.empty());
records.add(record);
});
records.add(HoodieMetadataPayload.createPartitionListRecord(partitionsList));
// New partitions created
HoodieRecord record = HoodieMetadataPayload.createPartitionListRecord(new ArrayList<>(allPartitions));
records.add(record);
// Update files listing records for each individual partition
List<HoodieRecord<HoodieMetadataPayload>> updatedPartitionFilesRecords =
commitMetadata.getPartitionToWriteStats().entrySet()
.stream()
.map(entry -> {
String partitionStatName = entry.getKey();
List<HoodieWriteStat> writeStats = entry.getValue();
String partition = partitionStatName.equals(EMPTY_PARTITION_NAME) ? NON_PARTITIONED_NAME : partitionStatName;
HashMap<String, Long> updatedFilesToSizesMapping =
writeStats.stream().reduce(new HashMap<>(writeStats.size()),
(map, stat) -> {
String pathWithPartition = stat.getPath();
if (pathWithPartition == null) {
// Empty partition
LOG.warn("Unable to find path in write stat to update metadata table " + stat);
return map;
}
int offset = partition.equals(NON_PARTITIONED_NAME)
? (pathWithPartition.startsWith("/") ? 1 : 0)
: partition.length() + 1;
String filename = pathWithPartition.substring(offset);
// Since write-stats are coming in no particular order, if the same
// file have previously been appended to w/in the txn, we simply pick max
// of the sizes as reported after every write, since file-sizes are
// monotonically increasing (ie file-size never goes down, unless deleted)
map.merge(filename, stat.getFileSizeInBytes(), Math::max);
return map;
},
CollectionUtils::combine);
return HoodieMetadataPayload.createPartitionFilesRecord(partition, Option.of(updatedFilesToSizesMapping),
Option.empty());
})
.collect(Collectors.toList());
records.addAll(updatedPartitionFilesRecords);
LOG.info("Updating at " + instantTime + " from Commit/" + commitMetadata.getOperationType()
+ ". #partitions_updated=" + records.size());
return records;
}