1
0

[HUDI-3356][HUDI-3203] HoodieData for metadata index records; BloomFilter construction from index based on the type param (#4848)

Rework of #4761 
This diff introduces the following changes:

- Write stats are converted to metadata index records during the commit, using the HoodieData type so that record generation scales with needs.
- Metadata index init support for bloom filter and column stats partitions.
- When building the BloomFilter from the index records, use the type param stored in the payload instead of a hardcoded type.
- Delta writes can change column ranges, and the column stats index needs to be properly updated with the new ranges to stay consistent with the table dataset. This fix adds column stats index update support for delta writes.

Co-authored-by: Manoj Govindassamy <manoj.govindassamy@gmail.com>
This commit is contained in:
Sagar Sumit
2022-03-08 21:09:04 +05:30
committed by GitHub
parent ed26c5265c
commit 575bc63468
24 changed files with 1051 additions and 533 deletions

View File

@@ -20,8 +20,7 @@ package org.apache.hudi.index.bloom;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.utils.LazyIterableIterator;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieKey;
@@ -37,8 +36,6 @@ import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@@ -113,7 +110,7 @@ public class HoodieMetadataBloomIndexCheckFunction implements
}
List<Pair<String, String>> partitionNameFileNameList = new ArrayList<>(fileToKeysMap.keySet());
Map<Pair<String, String>, ByteBuffer> fileToBloomFilterMap =
Map<Pair<String, String>, BloomFilter> fileToBloomFilterMap =
hoodieTable.getMetadataTable().getBloomFilters(partitionNameFileNameList);
final AtomicInteger totalKeys = new AtomicInteger(0);
@@ -126,11 +123,7 @@ public class HoodieMetadataBloomIndexCheckFunction implements
if (!fileToBloomFilterMap.containsKey(partitionPathFileNamePair)) {
throw new HoodieIndexException("Failed to get the bloom filter for " + partitionPathFileNamePair);
}
final ByteBuffer fileBloomFilterByteBuffer = fileToBloomFilterMap.get(partitionPathFileNamePair);
HoodieDynamicBoundedBloomFilter fileBloomFilter =
new HoodieDynamicBoundedBloomFilter(StandardCharsets.UTF_8.decode(fileBloomFilterByteBuffer).toString(),
BloomFilterTypeCode.DYNAMIC_V0);
final BloomFilter fileBloomFilter = fileToBloomFilterMap.get(partitionPathFileNamePair);
List<String> candidateRecordKeys = new ArrayList<>();
hoodieKeyList.forEach(hoodieKey -> {

View File

@@ -113,7 +113,7 @@ public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad
});
if (enabled) {
bootstrapIfNeeded(engineContext, dataMetaClient, actionMetadata, inflightInstantTimestamp);
initializeIfNeeded(dataMetaClient, actionMetadata, inflightInstantTimestamp);
}
} catch (IOException e) {
LOG.error("Failed to initialize metadata table. Disabling the writer.", e);