1
0

[HUDI-3356][HUDI-3203] HoodieData for metadata index records; BloomFilter construction from index based on the type param (#4848)

Rework of #4761 
This diff introduces the following changes:

- Write stats are converted to metadata index records during the commit, using the HoodieData type so that record generation scales with needs.
- Metadata index init support for bloom filter and column stats partitions.
- When building the BloomFilter from the index records, use the type param stored in the payload instead of a hardcoded type.
- Delta writes can change column ranges, and the column stats index needs to be properly updated with the new ranges to stay consistent with the table dataset. This fix adds column stats index update support for delta writes.

Co-authored-by: Manoj Govindassamy <manoj.govindassamy@gmail.com>
This commit is contained in:
Sagar Sumit
2022-03-08 21:09:04 +05:30
committed by GitHub
parent ed26c5265c
commit 575bc63468
24 changed files with 1051 additions and 533 deletions

View File

@@ -20,8 +20,7 @@ package org.apache.hudi.index.bloom;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.utils.LazyIterableIterator;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieKey;
@@ -37,8 +36,6 @@ import org.apache.log4j.Logger;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@@ -113,7 +110,7 @@ public class HoodieMetadataBloomIndexCheckFunction implements
}
List<Pair<String, String>> partitionNameFileNameList = new ArrayList<>(fileToKeysMap.keySet());
Map<Pair<String, String>, ByteBuffer> fileToBloomFilterMap =
Map<Pair<String, String>, BloomFilter> fileToBloomFilterMap =
hoodieTable.getMetadataTable().getBloomFilters(partitionNameFileNameList);
final AtomicInteger totalKeys = new AtomicInteger(0);
@@ -126,11 +123,7 @@ public class HoodieMetadataBloomIndexCheckFunction implements
if (!fileToBloomFilterMap.containsKey(partitionPathFileNamePair)) {
throw new HoodieIndexException("Failed to get the bloom filter for " + partitionPathFileNamePair);
}
final ByteBuffer fileBloomFilterByteBuffer = fileToBloomFilterMap.get(partitionPathFileNamePair);
HoodieDynamicBoundedBloomFilter fileBloomFilter =
new HoodieDynamicBoundedBloomFilter(StandardCharsets.UTF_8.decode(fileBloomFilterByteBuffer).toString(),
BloomFilterTypeCode.DYNAMIC_V0);
final BloomFilter fileBloomFilter = fileToBloomFilterMap.get(partitionPathFileNamePair);
List<String> candidateRecordKeys = new ArrayList<>();
hoodieKeyList.forEach(hoodieKey -> {

View File

@@ -113,7 +113,7 @@ public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad
});
if (enabled) {
bootstrapIfNeeded(engineContext, dataMetaClient, actionMetadata, inflightInstantTimestamp);
initializeIfNeeded(dataMetaClient, actionMetadata, inflightInstantTimestamp);
}
} catch (IOException e) {
LOG.error("Failed to initialize metadata table. Disabling the writer.", e);