[HUDI-1295] Metadata Index - Bloom filter and Column stats index to speed up index lookups (#4352)

* Today, base files keep their bloom filter in the file footer, so index lookups have to open each base file just to run the bloom check. Even with interval-tree-based file pruning, we still end up reading a significant number of base file footers to fetch bloom filters during key lookups. The lookup can be made more performant by storing all bloom filters in a new metadata table partition and doing pointed lookups by key (see the sketch after this list).
* Added indexing support for clean, restore, and rollback operations. Each of these operations is now additionally converted into index records for the bloom filter and column stats partitions.
* Made the hoodie key consistent for both the column stats and bloom indexes by including the fileId instead of the fileName, in both the read and write paths. Optimized record lookups in the metadata table, and avoided the multi-column sorting needed by HoodieBloomMetaIndexBatchCheckFunction.
* HoodieBloomMetaIndexBatchCheckFunction cleanup to remove unused classes. Added a base file existence check before reading the file footer for bloom filters or column stats.
* Updated the bloom filter and column stats indexes to include the full file name in the key instead of just the file id. Minor test fixes.
* Fixed the Flink commit method to handle metadata table update records across all partitions. TestBloomIndex fixes.
* Simplified SparkHoodieBloomIndexHelper across the various config modes. Changed the signatures of getBloomFilters() and getColumnStats(): callers now pass in only the partitions and file names of interest, and the index key is constructed internally from those parameters. Refactored KeyLookupHandle and KeyLookupResults. Metadata schema changes: removed the reserved field.
* Removed HoodieColumnStatsMetadata in favor of HoodieColumnRangeMetadata, and fixed the users of the removed class.
* Extended the meta index tests to cover delete, compaction, clean, and restore table operations. Also fixed getBloomFilters() and getColumnStats() to account for deleted entries.
* Addressed review comments: javadoc for new classes, key sorting for lookups, index method renames.
* Consolidated bloom filter key checking into a single HoodieMetadataBloomIndexCheckFunction instead of separate batch and lazy modes, and removed all the configs around them. Made the metadata table partition file group count configurable. Fixed HoodieKeyLookupHandle to auto-close the file reader when checking bloom filters and range keys. Config property renames; test fixes.
* Enabled column stats indexing for all columns by default. Handled column stat generation errors; test updates.
* The metadata table partition file group count is now taken from the file slices when the table is bootstrapped. Refactored record prep for the commit into the base class. HoodieFileReader interface changes for filtering keys. Multi-column and multi-data-type support for the column stats index.
* Rebased onto latest master and merged fixes for the build and test failures.
* Extended the metadata column stats payload schema to include more statistics about column ranges, to help query integration.
* Addressed review comments.
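To make the "pointed lookup" idea above concrete, here is a minimal, self-contained sketch. It is illustrative only: a plain in-memory map stands in for the metadata table's bloom filter partition, a byte array stands in for a serialized bloom filter, and the key layout is an assumed concatenation (per the commit message, the real key is derived internally from the partition and file name).

import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

// Illustrative sketch only: a HashMap stands in for the metadata table's new
// bloom filter partition; byte[] stands in for a serialized bloom filter.
final class PointedBloomLookupSketch {

  private final Map<String, byte[]> bloomFilterPartition = new HashMap<>();

  // Assumed key layout: the real index key is built internally from the
  // partition path and file name, per the commit message.
  private static String indexKey(String partitionPath, String fileName) {
    return partitionPath + "/" + fileName;
  }

  void putBloomFilter(String partitionPath, String fileName, byte[] serializedFilter) {
    bloomFilterPartition.put(indexKey(partitionPath, fileName), serializedFilter);
  }

  // Pointed lookup: one key-value read per candidate file, instead of opening
  // each base file and deserializing the bloom filter out of its footer.
  Optional<byte[]> getBloomFilter(String partitionPath, String fileName) {
    return Optional.ofNullable(bloomFilterPartition.get(indexKey(partitionPath, fileName)));
  }

  public static void main(String[] args) {
    PointedBloomLookupSketch index = new PointedBloomLookupSketch();
    index.putBloomFilter("2016/01/21", "f1-0_0-1-0_001.parquet", new byte[64]);
    System.out.println(index.getBloomFilter("2016/01/21", "f1-0_0-1-0_001.parquet").isPresent());
  }
}

The win is purely in the access pattern: a single keyed read against the metadata partition per candidate file, rather than a footer read against every pruned-in base file.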
commit 5927bdd1c0 (parent d681824982), committed via GitHub
FlinkHoodieBackedTableMetadataWriter.java:

@@ -22,11 +22,10 @@ import org.apache.avro.specific.SpecificRecordBase;
 import org.apache.hudi.client.HoodieFlinkWriteClient;
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.common.data.HoodieData;
+import org.apache.hudi.common.data.HoodieList;
 import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.metrics.Registry;
-import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieRecord;
-import org.apache.hudi.common.model.HoodieRecordLocation;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.util.Option;
@@ -41,7 +40,7 @@ import org.apache.log4j.Logger;
 import java.io.IOException;
 import java.util.Collections;
 import java.util.List;
-import java.util.stream.Collectors;
+import java.util.Map;

 public class FlinkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetadataWriter {

@@ -101,10 +100,12 @@ public class FlinkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad
   }

   @Override
-  protected void commit(HoodieData<HoodieRecord> hoodieDataRecords, String partitionName, String instantTime, boolean canTriggerTableService) {
+  protected void commit(String instantTime, Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionRecordsMap,
+                        boolean canTriggerTableService) {
     ValidationUtils.checkState(enabled, "Metadata table cannot be committed to as it is not enabled");
-    List<HoodieRecord> records = (List<HoodieRecord>) hoodieDataRecords.get();
-    List<HoodieRecord> recordList = prepRecords(records, partitionName, 1);
+    ValidationUtils.checkState(metadataMetaClient != null, "Metadata table is not fully initialized yet.");
+    HoodieData<HoodieRecord> preppedRecords = prepRecords(partitionRecordsMap);
+    List<HoodieRecord> preppedRecordList = HoodieList.getList(preppedRecords);

     try (HoodieFlinkWriteClient writeClient = new HoodieFlinkWriteClient(engineContext, metadataWriteConfig)) {
       if (!metadataMetaClient.getActiveTimeline().filterCompletedInstants().containsInstant(instantTime)) {
@@ -119,13 +120,14 @@ public class FlinkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad
         // once rollback is complete, compaction will be retried again, which will eventually hit this code block where the respective commit is
         // already part of completed commit. So, we have to manually remove the completed instant and proceed.
         // and it is for the same reason we enabled withAllowMultiWriteOnSameInstant for metadata table.
-        HoodieInstant alreadyCompletedInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant().get();
+        HoodieInstant alreadyCompletedInstant =
+            metadataMetaClient.getActiveTimeline().filterCompletedInstants().filter(entry -> entry.getTimestamp().equals(instantTime)).lastInstant().get();
         HoodieActiveTimeline.deleteInstantFile(metadataMetaClient.getFs(), metadataMetaClient.getMetaPath(), alreadyCompletedInstant);
         metadataMetaClient.reloadActiveTimeline();
       }

-      List<WriteStatus> statuses = records.size() > 0
-          ? writeClient.upsertPreppedRecords(recordList, instantTime)
+      List<WriteStatus> statuses = preppedRecordList.size() > 0
+          ? writeClient.upsertPreppedRecords(preppedRecordList, instantTime)
           : Collections.emptyList();
       statuses.forEach(writeStatus -> {
         if (writeStatus.hasErrors()) {
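For orientation, the reworked commit above receives records already grouped by metadata partition. A hedged sketch of the caller-side map assembly follows; the MetadataPartitionType constants other than FILES were introduced by this change, so their names here follow the commit message and may not match the code exactly.

import java.util.HashMap;
import java.util.Map;

import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.metadata.MetadataPartitionType;

// Sketch of assembling the per-partition record map that the reworked
// commit(...) consumes. BLOOM_FILTERS and COLUMN_STATS are assumed constant
// names based on the commit message.
final class PartitionRecordsMapSketch {
  static Map<MetadataPartitionType, HoodieData<HoodieRecord>> group(
      HoodieData<HoodieRecord> filesRecords,
      HoodieData<HoodieRecord> bloomFilterRecords,
      HoodieData<HoodieRecord> columnStatsRecords) {
    Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionRecordsMap = new HashMap<>();
    partitionRecordsMap.put(MetadataPartitionType.FILES, filesRecords);
    partitionRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, bloomFilterRecords);
    partitionRecordsMap.put(MetadataPartitionType.COLUMN_STATS, columnStatsRecords);
    return partitionRecordsMap;
  }
}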
@@ -147,21 +149,4 @@ public class FlinkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad
     // Update total size of the metadata and count of base/log files
     metrics.ifPresent(m -> m.updateSizeMetrics(metadataMetaClient, metadata));
   }
-
-  /**
-   * Tag each record with the location in the given partition.
-   *
-   * The record is tagged with respective file slice's location based on its record key.
-   */
-  private List<HoodieRecord> prepRecords(List<HoodieRecord> records, String partitionName, int numFileGroups) {
-    List<FileSlice> fileSlices = HoodieTableMetadataUtil.getPartitionLatestFileSlices(metadataMetaClient, partitionName);
-    ValidationUtils.checkArgument(fileSlices.size() == numFileGroups, String.format("Invalid number of file groups: found=%d, required=%d", fileSlices.size(), numFileGroups));
-
-    return records.stream().map(r -> {
-      FileSlice slice = fileSlices.get(HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex(r.getRecordKey(), numFileGroups));
-      final String instantTime = slice.isEmpty() ? "I" : "U";
-      r.setCurrentLocation(new HoodieRecordLocation(instantTime, slice.getFileId()));
-      return r;
-    }).collect(Collectors.toList());
-  }
 }
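The prepRecords method removed above (it moved to the base class, per the commit message) pins each record to a metadata file group by hashing its record key. A minimal sketch of that mapping; the hash body is assumed, not copied from HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex:

// Illustrative only: the hash below is a guess at the shape of
// HoodieTableMetadataUtil.mapRecordKeyToFileGroupIndex, not its actual code.
final class FileGroupMappingSketch {

  // Fold a stable, non-negative hash of the record key into the number of
  // file groups, so a given key always lands in the same metadata file group.
  static int mapRecordKeyToFileGroupIndex(String recordKey, int numFileGroups) {
    int h = 0;
    for (int i = 0; i < recordKey.length(); i++) {
      h = 31 * h + recordKey.charAt(i);
    }
    // Mask instead of Math.abs to stay non-negative even for Integer.MIN_VALUE.
    return (h & Integer.MAX_VALUE) % numFileGroups;
  }

  public static void main(String[] args) {
    // The same key deterministically picks the same file group.
    System.out.println(mapRecordKeyToFileGroupIndex("2016/01/21/some-file.parquet", 4));
    System.out.println(mapRecordKeyToFileGroupIndex("2016/01/21/some-file.parquet", 4));
  }
}

In the removed code, the tagged location's instant marker was "I" when the target file slice was still empty (an insert) and "U" otherwise (an update).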
TestFlinkHoodieBloomIndex.java:

@@ -31,7 +31,7 @@ import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieIndexConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.io.HoodieKeyLookupHandle;
+import org.apache.hudi.index.HoodieIndexUtils;
 import org.apache.hudi.table.HoodieFlinkTable;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.testutils.HoodieFlinkClientTestHarness;
@@ -130,7 +130,7 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
         new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);

     List<String> partitions = asList("2016/01/21", "2016/04/01", "2015/03/12");
-    List<Pair<String, BloomIndexFileInfo>> filesList = index.loadInvolvedFiles(partitions, context, hoodieTable);
+    List<Pair<String, BloomIndexFileInfo>> filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
     // Still 0, as no valid commit
     assertEquals(0, filesList.size());

@@ -140,7 +140,7 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {
         .withInserts("2015/03/12", "4", record2, record3, record4);
     metaClient.reloadActiveTimeline();

-    filesList = index.loadInvolvedFiles(partitions, context, hoodieTable);
+    filesList = index.loadColumnRangesFromFiles(partitions, context, hoodieTable);
     assertEquals(4, filesList.size());

     if (rangePruning) {
@@ -242,9 +242,8 @@ public class TestFlinkHoodieBloomIndex extends HoodieFlinkClientTestHarness {

     HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
     HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient);
-    HoodieKeyLookupHandle keyHandle = new HoodieKeyLookupHandle<>(config, table, Pair.of(partition, fileId));
-    List<String> results = keyHandle.checkCandidatesAgainstFile(hadoopConf, uuids,
-        new Path(java.nio.file.Paths.get(basePath, partition, filename).toString()));
+    List<String> results = HoodieIndexUtils.filterKeysFromFile(
+        new Path(java.nio.file.Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf);
     assertEquals(results.size(), 2);
     assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")
         || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
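The test change above routes candidate-key checking through HoodieIndexUtils.filterKeysFromFile, which conceptually runs a cheap probabilistic check and then verifies the survivors against the keys actually present in the file. A self-contained sketch of that two-phase filter, with a HashSet standing in for the real bloom filter:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Illustrative two-phase key filter: a probabilistic pre-filter followed by an
// exact check. The Set-based pre-filter is a stand-in for a real bloom filter;
// the real helper reads both the filter and the keys from the file itself.
final class KeyFilterSketch {

  static List<String> filterCandidates(List<String> candidates,
                                       Set<String> bloomStandIn,
                                       Set<String> actualKeysInFile) {
    List<String> found = new ArrayList<>();
    for (String key : candidates) {
      // Phase 1: cheap membership test; a real bloom filter may report false positives.
      if (bloomStandIn.contains(key)) {
        // Phase 2: exact verification against the keys stored in the file.
        if (actualKeysInFile.contains(key)) {
          found.add(key);
        }
      }
    }
    return found;
  }

  public static void main(String[] args) {
    Set<String> bloom = new HashSet<>(Arrays.asList("k1", "k2"));
    Set<String> actual = new HashSet<>(Arrays.asList("k1"));
    // "k2" passes the pre-filter (simulating a false positive) but is rejected
    // by the exact check, so only "k1" survives.
    System.out.println(filterCandidates(Arrays.asList("k1", "k2", "k3"), bloom, actual));
  }
}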