1
0

[HUDI-3807] Add a new config to control the use of metadata index in HoodieBloomIndex (#5268)

This commit is contained in:
Y Ethan Guo
2022-04-09 12:30:11 -07:00
committed by GitHub
parent 5e65aefc61
commit 3e97c88c4f
5 changed files with 34 additions and 4 deletions

View File

@@ -30,17 +30,22 @@ import org.apache.hudi.data.HoodieJavaPairRDD;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.io.HoodieKeyLookupResult;
import org.apache.hudi.table.HoodieTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import scala.Tuple2;
import static org.apache.hudi.metadata.HoodieTableMetadataUtil.getCompletedMetadataPartitions;
import static org.apache.hudi.metadata.MetadataPartitionType.BLOOM_FILTERS;
/**
* Helper for {@link HoodieBloomIndex} containing Spark-specific logic.
*/
@@ -75,7 +80,9 @@ public class SparkHoodieBloomIndexHelper extends BaseHoodieBloomIndexHelper {
+ config.getBloomIndexParallelism() + "}");
JavaRDD<List<HoodieKeyLookupResult>> keyLookupResultRDD;
- if (config.isMetadataBloomFilterIndexEnabled()) {
+ if (config.getBloomIndexUseMetadata()
+ && getCompletedMetadataPartitions(hoodieTable.getMetaClient().getTableConfig())
+ .contains(BLOOM_FILTERS.getPartitionPath())) {
// Step 1: Sort by file id
JavaRDD<Tuple2<String, HoodieKey>> sortedFileIdAndKeyPairs =
fileComparisonsRDD.sortBy(Tuple2::_1, true, joinParallelism);