1
0

[HUDI-1951] Add bucket hash index, compatible with the hive bucket (#3173)

* [HUDI-2154] Add index key field to HoodieKey

* [HUDI-2157] Add the bucket index and its read/write implementation for the Spark engine.
* revert HUDI-2154 add index key field to HoodieKey
* fix all comments and introduce a new tricky way to get index key at runtime
* support double insert for bucket index
* revert spark read optimizer based on bucket index
* add the storage layout
* index tag, hash function and add ut
* fix ut
* address partial comments
* Code review feedback
* add layout config and docs
* fix ut
* rename hoodie.layout and rebase master

Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
Shawy Geng
2021-12-31 04:38:26 +08:00
committed by GitHub
parent 0f0088fe4b
commit a4e622ac61
46 changed files with 1335 additions and 47 deletions

View File

@@ -28,6 +28,7 @@ import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.index.bloom.HoodieBloomIndex;
import org.apache.hudi.index.bloom.HoodieGlobalBloomIndex;
import org.apache.hudi.index.bloom.SparkHoodieBloomIndexHelper;
import org.apache.hudi.index.bucket.HoodieBucketIndex;
import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex;
import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex;
import org.apache.hudi.index.simple.HoodieGlobalSimpleIndex;
@@ -55,6 +56,8 @@ public final class SparkHoodieIndexFactory {
return new SparkHoodieHBaseIndex<>(config);
case INMEMORY:
return new HoodieInMemoryHashIndex<>(config);
case BUCKET:
return new HoodieBucketIndex(config);
case BLOOM:
return new HoodieBloomIndex<>(config, SparkHoodieBloomIndexHelper.getInstance());
case GLOBAL_BLOOM:

View File

@@ -216,7 +216,10 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
}
protected Partitioner getPartitioner(WorkloadProfile profile) {
if (WriteOperationType.isChangingRecords(operationType)) {
Option<String> layoutPartitionerClass = table.getStorageLayout().layoutPartitionerClass();
if (layoutPartitionerClass.isPresent()) {
return getLayoutPartitioner(profile, layoutPartitionerClass.get());
} else if (WriteOperationType.isChangingRecords(operationType)) {
return getUpsertPartitioner(profile);
} else {
return getInsertPartitioner(profile);
@@ -305,7 +308,7 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
@SuppressWarnings("unchecked")
protected Iterator<List<WriteStatus>> handleUpsertPartition(String instantTime, Integer partition, Iterator recordItr,
Partitioner partitioner) {
UpsertPartitioner upsertPartitioner = (UpsertPartitioner) partitioner;
SparkHoodiePartitioner upsertPartitioner = (SparkHoodiePartitioner) partitioner;
BucketInfo binfo = upsertPartitioner.getBucketInfo(partition);
BucketType btype = binfo.bucketType;
try {
@@ -394,6 +397,12 @@ public abstract class BaseSparkCommitActionExecutor<T extends HoodieRecordPayloa
return getUpsertPartitioner(profile);
}
public Partitioner getLayoutPartitioner(WorkloadProfile profile, String layoutPartitionerClass) {
return (Partitioner) ReflectionUtils.loadClass(layoutPartitionerClass,
new Class[] { WorkloadProfile.class, HoodieEngineContext.class, HoodieTable.class, HoodieWriteConfig.class },
profile, context, table, config);
}
@Override
protected void runPrecommitValidators(HoodieWriteMetadata<JavaRDD<WriteStatus>> writeMetadata) {
SparkValidatorUtils.runValidators(config, writeMetadata, context, table, instantTime);

View File

@@ -0,0 +1,136 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.action.commit;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hudi.index.bucket.BucketIdentifier;
import scala.Tuple2;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.index.bucket.HoodieBucketIndex;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.WorkloadProfile;
import org.apache.hudi.table.WorkloadStat;
/**
 * Packs incoming records to be inserted into buckets (1 bucket = 1 RDD partition).
 *
 * <p>Used with the bucket hash index: every (partition path, bucket id) pair maps to a
 * fixed Spark partition, computed as the partition path's offset plus the bucket id.
 */
public class SparkBucketIndexPartitioner<T extends HoodieRecordPayload<T>> extends
    SparkHoodiePartitioner<T> {

  /** Number of hash buckets per Hudi partition path, fixed by the bucket index config. */
  private final int numBuckets;

  /** Record field whose hash decides which bucket a new record goes to. */
  private final String indexKeyField;

  /** Count of partition paths present in the workload profile. */
  private final int totalPartitionPaths;

  /** Partition paths in the same iteration order used to assign their offsets. */
  private final List<String> partitionPaths;

  /**
   * Helps get the RDD partition id, partition id is partition offset + bucket id.
   * The partition offset is a multiple of the bucket num.
   */
  private final Map<String, Integer> partitionPathOffset;

  /**
   * Partition path and file groups in it pair. Decide the file group an incoming update should go to.
   */
  private final Map<String, Set<String>> updatePartitionPathFileIds;

  public SparkBucketIndexPartitioner(WorkloadProfile profile,
                                     HoodieEngineContext context,
                                     HoodieTable table,
                                     HoodieWriteConfig config) {
    super(profile, table);
    if (!(table.getIndex() instanceof HoodieBucketIndex)) {
      // Misconfiguration guard: this partitioner only makes sense for the bucket index.
      throw new HoodieException(
          "Bucket index partitioner should only be used with HoodieBucketIndex, but the table index is "
              + table.getIndex().getClass().getSimpleName());
    }
    this.numBuckets = ((HoodieBucketIndex<T>) table.getIndex()).getNumBuckets();
    this.indexKeyField = config.getBucketIndexHashField();
    this.totalPartitionPaths = profile.getPartitionPaths().size();
    this.partitionPaths = new ArrayList<>(profile.getPartitionPaths());
    this.partitionPathOffset = new HashMap<>();
    // Each partition path claims a contiguous range of numBuckets Spark partitions.
    int offset = 0;
    for (Object partitionPath : profile.getPartitionPaths()) {
      partitionPathOffset.put(partitionPath.toString(), offset);
      offset += numBuckets;
    }
    this.updatePartitionPathFileIds = buildUpdatePartitionPathFileIds(profile);
  }

  /**
   * Collects, per partition path, the file groups that receive updates in this workload,
   * so that updates keep going to the file group they already belong to.
   */
  private static Map<String, Set<String>> buildUpdatePartitionPathFileIds(WorkloadProfile profile) {
    Map<String, Set<String>> result = new HashMap<>();
    for (Entry<String, WorkloadStat> partitionStat : profile.getPartitionPathStatMap().entrySet()) {
      Set<String> fileIds = result.computeIfAbsent(partitionStat.getKey(), k -> new HashSet<>());
      for (Entry<String, Pair<String, Long>> updateLocEntry
          : partitionStat.getValue().getUpdateLocationToCount().entrySet()) {
        fileIds.add(updateLocEntry.getKey());
      }
    }
    return result;
  }

  @Override
  public BucketInfo getBucketInfo(int bucketNumber) {
    // A global bucket number decomposes into (partition path index, bucket id within the path).
    String partitionPath = partitionPaths.get(bucketNumber / numBuckets);
    String bucketId = BucketIdentifier.bucketIdStr(bucketNumber % numBuckets);
    // An existing file group whose id starts with the bucket id means this bucket is an UPDATE.
    Option<String> fileIdOption = Option.fromJavaOptional(updatePartitionPathFileIds
        .getOrDefault(partitionPath, Collections.emptySet()).stream()
        .filter(fileId -> fileId.startsWith(bucketId))
        .findFirst());
    if (fileIdOption.isPresent()) {
      return new BucketInfo(BucketType.UPDATE, fileIdOption.get(), partitionPath);
    } else {
      // No file group yet for this bucket: inserts create one with a deterministic id prefix.
      return new BucketInfo(BucketType.INSERT, BucketIdentifier.newBucketFileIdPrefix(bucketId), partitionPath);
    }
  }

  @Override
  public int numPartitions() {
    return totalPartitionPaths * numBuckets;
  }

  @Override
  public int getPartition(Object key) {
    // Spark hands us the (HoodieKey, currentLocation) tuple emitted by the write RDD.
    @SuppressWarnings("unchecked")
    Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation =
        (Tuple2<HoodieKey, Option<HoodieRecordLocation>>) key;
    String partitionPath = keyLocation._1.getPartitionPath();
    Option<HoodieRecordLocation> location = keyLocation._2;
    // Tagged records reuse the bucket encoded in their file id; untagged records are hashed.
    int bucketId = location.isPresent()
        ? BucketIdentifier.bucketIdFromFileId(location.get().getFileId())
        : BucketIdentifier.getBucketId(keyLocation._1, indexKeyField, numBuckets);
    // NOTE(review): assumes every partition path seen here was present in the workload
    // profile; an unseen path would NPE on the offset lookup — confirm upstream guarantees.
    return partitionPathOffset.get(partitionPath) + bucketId;
  }
}

View File

@@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table.action.commit;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.WorkloadProfile;
import org.apache.spark.Partitioner;
/**
 * Base class for Spark partitioners used by Hudi commit actions. Adapts Spark's
 * {@link Partitioner} to Hudi's engine-agnostic
 * {@code org.apache.hudi.table.action.commit.Partitioner}, and lets executors look up,
 * per Spark partition number, the {@link BucketInfo} describing the target file group.
 */
public abstract class SparkHoodiePartitioner<T extends HoodieRecordPayload<T>> extends Partitioner
    implements org.apache.hudi.table.action.commit.Partitioner {

  /**
   * Stat for the current workload. Helps in determining inserts, upserts etc.
   */
  protected WorkloadProfile profile;

  // Table being written to; subclasses read index/layout details from it.
  protected final HoodieTable table;

  public SparkHoodiePartitioner(WorkloadProfile profile, HoodieTable table) {
    this.profile = profile;
    this.table = table;
  }

  @Override
  public int getNumPartitions() {
    // Bridge method: Hudi's Partitioner interface delegates to Spark's numPartitions().
    return numPartitions();
  }

  /**
   * Returns the bucket (target file group id, bucket type and partition path) that the
   * given Spark partition number maps to.
   */
  public abstract BucketInfo getBucketInfo(int bucketNumber);
}

View File

@@ -44,6 +44,6 @@ public class SparkInsertCommitActionExecutor<T extends HoodieRecordPayload<T>>
@Override
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, false);
config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, operationType);
}
}

View File

@@ -58,12 +58,14 @@ public class SparkInsertOverwriteCommitActionExecutor<T extends HoodieRecordPayl
@Override
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, false);
config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(), this, operationType);
}
@Override
protected Partitioner getPartitioner(WorkloadProfile profile) {
return new SparkInsertOverwritePartitioner(profile, context, table, config);
return table.getStorageLayout().layoutPartitionerClass()
.map(c -> getLayoutPartitioner(profile, c))
.orElse(new SparkInsertOverwritePartitioner(profile, context, table, config));
}
@Override

View File

@@ -44,6 +44,6 @@ public class SparkUpsertCommitActionExecutor<T extends HoodieRecordPayload<T>>
@Override
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, true);
config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(), this, operationType);
}
}

View File

@@ -37,7 +37,6 @@ import org.apache.hudi.table.WorkloadProfile;
import org.apache.hudi.table.WorkloadStat;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
@@ -57,7 +56,7 @@ import scala.Tuple2;
/**
* Packs incoming records to be upserted, into buckets (1 bucket = 1 RDD partition).
*/
public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends Partitioner {
public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends SparkHoodiePartitioner<T> {
private static final Logger LOG = LogManager.getLogger(UpsertPartitioner.class);
@@ -69,10 +68,6 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends Partiti
* Total number of RDD partitions, is determined by total buckets we want to pack the incoming workload into.
*/
private int totalBuckets = 0;
/**
* Stat for the current workload. Helps in determining inserts, upserts etc.
*/
private WorkloadProfile profile;
/**
* Helps decide which bucket an incoming update should go to.
*/
@@ -86,17 +81,14 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends Partiti
*/
private HashMap<Integer, BucketInfo> bucketInfoMap;
protected final HoodieTable table;
protected final HoodieWriteConfig config;
public UpsertPartitioner(WorkloadProfile profile, HoodieEngineContext context, HoodieTable table,
HoodieWriteConfig config) {
super(profile, table);
updateLocationToBucket = new HashMap<>();
partitionPathToInsertBucketInfos = new HashMap<>();
bucketInfoMap = new HashMap<>();
this.profile = profile;
this.table = table;
this.config = config;
assignUpdates(profile);
assignInserts(profile, context);

View File

@@ -74,8 +74,8 @@ public abstract class AbstractSparkDeltaCommitActionExecutor<T extends HoodieRec
public Iterator<List<WriteStatus>> handleUpdate(String partitionPath, String fileId,
Iterator<HoodieRecord<T>> recordItr) throws IOException {
LOG.info("Merging updates for commit " + instantTime + " for file " + fileId);
if (!table.getIndex().canIndexLogFiles() && mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
if (!table.getIndex().canIndexLogFiles() && mergeOnReadUpsertPartitioner != null
&& mergeOnReadUpsertPartitioner.getSmallFileIds().contains(fileId)) {
LOG.info("Small file corrections for updates for commit " + instantTime + " for file " + fileId);
return super.handleUpdate(partitionPath, fileId, recordItr);
} else {

View File

@@ -45,6 +45,6 @@ public class SparkInsertDeltaCommitActionExecutor<T extends HoodieRecordPayload<
@Override
public HoodieWriteMetadata<JavaRDD<WriteStatus>> execute() {
return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(),this, false);
config.shouldCombineBeforeInsert(), config.getInsertShuffleParallelism(),this, operationType);
}
}

View File

@@ -44,6 +44,6 @@ public class SparkUpsertDeltaCommitActionExecutor<T extends HoodieRecordPayload<
@Override
public HoodieWriteMetadata execute() {
return SparkWriteHelper.newInstance().write(instantTime, inputRecordsRDD, context, table,
config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(),this, true);
config.shouldCombineBeforeUpsert(), config.getUpsertShuffleParallelism(),this, operationType);
}
}

View File

@@ -36,6 +36,7 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieLayoutConfig;
import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;
@@ -44,6 +45,7 @@ import org.apache.hudi.metadata.HoodieTableMetadataWriter;
import org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.commit.SparkBucketIndexPartitioner;
import org.apache.hudi.testutils.Assertions;
import org.apache.hudi.testutils.HoodieClientTestHarness;
import org.apache.hudi.testutils.HoodieSparkWriteableTestTable;
@@ -89,7 +91,8 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
{IndexType.SIMPLE, true},
{IndexType.GLOBAL_SIMPLE, true},
{IndexType.SIMPLE, false},
{IndexType.GLOBAL_SIMPLE, false}
{IndexType.GLOBAL_SIMPLE, false},
{IndexType.BUCKET, false}
};
return Stream.of(data).map(Arguments::of);
}
@@ -112,11 +115,16 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
initFileSystem();
metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? new Properties()
: getPropertiesForKeyGen());
HoodieIndexConfig.Builder indexBuilder = HoodieIndexConfig.newBuilder().withIndexType(indexType)
.fromProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen())
.withIndexType(indexType);
config = getConfigBuilder()
.withProperties(populateMetaFields ? new Properties() : getPropertiesForKeyGen())
.withRollbackUsingMarkers(rollbackUsingMarkers)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType)
.build()).withAutoCommit(false).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadata).build()).build();
.withIndexConfig(indexBuilder
.build()).withAutoCommit(false).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadata).build())
.withLayoutConfig(HoodieLayoutConfig.newBuilder().fromProperties(indexBuilder.build().getProps())
.withLayoutPartitioner(SparkBucketIndexPartitioner.class.getName()).build()).build();
writeClient = getHoodieWriteClient(config);
this.index = writeClient.getIndex();
}
@@ -239,7 +247,7 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
// Insert 200 records
JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
Assertions.assertNoWriteErrors(writeStatues.collect());
List<String> fileIds = writeStatues.map(WriteStatus::getFileId).collect();
// commit this upsert
writeClient.commit(newCommitTime, writeStatues);
HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
@@ -249,7 +257,6 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
assert (javaRDD.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == totalRecords);
// check tagged records are tagged with correct fileIds
List<String> fileIds = writeStatues.map(WriteStatus::getFileId).collect();
assert (javaRDD.filter(record -> record.getCurrentLocation().getFileId() == null).collect().size() == 0);
List<String> taggedFileIds = javaRDD.map(record -> record.getCurrentLocation().getFileId()).distinct().collect();
@@ -474,7 +481,6 @@ public class TestHoodieIndex extends HoodieClientTestHarness {
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().hfileMaxFileSize(1024 * 1024).parquetMaxFileSize(1024 * 1024).build())
.forTable("test-trip-table")
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build())
.withEmbeddedTimelineServerEnabled(true).withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
.withStorageType(FileSystemViewStorageType.EMBEDDED_KV_STORE).build());
}

View File

@@ -32,6 +32,7 @@ import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.index.HoodieIndex.IndexType;
import org.apache.hudi.index.bloom.HoodieBloomIndex;
import org.apache.hudi.index.bloom.HoodieGlobalBloomIndex;
import org.apache.hudi.index.bucket.HoodieBucketIndex;
import org.apache.hudi.index.hbase.SparkHoodieHBaseIndex;
import org.apache.hudi.index.inmemory.HoodieInMemoryHashIndex;
import org.apache.hudi.index.simple.HoodieSimpleIndex;
@@ -60,7 +61,7 @@ public class TestHoodieIndexConfigs {
}
@ParameterizedTest
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE", "HBASE"})
@EnumSource(value = IndexType.class, names = {"BLOOM", "GLOBAL_BLOOM", "SIMPLE", "GLOBAL_SIMPLE", "HBASE", "BUCKET"})
public void testCreateIndex(IndexType indexType) throws Exception {
HoodieWriteConfig config;
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();
@@ -93,6 +94,11 @@ public class TestHoodieIndexConfigs {
.build();
assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof SparkHoodieHBaseIndex);
break;
case BUCKET:
config = clientConfigBuilder.withPath(basePath)
.withIndexConfig(indexConfigBuilder.withIndexType(IndexType.BUCKET).build()).build();
assertTrue(SparkHoodieIndexFactory.createIndex(config) instanceof HoodieBucketIndex);
break;
default:
// no -op. just for checkstyle errors
}

View File

@@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.index.bucket;
import java.util.Arrays;
import java.util.List;

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.keygen.KeyGenUtils;
import org.apache.hudi.testutils.KeyGeneratorTestUtilities;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Unit tests for {@link BucketIdentifier}: bucket-id/file-id round-trips and bucket id
 * computation for simple and complex record keys.
 */
public class TestBucketIdentifier {

  @Test
  public void testBucketFileId() {
    // bucket id -> file id prefix -> bucket id must round-trip for a range of ids
    for (int i = 0; i < 1000; i++) {
      String bucketId = BucketIdentifier.bucketIdStr(i);
      String fileId = BucketIdentifier.newBucketFileIdPrefix(bucketId);
      // Java `assert` is a no-op without -ea; use JUnit assertions so failures always surface.
      assertEquals(i, BucketIdentifier.bucketIdFromFileId(fileId));
    }
  }

  @Test
  public void testBucketIdWithSimpleRecordKey() {
    String recordKeyField = "_row_key";
    String indexKeyField = "_row_key";
    GenericRecord record = KeyGeneratorTestUtilities.getRecord();
    HoodieRecord hoodieRecord = new HoodieRecord(
        new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField), ""), null);
    int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8);
    // Bucket derived from the record must equal hashing the raw index key value directly.
    assertEquals(BucketIdentifier.getBucketId(
        Arrays.asList(record.get(indexKeyField).toString()), 8), bucketId);
  }

  @Test
  public void testBucketIdWithComplexRecordKey() {
    // Complex key: the record key is built from two fields, but only the index key
    // field ("_row_key") participates in the bucket hash.
    List<String> recordKeyField = Arrays.asList("_row_key", "ts_ms");
    String indexKeyField = "_row_key";
    GenericRecord record = KeyGeneratorTestUtilities.getRecord();
    HoodieRecord hoodieRecord = new HoodieRecord(
        new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField), ""), null);
    int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8);
    assertEquals(BucketIdentifier.getBucketId(
        Arrays.asList(record.get(indexKeyField).toString()), 8), bucketId);
  }
}

View File

@@ -0,0 +1,147 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.index.bucket;
import org.apache.avro.Schema;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.RawTripTestPayload;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.testutils.HoodieClientTestHarness;
import org.apache.hudi.testutils.HoodieSparkWriteableTestTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import java.util.Properties;
import java.util.UUID;
import static org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Tests for {@link HoodieBucketIndex}: config validation and location tagging against
 * file groups written into deterministic hash buckets.
 */
public class TestHoodieBucketIndex extends HoodieClientTestHarness {

  private static final Logger LOG = LogManager.getLogger(TestHoodieBucketIndex.class);
  private static final Schema SCHEMA = getSchemaFromResource(TestHoodieBucketIndex.class, "/exampleSchema.avsc", true);
  private static final int NUM_BUCKET = 8;

  @BeforeEach
  public void setUp() throws Exception {
    initSparkContexts();
    initPath();
    initFileSystem();
    // We have some records to be tagged (two different partitions)
    initMetaClient();
  }

  @AfterEach
  public void tearDown() throws Exception {
    cleanupResources();
  }

  @Test
  public void testBucketIndexValidityCheck() {
    // Hash field "_row_key" is rejected at config-build time, while "uuid" is accepted
    // (presumably because "uuid" matches the default record key field — confirm against
    // HoodieIndexConfig's bucket-index validation).
    Properties props = new Properties();
    props.setProperty(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key(), "_row_key");
    assertThrows(HoodieIndexException.class, () ->
        HoodieIndexConfig.newBuilder().fromProperties(props)
            .withIndexType(HoodieIndex.IndexType.BUCKET).withBucketNum("8").build());
    props.setProperty(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key(), "uuid");
    HoodieIndexConfig.newBuilder().fromProperties(props)
        .withIndexType(HoodieIndex.IndexType.BUCKET).withBucketNum("8").build();
  }

  @Test
  public void testTagLocation() throws Exception {
    String rowKey1 = UUID.randomUUID().toString();
    String rowKey2 = UUID.randomUUID().toString();
    String rowKey3 = UUID.randomUUID().toString();
    HoodieRecord record1 = newRecord("{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
    HoodieRecord record2 = newRecord("{\"_row_key\":\"" + rowKey2 + "\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}");
    HoodieRecord record3 = newRecord("{\"_row_key\":\"" + rowKey3 + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}");
    // record4 reuses record1's key but lives in a different (2015) partition path.
    HoodieRecord record4 = newRecord("{\"_row_key\":\"" + rowKey1 + "\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}");
    JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));

    HoodieWriteConfig config = makeConfig();
    HoodieTable table = HoodieSparkTable.create(config, context, metaClient);
    HoodieBucketIndex bucketIndex = new HoodieBucketIndex(config);

    // Before any commit exists, no record can be tagged with a location.
    HoodieData<HoodieRecord> taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context, table);
    assertFalse(taggedRecordRDD.collectAsList().stream().anyMatch(HoodieRecord::isCurrentLocationKnown));

    // Write the first three records into their own hash buckets under 2016/01/31.
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(table, SCHEMA);
    testTable.addCommit("001").withInserts("2016/01/31", getRecordFileId(record1), record1);
    testTable.addCommit("002").withInserts("2016/01/31", getRecordFileId(record2), record2);
    testTable.addCommit("003").withInserts("2016/01/31", getRecordFileId(record3), record3);

    taggedRecordRDD = bucketIndex.tagLocation(HoodieJavaRDD.of(recordRDD), context,
        HoodieSparkTable.create(config, context, metaClient));
    // Every tagged record must point at the file group of its own hash bucket.
    assertTrue(taggedRecordRDD.collectAsList().stream()
        .filter(HoodieRecord::isCurrentLocationKnown)
        .allMatch(r -> BucketIdentifier.bucketIdFromFileId(r.getCurrentLocation().getFileId())
            == getRecordBucketId(r)));
    // Exactly one record (record4, in the yet-unwritten 2015 partition) stays untagged.
    assertTrue(taggedRecordRDD.collectAsList().stream()
        .filter(r -> r.getPartitionPath().equals("2015/01/31") && !r.isCurrentLocationKnown())
        .count() == 1L);
  }

  /** Builds a HoodieRecord whose key and partition path come from the raw trip payload. */
  private static HoodieRecord newRecord(String recordStr) throws Exception {
    RawTripTestPayload payload = new RawTripTestPayload(recordStr);
    return new HoodieRecord(new HoodieKey(payload.getRowKey(), payload.getPartitionPath()), payload);
  }

  /** Write config with a bucket index hashing on "_row_key" into NUM_BUCKET buckets. */
  private HoodieWriteConfig makeConfig() {
    Properties props = new Properties();
    props.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "_row_key");
    return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(SCHEMA.toString())
        .withIndexConfig(HoodieIndexConfig.newBuilder().fromProperties(props)
            .withIndexType(HoodieIndex.IndexType.BUCKET)
            .withIndexKeyField("_row_key")
            .withBucketNum(String.valueOf(NUM_BUCKET)).build()).build();
  }

  /** File id (bucket id string) the record's hash bucket maps to. */
  private String getRecordFileId(HoodieRecord record) {
    return BucketIdentifier.bucketIdStr(
        BucketIdentifier.getBucketId(record, "_row_key", NUM_BUCKET));
  }

  /** Numeric hash bucket of the record under the test's bucket count. */
  private int getRecordBucketId(HoodieRecord record) {
    return BucketIdentifier
        .getBucketId(record, "_row_key", NUM_BUCKET);
  }
}

View File

@@ -33,14 +33,19 @@ import org.apache.hudi.common.testutils.Transformations;
import org.apache.hudi.common.util.BaseFileUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieLayoutConfig;
import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.hadoop.HoodieParquetInputFormat;
import org.apache.hudi.hadoop.utils.HoodieHiveUtils;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.io.HoodieCreateHandle;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.hudi.table.HoodieSparkCopyOnWriteTable;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.storage.HoodieStorageLayout;
import org.apache.hudi.testutils.HoodieClientTestBase;
import org.apache.hudi.testutils.MetadataMergeWriteStatus;
@@ -58,6 +63,8 @@ import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.File;
@@ -67,7 +74,9 @@ import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
import java.util.stream.Stream;
import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
import static org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime;
@@ -83,6 +92,13 @@ public class TestCopyOnWriteActionExecutor extends HoodieClientTestBase {
private static final Logger LOG = LogManager.getLogger(TestCopyOnWriteActionExecutor.class);
private static final Schema SCHEMA = getSchemaFromResource(TestCopyOnWriteActionExecutor.class, "/exampleSchema.avsc");
private static final Stream<Arguments> indexType() {
HoodieIndex.IndexType[] data = new HoodieIndex.IndexType[] {
HoodieIndex.IndexType.BLOOM,
HoodieIndex.IndexType.BUCKET
};
return Stream.of(data).map(Arguments::of);
}
@Test
public void testMakeNewPath() {
@@ -118,11 +134,29 @@ public class TestCopyOnWriteActionExecutor extends HoodieClientTestBase {
.withRemoteServerPort(timelineServicePort).build());
}
private Properties makeIndexConfig(HoodieIndex.IndexType indexType) {
Properties props = new Properties();
HoodieIndexConfig.Builder indexConfig = HoodieIndexConfig.newBuilder()
.withIndexType(indexType);
props.putAll(indexConfig.build().getProps());
if (indexType.equals(HoodieIndex.IndexType.BUCKET)) {
props.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "_row_key");
indexConfig.fromProperties(props).withIndexKeyField("_row_key").withBucketNum("1");
props.putAll(indexConfig.build().getProps());
props.putAll(HoodieLayoutConfig.newBuilder().fromProperties(props)
.withLayoutType(HoodieStorageLayout.LayoutType.BUCKET.name())
.withLayoutPartitioner(SparkBucketIndexPartitioner.class.getName()).build().getProps());
}
return props;
}
// TODO (weiy): Add testcases for crossing file writing.
@Test
public void testUpdateRecords() throws Exception {
@ParameterizedTest
@MethodSource("indexType")
public void testUpdateRecords(HoodieIndex.IndexType indexType) throws Exception {
// Prepare the AvroParquetIO
HoodieWriteConfig config = makeHoodieClientConfig();
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withProps(makeIndexConfig(indexType)).build();
String firstCommitTime = makeNewCommitTime();
SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
writeClient.startCommitWithTime(firstCommitTime);
@@ -168,7 +202,6 @@ public class TestCopyOnWriteActionExecutor extends HoodieClientTestBase {
GenericRecord newRecord;
int index = 0;
for (GenericRecord record : fileRecords) {
//System.out.println("Got :" + record.get("_row_key").toString() + ", Exp :" + records.get(index).getRecordKey());
assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString());
index++;
}