1
0

[HUDI-1951] Add bucket hash index, compatible with the hive bucket (#3173)

* [HUDI-2154] Add index key field to HoodieKey

* [HUDI-2157] Add the bucket index and its read/write implementation of Spark engine.
* revert HUDI-2154 add index key field to HoodieKey
* fix all comments and introduce a new tricky way to get index key at runtime
* support double insert for bucket index
* revert spark read optimizer based on bucket index
* add the storage layout
* index tag, hash function and add ut
* fix ut
* address partial comments
* Code review feedback
* add layout config and docs
* fix ut
* rename hoodie.layout and rebase master

Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
Shawy Geng
2021-12-31 04:38:26 +08:00
committed by GitHub
parent 0f0088fe4b
commit a4e622ac61
46 changed files with 1335 additions and 47 deletions

View File

@@ -44,7 +44,6 @@ import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.exception.TableNotFoundException;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor;
import org.apache.hudi.index.HoodieIndex.IndexType;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.util.DataTypeUtils;
import org.apache.log4j.LogManager;
@@ -185,7 +184,6 @@ public class DataSourceUtils {
}
return builder.forTable(tblName)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.withPayloadClass(parameters.get(DataSourceWriteOptions.PAYLOAD_CLASS_NAME().key()))
.withInlineCompaction(inlineCompact).build())
@@ -309,6 +307,10 @@ public class DataSourceUtils {
DataSourceWriteOptions.HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE().defaultValue()));
hiveSyncConfig.supportTimestamp = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_SUPPORT_TIMESTAMP_TYPE().key(),
DataSourceWriteOptions.HIVE_SUPPORT_TIMESTAMP_TYPE().defaultValue()));
hiveSyncConfig.bucketSpec = props.getBoolean(DataSourceWriteOptions.HIVE_SYNC_BUCKET_SYNC().key(),
(boolean) DataSourceWriteOptions.HIVE_SYNC_BUCKET_SYNC().defaultValue())
? HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()),
props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key())) : null;
return hiveSyncConfig;
}

View File

@@ -526,6 +526,12 @@ object DataSourceWriteOptions {
.noDefaultValue()
.withDocumentation("Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.")
val HIVE_SYNC_BUCKET_SYNC: ConfigProperty[Boolean] = ConfigProperty
.key("hoodie.datasource.hive_sync.bucket_sync")
.defaultValue(false)
.withDocumentation("Whether sync hive metastore bucket specification when using bucket index." +
"The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'")
// Async Compaction - Enabled by default for MOR
val ASYNC_COMPACT_ENABLE: ConfigProperty[String] = ConfigProperty
.key("hoodie.datasource.compaction.async.enable")