1
0

[HUDI-1951] Add bucket hash index, compatible with the hive bucket (#3173)

* [HUDI-2154] Add index key field to HoodieKey

* [HUDI-2157] Add the bucket index and its read/write implementation of Spark engine.
* revert HUDI-2154 add index key field to HoodieKey
* fix all comments and introduce a new tricky way to get index key at runtime
* support double insert for bucket index
* revert spark read optimizer based on bucket index
* add the storage layout
* index tag, hash function and add ut
* fix ut
* address partial comments
* Code review feedback
* add layout config and docs
* fix ut
* rename hoodie.layout and rebase master

Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
Shawy Geng
2021-12-31 04:38:26 +08:00
committed by GitHub
parent 0f0088fe4b
commit a4e622ac61
46 changed files with 1335 additions and 47 deletions

View File

@@ -70,6 +70,9 @@ public class HiveSyncConfig implements Serializable {
+ "org.apache.hudi input format.")
public Boolean usePreApacheInputFormat = false;
@Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore", required = false)
public String bucketSpec;
@Deprecated
@Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
public Boolean useJdbc = true;
@@ -135,6 +138,7 @@ public class HiveSyncConfig implements Serializable {
newConfig.partitionValueExtractorClass = cfg.partitionValueExtractorClass;
newConfig.jdbcUrl = cfg.jdbcUrl;
newConfig.tableName = cfg.tableName;
newConfig.bucketSpec = cfg.bucketSpec;
newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat;
newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
newConfig.supportTimestamp = cfg.supportTimestamp;
@@ -155,6 +159,7 @@ public class HiveSyncConfig implements Serializable {
return "HiveSyncConfig{"
+ "databaseName='" + databaseName + '\''
+ ", tableName='" + tableName + '\''
+ ", bucketSpec='" + bucketSpec + '\''
+ ", baseFileFormat='" + baseFileFormat + '\''
+ ", hiveUser='" + hiveUser + '\''
+ ", hivePass='" + hivePass + '\''
@@ -181,4 +186,8 @@ public class HiveSyncConfig implements Serializable {
+ ", isConditionalSync=" + isConditionalSync
+ '}';
}
/**
 * Builds a Hive bucket spec clause for table DDL,
 * e.g. {@code CLUSTERED BY (id) INTO 4 BUCKETS}.
 *
 * @param bucketCols comma-separated bucket column names
 * @param bucketNum  number of buckets
 * @return the Hive {@code CLUSTERED BY} clause
 */
public static String getBucketSpec(String bucketCols, int bucketNum) {
  // Close the column-list parenthesis; without it the generated DDL
  // ("CLUSTERED BY (cols INTO n BUCKETS") is invalid Hive syntax.
  return "CLUSTERED BY (" + bucketCols + ") INTO " + bucketNum + " BUCKETS";
}
}

View File

@@ -471,6 +471,9 @@ public class HiveSchemaUtil {
if (!config.partitionFields.isEmpty()) {
sb.append(" PARTITIONED BY (").append(partitionsStr).append(")");
}
if (config.bucketSpec != null) {
sb.append(' ' + config.bucketSpec + ' ');
}
sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'");
if (serdeProperties != null && !serdeProperties.isEmpty()) {
sb.append(" WITH SERDEPROPERTIES (").append(propertyToString(serdeProperties)).append(")");