1
0

[HUDI-1951] Add bucket hash index, compatible with the hive bucket (#3173)

* [HUDI-2154] Add index key field to HoodieKey

* [HUDI-2157] Add the bucket index and its read/write implementation of Spark engine.
* revert HUDI-2154 add index key field to HoodieKey
* fix all comments and introduce a new tricky way to get index key at runtime
* support double insert for bucket index
* revert spark read optimizer based on bucket index
* add the storage layout
* index tag, hash function and add ut
* fix ut
* address partial comments
* Code review feedback
* add layout config and docs
* fix ut
* rename hoodie.layout and rebase master

Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
Shawy Geng
2021-12-31 04:38:26 +08:00
committed by GitHub
parent 0f0088fe4b
commit a4e622ac61
46 changed files with 1335 additions and 47 deletions

View File

@@ -70,6 +70,9 @@ public class HiveSyncConfig implements Serializable {
+ "org.apache.hudi input format.")
public Boolean usePreApacheInputFormat = false;
@Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore", required = false)
public String bucketSpec;
@Deprecated
@Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
public Boolean useJdbc = true;
@@ -135,6 +138,7 @@ public class HiveSyncConfig implements Serializable {
newConfig.partitionValueExtractorClass = cfg.partitionValueExtractorClass;
newConfig.jdbcUrl = cfg.jdbcUrl;
newConfig.tableName = cfg.tableName;
newConfig.bucketSpec = cfg.bucketSpec;
newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat;
newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
newConfig.supportTimestamp = cfg.supportTimestamp;
@@ -155,6 +159,7 @@ public class HiveSyncConfig implements Serializable {
return "HiveSyncConfig{"
+ "databaseName='" + databaseName + '\''
+ ", tableName='" + tableName + '\''
+ ", bucketSpec='" + bucketSpec + '\''
+ ", baseFileFormat='" + baseFileFormat + '\''
+ ", hiveUser='" + hiveUser + '\''
+ ", hivePass='" + hivePass + '\''
@@ -181,4 +186,8 @@ public class HiveSyncConfig implements Serializable {
+ ", isConditionalSync=" + isConditionalSync
+ '}';
}
/**
 * Builds a Hive bucket spec clause for table DDL,
 * e.g. {@code CLUSTERED BY (id) INTO 4 BUCKETS}.
 *
 * @param bucketCols comma-separated bucket column names
 * @param bucketNum  number of buckets
 * @return the Hive {@code CLUSTERED BY} clause
 */
public static String getBucketSpec(String bucketCols, int bucketNum) {
  // Close the column-list parenthesis; without it the generated DDL
  // ("CLUSTERED BY (cols INTO n BUCKETS") is invalid Hive syntax.
  return "CLUSTERED BY (" + bucketCols + ") INTO " + bucketNum + " BUCKETS";
}
}

View File

@@ -471,6 +471,9 @@ public class HiveSchemaUtil {
if (!config.partitionFields.isEmpty()) {
sb.append(" PARTITIONED BY (").append(partitionsStr).append(")");
}
if (config.bucketSpec != null) {
sb.append(' ' + config.bucketSpec + ' ');
}
sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'");
if (serdeProperties != null && !serdeProperties.isEmpty()) {
sb.append(" WITH SERDEPROPERTIES (").append(propertyToString(serdeProperties)).append(")");