[HUDI-1951] Add bucket hash index, compatible with the hive bucket (#3173)
* [HUDI-2154] Add index key field to HoodieKey * [HUDI-2157] Add the bucket index and its read/write implementation of Spark engine. * revert HUDI-2154 add index key field to HoodieKey * fix all comments and introduce a new tricky way to get index key at runtime support double insert for bucket index * revert spark read optimizer based on bucket index * add the storage layout * index tag, hash function and add ut * fix ut * address partial comments * Code review feedback * add layout config and docs * fix ut * rename hoodie.layout and rebase master Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
@@ -70,6 +70,9 @@ public class HiveSyncConfig implements Serializable {
|
||||
+ "org.apache.hudi input format.")
|
||||
public Boolean usePreApacheInputFormat = false;
|
||||
|
||||
@Parameter(names = {"--bucket-spec"}, description = "bucket spec stored in metastore", required = false)
|
||||
public String bucketSpec;
|
||||
|
||||
@Deprecated
|
||||
@Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
|
||||
public Boolean useJdbc = true;
|
||||
@@ -135,6 +138,7 @@ public class HiveSyncConfig implements Serializable {
|
||||
newConfig.partitionValueExtractorClass = cfg.partitionValueExtractorClass;
|
||||
newConfig.jdbcUrl = cfg.jdbcUrl;
|
||||
newConfig.tableName = cfg.tableName;
|
||||
newConfig.bucketSpec = cfg.bucketSpec;
|
||||
newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat;
|
||||
newConfig.useFileListingFromMetadata = cfg.useFileListingFromMetadata;
|
||||
newConfig.supportTimestamp = cfg.supportTimestamp;
|
||||
@@ -155,6 +159,7 @@ public class HiveSyncConfig implements Serializable {
|
||||
return "HiveSyncConfig{"
|
||||
+ "databaseName='" + databaseName + '\''
|
||||
+ ", tableName='" + tableName + '\''
|
||||
+ ", bucketSpec='" + bucketSpec + '\''
|
||||
+ ", baseFileFormat='" + baseFileFormat + '\''
|
||||
+ ", hiveUser='" + hiveUser + '\''
|
||||
+ ", hivePass='" + hivePass + '\''
|
||||
@@ -181,4 +186,8 @@ public class HiveSyncConfig implements Serializable {
|
||||
+ ", isConditionalSync=" + isConditionalSync
|
||||
+ '}';
|
||||
}
|
||||
|
||||
public static String getBucketSpec(String bucketCols, int bucketNum) {
|
||||
return "CLUSTERED BY (" + bucketCols + " INTO " + bucketNum + " BUCKETS";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -471,6 +471,9 @@ public class HiveSchemaUtil {
|
||||
if (!config.partitionFields.isEmpty()) {
|
||||
sb.append(" PARTITIONED BY (").append(partitionsStr).append(")");
|
||||
}
|
||||
if (config.bucketSpec != null) {
|
||||
sb.append(' ' + config.bucketSpec + ' ');
|
||||
}
|
||||
sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'");
|
||||
if (serdeProperties != null && !serdeProperties.isEmpty()) {
|
||||
sb.append(" WITH SERDEPROPERTIES (").append(propertyToString(serdeProperties)).append(")");
|
||||
|
||||
Reference in New Issue
Block a user