
[HUDI-4101] BucketIndexPartitioner should take partition path for better dispersion (#5590)

Danny Chan
2022-05-17 10:34:57 +08:00
committed by GitHub
parent fdd96cc97e
commit d52d13302d
4 changed files with 16 additions and 8 deletions
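In short: the bucket-index partitioner previously routed each record by its bucket id alone, i.e. subtask = bucketId mod numPartitions, so records with the same bucket id from every partition piled onto the same write subtask. This patch routes by the whole HoodieKey and mixes the partition path into the hash. A minimal, self-contained sketch of the before/after routing follows; it is illustrative only (Math.floorMod stands in for BucketIdentifier.mod, and the partition paths and counts are made up):

// Illustrative sketch only -- not Hudi code. Shows why hashing the partition
// path alongside the bucket id spreads work across subtasks.
public class DispersionSketch {
  public static void main(String[] args) {
    int numPartitions = 4; // number of write subtasks (made-up value)
    int bucketId = 3;      // bucket derived from the record key's index fields

    for (String partitionPath : new String[] {"par1", "par2", "par3", "par4"}) {
      // Old routing: the partition path is ignored, so bucket 3 of every
      // partition lands on the same subtask.
      int oldSubtask = Math.floorMod(bucketId, numPartitions);

      // New routing, mirroring the patched partition():
      // (key.getPartitionPath() + curBucket).hashCode() & Integer.MAX_VALUE
      int globalHash = (partitionPath + bucketId).hashCode() & Integer.MAX_VALUE;
      int newSubtask = Math.floorMod(globalHash, numPartitions);

      System.out.printf("%s -> old subtask %d, new subtask %d%n",
          partitionPath, oldSubtask, newSubtask);
    }
  }
}

Under the old scheme every line prints subtask 3; under the new one the target subtask also depends on the partition path, so a hot bucket no longer funnels all partitions onto a single writer.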

BucketIdentifier.java

@@ -73,7 +73,7 @@ public class BucketIdentifier implements Serializable {
         .map(p -> p.split(":"))
         .collect(Collectors.toMap(p -> p[0], p -> p[1]));
     return indexKeyFields.stream()
-        .map(f -> recordKeyPairs.get(f)).collect(Collectors.toList());
+        .map(recordKeyPairs::get).collect(Collectors.toList());
   }
 
   public static String partitionBucketIdStr(String partition, int bucketId) {

RowDataKeyGen.java

@@ -18,6 +18,7 @@
 package org.apache.hudi.sink.bulk;
 
+import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.PartitionPathEncodeUtils;
 import org.apache.hudi.common.util.StringUtils;
@@ -127,6 +128,10 @@ public class RowDataKeyGen implements Serializable {
         keyGeneratorOpt);
   }
 
+  public HoodieKey getHoodieKey(RowData rowData) {
+    return new HoodieKey(getRecordKey(rowData), getPartitionPath(rowData));
+  }
+
   public String getRecordKey(RowData rowData) {
     if (this.simpleRecordKey) {
       return getRecordKey(recordKeyFieldGetter.getFieldOrNull(rowData), this.recordKeyFields[0]);

BucketIndexPartitioner.java

@@ -18,6 +18,7 @@
 package org.apache.hudi.sink.partitioner;
 
+import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.index.bucket.BucketIdentifier;
 
 import org.apache.flink.api.common.functions.Partitioner;
@@ -28,7 +29,7 @@ import org.apache.flink.api.common.functions.Partitioner;
  *
  * @param <T> The type of obj to hash
  */
-public class BucketIndexPartitioner<T extends String> implements Partitioner<T> {
+public class BucketIndexPartitioner<T extends HoodieKey> implements Partitioner<T> {
 
   private final int bucketNum;
   private final String indexKeyFields;
@@ -39,8 +40,9 @@ public class BucketIndexPartitioner<T extends String> implements Partitioner<T> {
   }
 
   @Override
-  public int partition(String key, int numPartitions) {
+  public int partition(HoodieKey key, int numPartitions) {
     int curBucket = BucketIdentifier.getBucketId(key, indexKeyFields, bucketNum);
-    return BucketIdentifier.mod(curBucket, numPartitions);
+    int globalHash = (key.getPartitionPath() + curBucket).hashCode() & Integer.MAX_VALUE;
+    return BucketIdentifier.mod(globalHash, numPartitions);
   }
 }
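Because the partitioner's type bound is now HoodieKey rather than String, callers must hand it the full key instead of the record-key string: in the Pipelines changes below the partitionCustom key selectors become keyGen::getHoodieKey (backed by the new RowDataKeyGen#getHoodieKey above) and HoodieRecord::getKey.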

Pipelines.java

@@ -18,6 +18,7 @@
 package org.apache.hudi.sink.utils;
 
+import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.configuration.FlinkOptions;
 import org.apache.hudi.configuration.OptionsResolver;
@@ -96,13 +97,13 @@ public class Pipelines {
     String indexKeys = conf.getString(FlinkOptions.INDEX_KEY_FIELD);
     int numBuckets = conf.getInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS);
-    BucketIndexPartitioner<String> partitioner = new BucketIndexPartitioner<>(numBuckets, indexKeys);
+    BucketIndexPartitioner<HoodieKey> partitioner = new BucketIndexPartitioner<>(numBuckets, indexKeys);
     RowDataKeyGen keyGen = RowDataKeyGen.instance(conf, rowType);
     RowType rowTypeWithFileId = BucketBulkInsertWriterHelper.rowTypeWithFileId(rowType);
     InternalTypeInfo<RowData> typeInfo = InternalTypeInfo.of(rowTypeWithFileId);
     Map<String, String> bucketIdToFileId = new HashMap<>();
-    dataStream = dataStream.partitionCustom(partitioner, keyGen::getRecordKey)
+    dataStream = dataStream.partitionCustom(partitioner, keyGen::getHoodieKey)
         .map(record -> BucketBulkInsertWriterHelper.rowWithFileId(bucketIdToFileId, keyGen, record, indexKeys, numBuckets), typeInfo)
         .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS)); // same parallelism as write task to avoid shuffle
     if (conf.getBoolean(FlinkOptions.WRITE_BULK_INSERT_SORT_INPUT)) {
@@ -319,8 +320,8 @@ public class Pipelines {
     WriteOperatorFactory<HoodieRecord> operatorFactory = BucketStreamWriteOperator.getFactory(conf);
     int bucketNum = conf.getInteger(FlinkOptions.BUCKET_INDEX_NUM_BUCKETS);
     String indexKeyFields = conf.getString(FlinkOptions.INDEX_KEY_FIELD);
-    BucketIndexPartitioner<String> partitioner = new BucketIndexPartitioner<>(bucketNum, indexKeyFields);
-    return dataStream.partitionCustom(partitioner, HoodieRecord::getRecordKey)
+    BucketIndexPartitioner<HoodieKey> partitioner = new BucketIndexPartitioner<>(bucketNum, indexKeyFields);
+    return dataStream.partitionCustom(partitioner, HoodieRecord::getKey)
        .transform("bucket_write", TypeInformation.of(Object.class), operatorFactory)
        .uid("uid_bucket_write" + conf.getString(FlinkOptions.TABLE_NAME))
        .setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS));