1
0

[HUDI-4210] Create custom hbase index to solve data skew issue on hbase regions (#5797)

This commit is contained in:
冯健
2022-07-26 18:09:17 +08:00
committed by GitHub
parent 1ea1e659c2
commit e5faf2cc84
4 changed files with 53 additions and 3 deletions

View File

@@ -183,6 +183,10 @@ public class HoodieHBaseIndexConfig extends HoodieConfig {
.noDefaultValue()
.withDocumentation("The value of hbase.master.kerberos.principal in hbase cluster.");
/**
 * Number of buckets used by {@code RebalancedSparkHoodieHBaseIndex} to spread row keys
 * evenly across hbase regions; ignored by other index implementations.
 */
public static final ConfigProperty<Integer> BUCKET_NUMBER = ConfigProperty
    .key("hoodie.index.hbase.bucket.number")
    .defaultValue(8)
    .withDocumentation("Only applicable when using RebalancedSparkHoodieHBaseIndex. "
        + "Setting this to the number of hbase regions gives the best performance.");
/**
* @deprecated Use {@link #ZKQUORUM} and its methods instead

View File

@@ -1553,6 +1553,10 @@ public class HoodieWriteConfig extends HoodieConfig {
return getBooleanOrDefault(HoodieHBaseIndexConfig.UPDATE_PARTITION_PATH_ENABLE);
}
/**
 * Returns the configured bucket count ({@code hoodie.index.hbase.bucket.number}) used by
 * RebalancedSparkHoodieHBaseIndex to prefix row keys; per the config docs, setting it equal
 * to the hbase region count gives the best distribution.
 */
public int getHBaseIndexRegionCount() {
return getInt(HoodieHBaseIndexConfig.BUCKET_NUMBER);
}
public int getBloomIndexParallelism() {
return getInt(HoodieIndexConfig.BLOOM_INDEX_PARALLELISM);
}

View File

@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.index.hbase;
import org.apache.hudi.config.HoodieWriteConfig;
/**
 * Extends {@link SparkHoodieHBaseIndex}, prepending a fixed-width numeric bucket prefix to each
 * record key so that writes are spread evenly across hbase regions instead of hotspotting.
 */
public class RebalancedSparkHoodieHBaseIndex extends SparkHoodieHBaseIndex {

  public RebalancedSparkHoodieHBaseIndex(HoodieWriteConfig config) {
    super(config);
  }

  /**
   * Prefixes the original key with a zero-padded bucket id derived from the key's hash.
   *
   * <p>The bucket is computed as {@code abs(hashCode % bucketCount)} rather than
   * {@code abs(hashCode) % bucketCount}: the latter yields a negative bucket (and a malformed
   * "-N" prefix) when {@code hashCode()} is {@link Integer#MIN_VALUE}, because
   * {@code Math.abs(Integer.MIN_VALUE)} is still negative. Both forms agree for every other
   * hash value, so previously written index rows keep their buckets.
   *
   * @param originalKey the record key to rewrite
   * @return the zero-padded bucket prefix followed by the original key
   */
  @Override
  protected String getHBaseKey(String originalKey) {
    int bucketCount = config.getHBaseIndexRegionCount();
    int bucket = Math.abs(originalKey.hashCode() % bucketCount);
    // Pad to the width of the largest possible bucket id so all prefixes have equal length.
    int width = String.valueOf(bucketCount - 1).length();
    return String.format("%0" + width + "d", bucket) + originalKey;
  }
}

View File

@@ -205,7 +205,7 @@ public class SparkHoodieHBaseIndex extends HoodieIndex<Object, Object> {
}
private Get generateStatement(String key) throws IOException {
return new Get(Bytes.toBytes(getHBaseKey(key))).setMaxVersions(1).addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)
.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN).addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN);
}
@@ -213,6 +213,10 @@ public class SparkHoodieHBaseIndex extends HoodieIndex<Object, Object> {
return generateStatement(key).setTimeRange(startTime, endTime);
}
/**
 * Hook allowing subclasses to rewrite a record key before it is used as an hbase row key
 * (e.g. RebalancedSparkHoodieHBaseIndex adds a bucket prefix); the base implementation
 * returns the key unchanged.
 */
protected String getHBaseKey(String key) {
return key;
}
private boolean checkIfValidCommit(HoodieTableMetaClient metaClient, String commitTs) {
HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
// Check if the last commit ts for this row is 1) present in the timeline or
@@ -354,14 +358,14 @@ public class SparkHoodieHBaseIndex extends HoodieIndex<Object, Object> {
// This is an update, no need to update index
continue;
}
Put put = new Put(Bytes.toBytes(getHBaseKey(rec.getRecordKey())));
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, Bytes.toBytes(loc.get().getInstantTime()));
put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, Bytes.toBytes(loc.get().getFileId()));
put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, Bytes.toBytes(rec.getPartitionPath()));
mutations.add(put);
} else {
// Delete existing index for a deleted record
Delete delete = new Delete(Bytes.toBytes(getHBaseKey(rec.getRecordKey())));
mutations.add(delete);
}
}