[HUDI-3123] consistent hashing index: basic write path (upsert/insert) (#4480)
1. Basic write path (insert/upsert) implementation. 2. Adapt the simple bucket index.
This commit is contained in:
@@ -0,0 +1,122 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index.bucket;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieAvroRecord;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.keygen.KeyGenUtils;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class TestBucketIdentifier {
|
||||
|
||||
public static final String NESTED_COL_SCHEMA = "{\"type\":\"record\", \"name\":\"nested_col\",\"fields\": ["
|
||||
+ "{\"name\": \"prop1\",\"type\": \"string\"},{\"name\": \"prop2\", \"type\": \"long\"}]}";
|
||||
public static final String EXAMPLE_SCHEMA = "{\"type\": \"record\",\"name\": \"testrec\",\"fields\": [ "
|
||||
+ "{\"name\": \"timestamp\",\"type\": \"long\"},{\"name\": \"_row_key\", \"type\": \"string\"},"
|
||||
+ "{\"name\": \"ts_ms\", \"type\": \"string\"},"
|
||||
+ "{\"name\": \"pii_col\", \"type\": \"string\"},"
|
||||
+ "{\"name\": \"nested_col\",\"type\": "
|
||||
+ NESTED_COL_SCHEMA + "}"
|
||||
+ "]}";
|
||||
|
||||
public static GenericRecord getRecord() {
|
||||
return getRecord(getNestedColRecord("val1", 10L));
|
||||
}
|
||||
|
||||
public static GenericRecord getNestedColRecord(String prop1Value, Long prop2Value) {
|
||||
GenericRecord nestedColRecord = new GenericData.Record(new Schema.Parser().parse(NESTED_COL_SCHEMA));
|
||||
nestedColRecord.put("prop1", prop1Value);
|
||||
nestedColRecord.put("prop2", prop2Value);
|
||||
return nestedColRecord;
|
||||
}
|
||||
|
||||
public static GenericRecord getRecord(GenericRecord nestedColRecord) {
|
||||
GenericRecord record = new GenericData.Record(new Schema.Parser().parse(EXAMPLE_SCHEMA));
|
||||
record.put("timestamp", 4357686L);
|
||||
record.put("_row_key", "key1");
|
||||
record.put("ts_ms", "2020-03-21");
|
||||
record.put("pii_col", "pi");
|
||||
record.put("nested_col", nestedColRecord);
|
||||
return record;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBucketFileId() {
|
||||
int[] ids = {0, 4, 8, 16, 32, 64, 128, 256, 512, 1000, 1024, 4096, 10000, 100000};
|
||||
for (int id : ids) {
|
||||
String bucketIdStr = BucketIdentifier.bucketIdStr(id);
|
||||
String fileId = BucketIdentifier.newBucketFileIdPrefix(bucketIdStr);
|
||||
assert BucketIdentifier.bucketIdFromFileId(fileId) == id;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBucketIdWithSimpleRecordKey() {
|
||||
String recordKeyField = "_row_key";
|
||||
String indexKeyField = "_row_key";
|
||||
GenericRecord record = getRecord();
|
||||
HoodieRecord hoodieRecord = new HoodieAvroRecord(
|
||||
new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null);
|
||||
int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8);
|
||||
assert bucketId == BucketIdentifier.getBucketId(
|
||||
Arrays.asList(record.get(indexKeyField).toString()), 8);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBucketIdWithComplexRecordKey() {
|
||||
List<String> recordKeyField = Arrays.asList("_row_key", "ts_ms");
|
||||
String indexKeyField = "_row_key";
|
||||
GenericRecord record = getRecord();
|
||||
HoodieRecord hoodieRecord = new HoodieAvroRecord(
|
||||
new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null);
|
||||
int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8);
|
||||
assert bucketId == BucketIdentifier.getBucketId(
|
||||
Arrays.asList(record.get(indexKeyField).toString()), 8);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetHashKeys() {
|
||||
BucketIdentifier identifier = new BucketIdentifier();
|
||||
List<String> keys = identifier.getHashKeys(new HoodieKey("abc", "partition"), "");
|
||||
Assertions.assertEquals(1, keys.size());
|
||||
Assertions.assertEquals("abc", keys.get(0));
|
||||
|
||||
keys = identifier.getHashKeys(new HoodieKey("f1:abc", "partition"), "f1");
|
||||
Assertions.assertEquals(1, keys.size());
|
||||
Assertions.assertEquals("abc", keys.get(0));
|
||||
|
||||
keys = identifier.getHashKeys(new HoodieKey("f1:abc,f2:bcd", "partition"), "f2");
|
||||
Assertions.assertEquals(1, keys.size());
|
||||
Assertions.assertEquals("bcd", keys.get(0));
|
||||
|
||||
keys = identifier.getHashKeys(new HoodieKey("f1:abc,f2:bcd", "partition"), "f1,f2");
|
||||
Assertions.assertEquals(2, keys.size());
|
||||
Assertions.assertEquals("abc", keys.get(0));
|
||||
Assertions.assertEquals("bcd", keys.get(1));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.index.bucket;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.ConsistentHashingNode;
|
||||
import org.apache.hudi.common.model.HoodieConsistentHashingMetadata;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import static org.apache.hudi.common.model.HoodieConsistentHashingMetadata.HASH_VALUE_MASK;
|
||||
|
||||
/**
|
||||
* Unit test of consistent bucket identifier
|
||||
*/
|
||||
public class TestConsistentBucketIdIdentifier {
|
||||
|
||||
@Test
|
||||
public void testGetBucket() {
|
||||
List<ConsistentHashingNode> nodes = Arrays.asList(
|
||||
new ConsistentHashingNode(100, "0"),
|
||||
new ConsistentHashingNode(0x2fffffff, "1"),
|
||||
new ConsistentHashingNode(0x4fffffff, "2"));
|
||||
HoodieConsistentHashingMetadata meta = new HoodieConsistentHashingMetadata((short) 0, "", "", 3, 0, nodes);
|
||||
ConsistentBucketIdentifier identifier = new ConsistentBucketIdentifier(meta);
|
||||
|
||||
Assertions.assertEquals(3, identifier.getNumBuckets());
|
||||
|
||||
// Get bucket by hash keys
|
||||
Assertions.assertEquals(nodes.get(2), identifier.getBucket(Arrays.asList("Hudi")));
|
||||
Assertions.assertEquals(nodes.get(1), identifier.getBucket(Arrays.asList("bucket_index")));
|
||||
Assertions.assertEquals(nodes.get(1), identifier.getBucket(Arrays.asList("consistent_hashing")));
|
||||
Assertions.assertEquals(nodes.get(1), identifier.getBucket(Arrays.asList("bucket_index", "consistent_hashing")));
|
||||
int[] ref1 = {2, 2, 1, 1, 0, 1, 1, 1, 0, 1};
|
||||
int[] ref2 = {1, 0, 1, 0, 1, 1, 1, 0, 1, 2};
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
Assertions.assertEquals(nodes.get(ref1[i]), identifier.getBucket(Arrays.asList(Integer.toString(i))));
|
||||
Assertions.assertEquals(nodes.get(ref2[i]), identifier.getBucket(Arrays.asList(Integer.toString(i), Integer.toString(i + 1))));
|
||||
}
|
||||
|
||||
// Get bucket by hash value
|
||||
Assertions.assertEquals(nodes.get(0), identifier.getBucket(0));
|
||||
Assertions.assertEquals(nodes.get(0), identifier.getBucket(50));
|
||||
Assertions.assertEquals(nodes.get(0), identifier.getBucket(100));
|
||||
Assertions.assertEquals(nodes.get(1), identifier.getBucket(101));
|
||||
Assertions.assertEquals(nodes.get(1), identifier.getBucket(0x1fffffff));
|
||||
Assertions.assertEquals(nodes.get(1), identifier.getBucket(0x2fffffff));
|
||||
Assertions.assertEquals(nodes.get(2), identifier.getBucket(0x40000000));
|
||||
Assertions.assertEquals(nodes.get(2), identifier.getBucket(0x40000001));
|
||||
Assertions.assertEquals(nodes.get(2), identifier.getBucket(0x4fffffff));
|
||||
Assertions.assertEquals(nodes.get(0), identifier.getBucket(0x50000000));
|
||||
Assertions.assertEquals(nodes.get(0), identifier.getBucket(HASH_VALUE_MASK));
|
||||
|
||||
// Get bucket by file id
|
||||
Assertions.assertEquals(nodes.get(0), identifier.getBucketByFileId(FSUtils.createNewFileId("0", 0)));
|
||||
Assertions.assertEquals(nodes.get(1), identifier.getBucketByFileId(FSUtils.createNewFileId("1", 0)));
|
||||
Assertions.assertEquals(nodes.get(2), identifier.getBucketByFileId(FSUtils.createNewFileId("2", 0)));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user