1
0

[HUDI-3123] consistent hashing index: basic write path (upsert/insert) (#4480)

1. Basic write path (insert/upsert) implementation
2. Adapt simple bucket index
This commit is contained in:
Yuwei XIAO
2022-05-16 11:07:01 +08:00
committed by GitHub
parent 1fded18dff
commit 61030d8e7a
41 changed files with 1510 additions and 237 deletions

View File

@@ -0,0 +1,122 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.index.bucket;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.keygen.KeyGenUtils;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import java.util.List;
/**
 * Unit tests of {@link BucketIdentifier}: round-tripping bucket ids through file
 * id prefixes, and extracting hash keys from simple/complex record keys.
 */
public class TestBucketIdentifier {

  // Avro schema for the nested column used inside EXAMPLE_SCHEMA.
  public static final String NESTED_COL_SCHEMA = "{\"type\":\"record\", \"name\":\"nested_col\",\"fields\": ["
      + "{\"name\": \"prop1\",\"type\": \"string\"},{\"name\": \"prop2\", \"type\": \"long\"}]}";

  // Avro schema of the test records built by getRecord().
  public static final String EXAMPLE_SCHEMA = "{\"type\": \"record\",\"name\": \"testrec\",\"fields\": [ "
      + "{\"name\": \"timestamp\",\"type\": \"long\"},{\"name\": \"_row_key\", \"type\": \"string\"},"
      + "{\"name\": \"ts_ms\", \"type\": \"string\"},"
      + "{\"name\": \"pii_col\", \"type\": \"string\"},"
      + "{\"name\": \"nested_col\",\"type\": "
      + NESTED_COL_SCHEMA + "}"
      + "]}";

  /**
   * @return a test record with fixed field values and a default nested column.
   */
  public static GenericRecord getRecord() {
    return getRecord(getNestedColRecord("val1", 10L));
  }

  /**
   * Builds the nested-column record embedded in the test record.
   *
   * @param prop1Value value for the {@code prop1} string field
   * @param prop2Value value for the {@code prop2} long field
   * @return the populated nested record
   */
  public static GenericRecord getNestedColRecord(String prop1Value, Long prop2Value) {
    GenericRecord nestedColRecord = new GenericData.Record(new Schema.Parser().parse(NESTED_COL_SCHEMA));
    nestedColRecord.put("prop1", prop1Value);
    nestedColRecord.put("prop2", prop2Value);
    return nestedColRecord;
  }

  /**
   * Builds a test record conforming to {@link #EXAMPLE_SCHEMA}.
   *
   * @param nestedColRecord record to embed as the {@code nested_col} field
   * @return the populated test record
   */
  public static GenericRecord getRecord(GenericRecord nestedColRecord) {
    GenericRecord record = new GenericData.Record(new Schema.Parser().parse(EXAMPLE_SCHEMA));
    record.put("timestamp", 4357686L);
    record.put("_row_key", "key1");
    record.put("ts_ms", "2020-03-21");
    record.put("pii_col", "pi");
    record.put("nested_col", nestedColRecord);
    return record;
  }

  @Test
  public void testBucketFileId() {
    int[] ids = {0, 4, 8, 16, 32, 64, 128, 256, 512, 1000, 1024, 4096, 10000, 100000};
    for (int id : ids) {
      String bucketIdStr = BucketIdentifier.bucketIdStr(id);
      String fileId = BucketIdentifier.newBucketFileIdPrefix(bucketIdStr);
      // Round trip bucket id -> file id prefix -> bucket id must be lossless.
      // Use JUnit Assertions (not the bare `assert` statement, which is a no-op
      // unless the JVM runs with -ea and reports no values on failure).
      Assertions.assertEquals(id, BucketIdentifier.bucketIdFromFileId(fileId));
    }
  }

  @Test
  public void testBucketIdWithSimpleRecordKey() {
    String recordKeyField = "_row_key";
    String indexKeyField = "_row_key";
    GenericRecord record = getRecord();
    HoodieRecord hoodieRecord = new HoodieAvroRecord(
        new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null);
    int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8);
    // The record-based and raw-key-list-based lookups must agree.
    Assertions.assertEquals(
        BucketIdentifier.getBucketId(Arrays.asList(record.get(indexKeyField).toString()), 8),
        bucketId);
  }

  @Test
  public void testBucketIdWithComplexRecordKey() {
    List<String> recordKeyField = Arrays.asList("_row_key", "ts_ms");
    String indexKeyField = "_row_key";
    GenericRecord record = getRecord();
    HoodieRecord hoodieRecord = new HoodieAvroRecord(
        new HoodieKey(KeyGenUtils.getRecordKey(record, recordKeyField, false), ""), null);
    int bucketId = BucketIdentifier.getBucketId(hoodieRecord, indexKeyField, 8);
    // Even with a composite record key, bucketing on the single index key field
    // must match the raw-key-list-based lookup.
    Assertions.assertEquals(
        BucketIdentifier.getBucketId(Arrays.asList(record.get(indexKeyField).toString()), 8),
        bucketId);
  }

  @Test
  public void testGetHashKeys() {
    BucketIdentifier identifier = new BucketIdentifier();
    // Empty index-key spec: the whole record key is the single hash key.
    List<String> keys = identifier.getHashKeys(new HoodieKey("abc", "partition"), "");
    Assertions.assertEquals(1, keys.size());
    Assertions.assertEquals("abc", keys.get(0));
    // "field:value" record key: the value of the requested field is extracted.
    keys = identifier.getHashKeys(new HoodieKey("f1:abc", "partition"), "f1");
    Assertions.assertEquals(1, keys.size());
    Assertions.assertEquals("abc", keys.get(0));
    keys = identifier.getHashKeys(new HoodieKey("f1:abc,f2:bcd", "partition"), "f2");
    Assertions.assertEquals(1, keys.size());
    Assertions.assertEquals("bcd", keys.get(0));
    // Multiple index key fields yield one hash key per field, in order.
    keys = identifier.getHashKeys(new HoodieKey("f1:abc,f2:bcd", "partition"), "f1,f2");
    Assertions.assertEquals(2, keys.size());
    Assertions.assertEquals("abc", keys.get(0));
    Assertions.assertEquals("bcd", keys.get(1));
  }
}

View File

@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.index.bucket;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.ConsistentHashingNode;
import org.apache.hudi.common.model.HoodieConsistentHashingMetadata;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import java.util.List;
import static org.apache.hudi.common.model.HoodieConsistentHashingMetadata.HASH_VALUE_MASK;
/**
* Unit test of consistent bucket identifier
*/
/**
 * Unit test of consistent bucket identifier
 */
public class TestConsistentBucketIdIdentifier {

  @Test
  public void testGetBucket() {
    List<ConsistentHashingNode> nodes = Arrays.asList(
        new ConsistentHashingNode(100, "0"),
        new ConsistentHashingNode(0x2fffffff, "1"),
        new ConsistentHashingNode(0x4fffffff, "2"));
    HoodieConsistentHashingMetadata metadata = new HoodieConsistentHashingMetadata((short) 0, "", "", 3, 0, nodes);
    ConsistentBucketIdentifier identifier = new ConsistentBucketIdentifier(metadata);

    Assertions.assertEquals(3, identifier.getNumBuckets());

    // Lookup by hash key lists: each list of keys should land on the expected node.
    Assertions.assertEquals(nodes.get(2), identifier.getBucket(Arrays.asList("Hudi")));
    Assertions.assertEquals(nodes.get(1), identifier.getBucket(Arrays.asList("bucket_index")));
    Assertions.assertEquals(nodes.get(1), identifier.getBucket(Arrays.asList("consistent_hashing")));
    Assertions.assertEquals(nodes.get(1), identifier.getBucket(Arrays.asList("bucket_index", "consistent_hashing")));
    // Expected node index for single-key ["i"] and double-key ["i", "i+1"] lookups, i = 0..9.
    int[] singleKeyExpected = {2, 2, 1, 1, 0, 1, 1, 1, 0, 1};
    int[] doubleKeyExpected = {1, 0, 1, 0, 1, 1, 1, 0, 1, 2};
    for (int i = 0; i < 10; ++i) {
      Assertions.assertEquals(nodes.get(singleKeyExpected[i]),
          identifier.getBucket(Arrays.asList(Integer.toString(i))));
      Assertions.assertEquals(nodes.get(doubleKeyExpected[i]),
          identifier.getBucket(Arrays.asList(Integer.toString(i), Integer.toString(i + 1))));
    }

    // Lookup by raw hash value, covering boundaries of each node's range and the wrap-around.
    int[] hashValues = {0, 50, 100, 101, 0x1fffffff, 0x2fffffff, 0x40000000, 0x40000001, 0x4fffffff, 0x50000000, HASH_VALUE_MASK};
    int[] hashExpected = {0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0};
    for (int i = 0; i < hashValues.length; ++i) {
      Assertions.assertEquals(nodes.get(hashExpected[i]), identifier.getBucket(hashValues[i]));
    }

    // Lookup by file id built from each node's file id prefix.
    for (int i = 0; i < 3; ++i) {
      Assertions.assertEquals(nodes.get(i),
          identifier.getBucketByFileId(FSUtils.createNewFileId(Integer.toString(i), 0)));
    }
  }
}