1
0

[HUDI-3123] consistent hashing index: basic write path (upsert/insert) (#4480)

1. basic write path(insert/upsert) implementation
 2. adapt simple bucket index
This commit is contained in:
Yuwei XIAO
2022-05-16 11:07:01 +08:00
committed by GitHub
parent 1fded18dff
commit 61030d8e7a
41 changed files with 1510 additions and 237 deletions

View File

@@ -77,6 +77,15 @@ public abstract class HoodieData<T> implements Serializable {
public abstract <O> HoodieData<O> mapPartitions(
SerializableFunction<Iterator<T>, Iterator<O>> func, boolean preservesPartitioning);
/**
 * Maps every partition of objects through the given function.
 *
 * @param func serializable map function that takes an iterator over a partition of objects
 *             and generates an iterator of output objects.
 * @param <O>  output object type.
 * @return {@link HoodieData} of {@code O} containing the result. Actual execution may be deferred.
 */
public abstract <O> HoodieData<O> mapPartitions(
SerializableFunction<Iterator<T>, Iterator<O>> func);
/**
* @param func serializable flatmap function.
* @param <O> output object type.

View File

@@ -99,6 +99,11 @@ public class HoodieList<T> extends HoodieData<T> {
/**
 * {@inheritDoc}
 *
 * <p>{@code preservesPartitioning} is not used by this implementation: the call is forwarded
 * unchanged to {@link #mapPartitions(SerializableFunction)}.
 */
@Override
public <O> HoodieData<O> mapPartitions(SerializableFunction<Iterator<T>, Iterator<O>> func, boolean preservesPartitioning) {
return mapPartitions(func);
}
@Override
public <O> HoodieData<O> mapPartitions(SerializableFunction<Iterator<T>, Iterator<O>> func) {
List<O> result = new ArrayList<>();
throwingMapWrapper(func).apply(listData.iterator()).forEachRemaining(result::add);
return HoodieList.of(result);

View File

@@ -348,6 +348,10 @@ public class FSUtils {
return UUID.randomUUID().toString();
}
/**
 * Builds a file id by joining the given prefix and numeric id with a dash, e.g. {@code "abc-3"}.
 *
 * @param idPfx file id prefix
 * @param id    numeric part appended after the dash
 * @return {@code idPfx + "-" + id}, always rendered with ASCII digits
 */
public static String createNewFileId(String idPfx, int id) {
  // Plain concatenation instead of String.format("%s-%d", ...): the %d conversion localizes
  // digits using the default locale, which would make file ids depend on JVM locale settings.
  return idPfx + "-" + id;
}
/**
* Get the file extension from the log file.
*/

View File

@@ -0,0 +1,78 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.model;
import org.apache.hudi.common.util.JsonUtils;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
* Used in consistent hashing index, representing nodes in the consistent hash ring.
* Record the end hash range value and its corresponding file group id.
*/
/**
 * A node on the consistent hash ring used by the consistent hashing index.
 * Each node keeps the end value of its hash range and the file group id prefix it maps to.
 */
@JsonIgnoreProperties(ignoreUnknown = true)
public class ConsistentHashingNode implements Serializable {

  private final int value;
  private final String fileIdPrefix;

  /**
   * @param value        end hash range value of this node
   * @param fileIdPrefix file group id prefix associated with this node
   */
  @JsonCreator
  public ConsistentHashingNode(@JsonProperty("value") int value, @JsonProperty("fileIdPrefix") String fileIdPrefix) {
    this.value = value;
    this.fileIdPrefix = fileIdPrefix;
  }

  /**
   * Serializes the given nodes into a pretty-printed JSON string.
   */
  public static String toJsonString(List<ConsistentHashingNode> nodes) throws IOException {
    return JsonUtils.getObjectMapper()
        .writerWithDefaultPrettyPrinter()
        .writeValueAsString(nodes);
  }

  /**
   * Parses a JSON array of nodes; a null or empty string yields an empty list.
   */
  public static List<ConsistentHashingNode> fromJsonString(String json) throws Exception {
    boolean hasContent = json != null && !json.isEmpty();
    if (!hasContent) {
      return Collections.emptyList();
    }
    return Arrays.asList(JsonUtils.getObjectMapper().readValue(json, ConsistentHashingNode[].class));
  }

  public int getValue() {
    return value;
  }

  public String getFileIdPrefix() {
    return fileIdPrefix;
  }

  @Override
  public String toString() {
    return "ConsistentHashingNode{"
        + "value=" + value
        + ", fileIdPfx='" + fileIdPrefix + '\''
        + '}';
  }
}

View File

@@ -18,17 +18,15 @@
package org.apache.hudi.common.model;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.JsonUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -227,7 +225,7 @@ public class HoodieCommitMetadata implements Serializable {
LOG.info("partition path is null for " + partitionToWriteStats.get(null));
partitionToWriteStats.remove(null);
}
return getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
return JsonUtils.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
}
public static <T> T fromJsonString(String jsonStr, Class<T> clazz) throws Exception {
@@ -235,7 +233,7 @@ public class HoodieCommitMetadata implements Serializable {
// For an empty commit file (no data, or something bad happened).
return clazz.newInstance();
}
return getObjectMapper().readValue(jsonStr, clazz);
return JsonUtils.getObjectMapper().readValue(jsonStr, clazz);
}
// Here the functions are named "fetch" instead of "get" to avoid the JSON conversion.
@@ -457,13 +455,6 @@ public class HoodieCommitMetadata implements Serializable {
}
}
/**
 * Creates a new {@link ObjectMapper} on every call, with failing on unknown properties
 * disabled for deserialization and field visibility set to {@code ANY}.
 *
 * @return the freshly configured mapper
 */
protected static ObjectMapper getObjectMapper() {
ObjectMapper mapper = new ObjectMapper();
mapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);
mapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
return mapper;
}
@Override
public String toString() {
return "HoodieCommitMetadata{" + "partitionToWriteStats=" + partitionToWriteStats

View File

@@ -0,0 +1,142 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.model;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.JsonUtils;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* All the metadata that is used for consistent hashing bucket index
*/
/**
 * All the metadata that is used for consistent hashing bucket index.
 *
 * <p>Instances are converted to and from JSON via {@link JsonUtils#getObjectMapper()};
 * {@link #getFilename()} yields {@code <instant>} followed by {@link #HASHING_METADATA_FILE_SUFFIX}.
 */
@JsonIgnoreProperties(ignoreUnknown = true)
public class HoodieConsistentHashingMetadata implements Serializable {

  private static final Logger LOG = LogManager.getLogger(HoodieConsistentHashingMetadata.class);
  /**
   * Upper-bound of the hash value
   */
  public static final int HASH_VALUE_MASK = Integer.MAX_VALUE;
  public static final String HASHING_METADATA_FILE_SUFFIX = ".hashing_meta";

  private final short version;
  private final String partitionPath;
  private final String instant;
  private final int numBuckets;
  private final int seqNo;
  private final List<ConsistentHashingNode> nodes;

  /**
   * Full constructor, also used when deserializing from JSON.
   */
  @JsonCreator
  public HoodieConsistentHashingMetadata(@JsonProperty("version") short version, @JsonProperty("partitionPath") String partitionPath,
                                         @JsonProperty("instant") String instant, @JsonProperty("numBuckets") int numBuckets,
                                         @JsonProperty("seqNo") int seqNo, @JsonProperty("nodes") List<ConsistentHashingNode> nodes) {
    this.version = version;
    this.partitionPath = partitionPath;
    this.instant = instant;
    this.numBuckets = numBuckets;
    this.seqNo = seqNo;
    this.nodes = nodes;
  }

  /**
   * Construct default metadata with all bucket's file group uuid initialized
   *
   * @param partitionPath partition this metadata belongs to
   * @param numBuckets    number of buckets (hash ring nodes) to create; must be positive
   * @throws IllegalArgumentException if {@code numBuckets} is not positive
   */
  public HoodieConsistentHashingMetadata(String partitionPath, int numBuckets) {
    this((short) 0, partitionPath, HoodieTimeline.INIT_INSTANT_TS, numBuckets, 0, constructDefaultHashingNodes(numBuckets));
  }

  /**
   * Builds {@code numBuckets} nodes whose end values are spread evenly over
   * {@code (0, HASH_VALUE_MASK]}; the last node always ends at {@link #HASH_VALUE_MASK}.
   */
  private static List<ConsistentHashingNode> constructDefaultHashingNodes(int numBuckets) {
    if (numBuckets <= 0) {
      // 0 would divide by zero below; a negative count would silently produce an empty ring.
      throw new IllegalArgumentException("Number of buckets must be positive, but got " + numBuckets);
    }
    // Ceiling division, done in long arithmetic so HASH_VALUE_MASK + numBuckets cannot overflow.
    long step = ((long) HASH_VALUE_MASK + numBuckets - 1) / numBuckets;
    return IntStream.rangeClosed(1, numBuckets)
        .mapToObj(i -> new ConsistentHashingNode((int) Math.min(step * i, HASH_VALUE_MASK), FSUtils.createNewFileIdPfx()))
        .collect(Collectors.toList());
  }

  public short getVersion() {
    return version;
  }

  public String getPartitionPath() {
    return partitionPath;
  }

  public String getInstant() {
    return instant;
  }

  public int getNumBuckets() {
    return numBuckets;
  }

  public int getSeqNo() {
    return seqNo;
  }

  public List<ConsistentHashingNode> getNodes() {
    return nodes;
  }

  /**
   * @return the instant followed by {@link #HASHING_METADATA_FILE_SUFFIX}
   */
  public String getFilename() {
    return instant + HASHING_METADATA_FILE_SUFFIX;
  }

  /**
   * @return the UTF-8 bytes of the pretty-printed JSON form of this metadata
   */
  public byte[] toBytes() throws IOException {
    return toJsonString().getBytes(StandardCharsets.UTF_8);
  }

  /**
   * Decodes the bytes as UTF-8 JSON and deserializes them.
   *
   * @throws IOException if the content cannot be turned into metadata (including empty content)
   */
  public static HoodieConsistentHashingMetadata fromBytes(byte[] bytes) throws IOException {
    try {
      return fromJsonString(new String(bytes, StandardCharsets.UTF_8), HoodieConsistentHashingMetadata.class);
    } catch (Exception e) {
      throw new IOException("unable to read hashing metadata", e);
    }
  }

  private String toJsonString() throws IOException {
    return JsonUtils.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
  }

  /**
   * Deserializes {@code jsonStr} into {@code clazz}. For null or empty input the no-arg
   * constructor of {@code clazz} is invoked instead; this class itself declares none, so
   * empty hashing metadata ends in an exception rather than a default instance.
   */
  protected static <T> T fromJsonString(String jsonStr, Class<T> clazz) throws Exception {
    if (jsonStr == null || jsonStr.isEmpty()) {
      // Empty content (no data or something bad happened).
      // Class#newInstance is deprecated: it propagates checked constructor exceptions undeclared.
      return clazz.getDeclaredConstructor().newInstance();
    }
    return JsonUtils.getObjectMapper().readValue(jsonStr, clazz);
  }

  /**
   * Get instant time from the hashing metadata filename
   * Pattern of the filename: <instant>.HASHING_METADATA_FILE_SUFFIX
   *
   * @return the part of {@code filename} before the first dot, or the whole name if it has no dot
   */
  public static String getTimestampFromFile(String filename) {
    // indexOf instead of split("\\.")[0]: split drops trailing empty strings, so a name made
    // only of dots gives an empty array and indexing it would throw.
    int firstDot = filename.indexOf('.');
    return firstDot < 0 ? filename : filename.substring(0, firstDot);
  }
}

View File

@@ -18,11 +18,9 @@
package org.apache.hudi.common.model;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import org.apache.hudi.common.util.JsonUtils;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -80,7 +78,7 @@ public class HoodieReplaceCommitMetadata extends HoodieCommitMetadata {
LOG.info("partition path is null for " + partitionToReplaceFileIds.get(null));
partitionToReplaceFileIds.remove(null);
}
return getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
return JsonUtils.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
}
public static <T> T fromJsonString(String jsonStr, Class<T> clazz) throws Exception {
@@ -88,7 +86,7 @@ public class HoodieReplaceCommitMetadata extends HoodieCommitMetadata {
// For an empty commit file (no data, or something bad happened).
return clazz.newInstance();
}
return getObjectMapper().readValue(jsonStr, clazz);
return JsonUtils.getObjectMapper().readValue(jsonStr, clazz);
}
@Override
@@ -124,13 +122,6 @@ public class HoodieReplaceCommitMetadata extends HoodieCommitMetadata {
}
}
/**
 * Creates a new {@link ObjectMapper} on every call, with failing on unknown properties
 * disabled for deserialization and field visibility set to {@code ANY}.
 *
 * @return the freshly configured mapper
 */
protected static ObjectMapper getObjectMapper() {
ObjectMapper mapper = new ObjectMapper();
mapper.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);
mapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
return mapper;
}
@Override
public String toString() {
return "HoodieReplaceMetadata{" + "partitionToWriteStats=" + partitionToWriteStats

View File

@@ -18,6 +18,8 @@
package org.apache.hudi.common.model;
import org.apache.hudi.common.util.JsonUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -81,7 +83,7 @@ public class HoodieRollingStatMetadata implements Serializable {
LOG.info("partition path is null for " + partitionToRollingStats.get(null));
partitionToRollingStats.remove(null);
}
return HoodieCommitMetadata.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
return JsonUtils.getObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
}
public HoodieRollingStatMetadata merge(HoodieRollingStatMetadata rollingStatMetadata) {

View File

@@ -85,6 +85,7 @@ public class HoodieTableMetaClient implements Serializable {
public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap";
public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat";
public static final String METADATA_TABLE_FOLDER_PATH = METAFOLDER_NAME + Path.SEPARATOR + "metadata";
// Folder name for consistent hashing metadata; getHashingMetadataPath() resolves it against metaPath.
public static final String HASHING_METADATA_FOLDER_NAME = ".bucket_index" + Path.SEPARATOR + "consistent_hashing_metadata";
public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH
+ Path.SEPARATOR + ".partitions";
public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR
@@ -211,6 +212,13 @@ public class HoodieTableMetaClient implements Serializable {
return new Path(metaPath.get(), SCHEMA_FOLDER_NAME).toString();
}
/**
 * Resolves {@link #HASHING_METADATA_FOLDER_NAME} against the meta path.
 *
 * @return Hashing metadata base path, as a string
 */
public String getHashingMetadataPath() {
  Path hashingMetadataDir = new Path(metaPath.get(), HASHING_METADATA_FOLDER_NAME);
  return hashingMetadataDir.toString();
}
/**
* @return Temp Folder path
*/

View File

@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.util;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
/**
 * Static holder of the {@link ObjectMapper} used for JSON (de)serialization.
 *
 * <p>The mapper is configured once: unknown properties do not fail deserialization and
 * fields of any visibility are detected.
 */
public final class JsonUtils {

  private static final ObjectMapper MAPPER = new ObjectMapper();

  static {
    MAPPER.disable(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES);
    MAPPER.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
  }

  private JsonUtils() {
    // Static utility class; not meant to be instantiated.
  }

  /**
   * @return the single shared mapper instance. Every caller receives the same object, so
   *         reconfiguring it would affect all users; treat it as read-only.
   */
  public static ObjectMapper getObjectMapper() {
    return MAPPER;
  }
}

View File

@@ -106,6 +106,15 @@ public class HashID implements Serializable {
}
}
/**
 * Computes the 32-bit xxHash of the UTF-8 encoding of {@code message}.
 *
 * @param message  text to hash
 * @param hashSeed seed passed to the hash function
 * @return the 32-bit hash value
 */
public static int getXXHash32(final String message, int hashSeed) {
  final byte[] utf8Bytes = message.getBytes(StandardCharsets.UTF_8);
  return getXXHash32(utf8Bytes, hashSeed);
}
/**
 * Computes the 32-bit xxHash over the whole byte array.
 *
 * @param message  bytes to hash
 * @param hashSeed seed passed to the hash function
 * @return the 32-bit hash value
 */
public static int getXXHash32(final byte[] message, int hashSeed) {
  return XXHashFactory.fastestInstance()
      .hash32()
      .hash(message, 0, message.length, hashSeed);
}
private static byte[] getXXHash(final byte[] message, final Size bits) {
XXHashFactory factory = XXHashFactory.fastestInstance();
switch (bits) {

View File

@@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.model;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
/**
 * Tests for {@link HoodieConsistentHashingMetadata}.
 */
public class TestHoodieConsistentHashingMetadata {

  /**
   * The instant is the part of the metadata file name that precedes the suffix.
   */
  @Test
  public void testGetTimestamp() {
    // Build the file names from the real suffix constant so the test follows the actual naming pattern,
    // and use assertEquals so a failure reports the value that was returned.
    Assertions.assertEquals("0000",
        HoodieConsistentHashingMetadata.getTimestampFromFile("0000" + HoodieConsistentHashingMetadata.HASHING_METADATA_FILE_SUFFIX));
    Assertions.assertEquals("1234",
        HoodieConsistentHashingMetadata.getTimestampFromFile("1234" + HoodieConsistentHashingMetadata.HASHING_METADATA_FILE_SUFFIX));
  }
}

View File

@@ -66,6 +66,10 @@ public class HoodieCommonTestHarness {
dataGen = new HoodieTestDataGenerator();
}
/**
 * Sets {@code dataGen} to a new {@link HoodieTestDataGenerator} constructed from the given partition paths.
 *
 * @param partitionPaths partition paths passed to the generator's constructor
 */
protected void initTestDataGenerator(String[] partitionPaths) {
dataGen = new HoodieTestDataGenerator(partitionPaths);
}
/**
* Cleanups test data generator.
*