From f6326693efe657cc0dbd45f1d394fcff7266e523 Mon Sep 17 00:00:00 2001 From: Manoj Govindassamy Date: Fri, 29 Oct 2021 16:19:38 -0700 Subject: [PATCH] [HUDI-1295] Hash ID generator util for Hudi table columns, partition and files (#3884) * [HUDI-1295] Hash ID generator util for Hudi table columns, partition and files - Adding a new utility class HashID to generate 32,64,128 bits hashes for any given message of string or byte array type. This class internally uses MessageDigest and xxhash libraries. - Adding stateful hash holders for Hudi table columns, partition and files to pass around for metaindex and to convert to base64encoded strings whenever needed --- hudi-common/pom.xml | 8 ++ .../hudi/common/util/Base64CodecUtil.java | 6 +- .../hudi/common/util/hash/ColumnID.java | 61 ++++++++ .../apache/hudi/common/util/hash/FileID.java | 61 ++++++++ .../apache/hudi/common/util/hash/HashID.java | 132 ++++++++++++++++++ .../hudi/common/util/hash/HoodieID.java | 117 ++++++++++++++++ .../hudi/common/util/hash/PartitionID.java | 62 ++++++++ .../hudi/common/util/hash/TestHashID.java | 127 +++++++++++++++++ 8 files changed, 571 insertions(+), 3 deletions(-) create mode 100644 hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java create mode 100644 hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java create mode 100644 hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java create mode 100644 hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java create mode 100644 hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java create mode 100644 hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java diff --git a/hudi-common/pom.xml b/hudi-common/pom.xml index 71326e750..bbf2b2972 100644 --- a/hudi-common/pom.xml +++ b/hudi-common/pom.xml @@ -248,5 +248,13 @@ + + + + org.lz4 + lz4-java + 1.8.0 + + diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java index a86879ad6..97e9133cf 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java @@ -26,11 +26,11 @@ public final class Base64CodecUtil { /** * Decodes data from the input string into using the encoding scheme. * - * @param serString + * @param encodedString - Base64 encoded string to decode * @return A newly-allocated byte array containing the decoded bytes. */ - public static byte[] decode(String serString) { - return Base64.getDecoder().decode(serString.getBytes(StandardCharsets.UTF_8)); + public static byte[] decode(String encodedString) { + return Base64.getDecoder().decode(encodedString.getBytes(StandardCharsets.UTF_8)); } /** diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java new file mode 100644 index 000000000..be4db44ec --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.common.util.Base64CodecUtil; + +/** + * A stateful Hoodie object ID representing any table column. + */ +public class ColumnID extends HoodieID { + + private static final Type TYPE = Type.COLUMN; + private static final HashID.Size ID_COLUMN_HASH_SIZE = HashID.Size.BITS_64; + private final byte[] hash; + + public ColumnID(final String message) { + this.hash = HashID.hash(message, ID_COLUMN_HASH_SIZE); + } + + @Override + public int bits() { + return ID_COLUMN_HASH_SIZE.byteSize(); + } + + @Override + public byte[] asBytes() { + return this.hash; + } + + @Override + public String asBase64EncodedString() { + return Base64CodecUtil.encode(this.hash); + } + + @Override + public String toString() { + return new String(this.hash); + } + + @Override + protected Type getType() { + return TYPE; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java new file mode 100644 index 000000000..0cb73c5ab --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.common.util.Base64CodecUtil; + +/** + * Hoodie object ID representing any file. + */ +public class FileID extends HoodieID { + + private static final Type TYPE = Type.FILE; + private static final HashID.Size ID_FILE_HASH_SIZE = HashID.Size.BITS_128; + private final byte[] hash; + + public FileID(final String message) { + this.hash = HashID.hash(message, ID_FILE_HASH_SIZE); + } + + @Override + public int bits() { + return ID_FILE_HASH_SIZE.byteSize(); + } + + @Override + public byte[] asBytes() { + return this.hash; + } + + @Override + public String asBase64EncodedString() { + return Base64CodecUtil.encode(this.hash); + } + + @Override + public String toString() { + return new String(this.hash); + } + + @Override + protected Type getType() { + return TYPE; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java new file mode 100644 index 000000000..c56d76097 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import net.jpountz.xxhash.XXHash32; +import net.jpountz.xxhash.XXHash64; +import net.jpountz.xxhash.XXHashFactory; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hudi.exception.HoodieIOException; + +import java.io.Serializable; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +/** + * A stateless Hash class which generates ID for the desired bit count. + */ +public class HashID implements Serializable { + + private static final String MD5_ALGORITHM_NAME = "MD5"; + private static final int HASH_SEED = 0xdabadaba; + + /** + * Represents HashID size in bits. + */ + public enum Size { + BITS_32(32), + BITS_64(64), + BITS_128(128); + + private final int bits; + + Size(int bitCount) { + this.bits = bitCount; + } + + /** + * Get this Hash size in bytes. + * + * @return Bytes needed to represent this size + */ + public int byteSize() { + return (((this.bits - 1) / Byte.SIZE) + 1); + } + + /** + * Get this Hash size in bits. + * + * @return bits needed to represent the size + */ + public int bits() { + return this.bits; + } + + @Override + public String toString() { + return "HashSize{" + bits + "}"; + } + } + + /** + * Get the hash value for a string message and for the desired @{@link Size}. + * + * @param message - String message to get the hash value for + * @param bits - @{@link Size} of the hash value + * @return Hash value for the message as byte array + */ + public static byte[] hash(final String message, final Size bits) { + return hash(message.getBytes(StandardCharsets.UTF_8), bits); + } + + /** + * Get the hash value for a byte array and for the desired @{@link Size}. + * + * @param messageBytes - Byte array message to get the hash value for + * @param bits - @{@link Size} of the hash value + * @return Hash value for the message as byte array + */ + public static byte[] hash(final byte[] messageBytes, final Size bits) { + switch (bits) { + case BITS_32: + case BITS_64: + return getXXHash(messageBytes, bits); + case BITS_128: + return getMD5Hash(messageBytes); + default: + throw new IllegalArgumentException("Unexpected Hash size bits: " + bits); + } + } + + private static byte[] getXXHash(final byte[] message, final Size bits) { + XXHashFactory factory = XXHashFactory.fastestInstance(); + switch (bits) { + case BITS_32: + XXHash32 hash32 = factory.hash32(); + return Bytes.toBytes(hash32.hash(message, 0, message.length, HASH_SEED)); + case BITS_64: + XXHash64 hash64 = factory.hash64(); + return Bytes.toBytes(hash64.hash(message, 0, message.length, HASH_SEED)); + default: + throw new HoodieIOException("XX" + bits + " hash is unsupported!"); + } + } + + private static byte[] getMD5Hash(final byte[] message) throws HoodieIOException { + try { + MessageDigest messageDigest = MessageDigest.getInstance(MD5_ALGORITHM_NAME); + messageDigest.update(message); + return messageDigest.digest(); + } catch (NoSuchAlgorithmException e) { + throw new HoodieIOException("Failed to create MD5 Hash: " + e); + } + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java new file mode 100644 index 000000000..e08e254b0 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.exception.HoodieNotSupportedException; + +import java.io.Serializable; + +/** + * A serializable ID that can be used to identify any Hoodie table fields and resources. + */ +public abstract class HoodieID implements Serializable { + + private static final long serialVersionUID = 1L; + + /** + * Supported ID types. + */ + public enum Type { + COLUMN("HoodieColumnID"), + PARTITION("HoodiePartitionID"), + FILE("HoodieFileID"); + + private final String name; + + Type(final String name) { + this.name = name; + } + + @Override + public String toString() { + return "Type{name='" + name + "'}"; + } + } + + /** + * Get the number of bits representing this ID in memory. + *

+ * Note: Will be in multiples of 8 only. + * + * @return The number of bits in this ID + */ + public abstract int bits(); + + /** + * Get this ID as a byte array. + * + * @return A byte array representing this ID + */ + public abstract byte[] asBytes(); + + /** + * Get the String version of this ID. + * + * @return String version of this ID. + */ + public abstract String toString(); + + /** + * + */ + public String asBase64EncodedString() { + throw new HoodieNotSupportedException("Unsupported hash for " + getType()); + } + + /** + * Get the ID type. + * + * @return This ID type + */ + protected abstract Type getType(); + + /** + * Is this ID a ColumnID type ? + * + * @return True if this ID of ColumnID type + */ + public final boolean isColumnID() { + return (getType() == Type.COLUMN); + } + + /** + * Is this ID a Partition type ? + * + * @return True if this ID of PartitionID type + */ + public final boolean isPartition() { + return (getType() == Type.PARTITION); + } + + /** + * Is this ID a FileID type ? + * + * @return True if this ID of FileID type + */ + public final boolean isFileID() { + return (getType() == Type.FILE); + } + +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java new file mode 100644 index 000000000..f31159faa --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.apache.hudi.common.util.Base64CodecUtil; + +/** + * Hoodie object ID representing any partition. + */ +public class PartitionID extends HoodieID { + + private static final Type TYPE = Type.PARTITION; + private static final HashID.Size ID_PARTITION_HASH_SIZE = HashID.Size.BITS_64; + private final byte[] hash; + + public PartitionID(final String message) { + this.hash = HashID.hash(message, ID_PARTITION_HASH_SIZE); + } + + @Override + public int bits() { + return ID_PARTITION_HASH_SIZE.byteSize(); + } + + @Override + public byte[] asBytes() { + return this.hash; + } + + @Override + public String asBase64EncodedString() { + return Base64CodecUtil.encode(this.hash); + } + + @Override + public String toString() { + return new String(this.hash); + } + + @Override + protected Type getType() { + return TYPE; + } + +} diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java b/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java new file mode 100644 index 000000000..de0424f42 --- /dev/null +++ b/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.common.util.hash; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import javax.xml.bind.DatatypeConverter; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestHashID { + + /** + * Test HashID of all sizes for ByteArray type input message. + */ + @ParameterizedTest + @EnumSource(HashID.Size.class) + public void testHashForByteInput(HashID.Size size) { + final int count = 8; + Random random = new Random(); + for (int i = 0; i < count; i++) { + final String message = random.ints(50, 120) + .filter(j -> (j <= 57 || j >= 65) && (j <= 90 || j >= 97)) + .limit((32 + (i * 4))) + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString(); + final byte[] originalData = message.getBytes(StandardCharsets.UTF_8); + final byte[] hashBytes = HashID.hash(originalData, size); + assertEquals(hashBytes.length, size.byteSize()); + } + } + + /** + * Test HashID of all sizes for String type input message. + */ + @ParameterizedTest + @EnumSource(HashID.Size.class) + public void testHashForStringInput(HashID.Size size) { + final int count = 8; + Random random = new Random(); + for (int i = 0; i < count; i++) { + final String message = random.ints(50, 120) + .filter(j -> (j <= 57 || j >= 65) && (j <= 90 || j >= 97)) + .limit((32 + (i * 4))) + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString(); + final byte[] hashBytes = HashID.hash(message, size); + assertEquals(hashBytes.length, size.byteSize()); + } + } + + /** + * Test expected hash values for all bit sizes. + */ + @Test + public void testHashValues() { + Map> expectedValuesMap = new HashMap>(); + Map hash32ExpectedValues = new HashMap() { + { + put("Hudi", "FB6A3F92"); + put("Data lake", "99913A4D"); + put("Data Lake", "6F7DAD6A"); + put("Col1", "B4393B9A"); + put("A", "CDD946CE"); + put("2021/10/28/", "BBD4FDB2"); + } + }; + expectedValuesMap.put(HashID.Size.BITS_32, hash32ExpectedValues); + + Map hash64ExpectedValues = new HashMap() { + { + put("Hudi", "F7727B9A28379071"); + put("Data lake", "52BC72D592EBCAE5"); + put("Data Lake", "5ED19AF9FD746E3E"); + put("Col1", "22FB1DD2F4784D31"); + put("A", "EBF88350484B5AA7"); + put("2021/10/28/", "2A9399AF6E7C8B12"); + } + }; + expectedValuesMap.put(HashID.Size.BITS_128, hash64ExpectedValues); + + Map hash128ExpectedValues = new HashMap() { + { + put("Hudi", "09DAB749F255311C1C9EF6DD7B790170"); + put("Data lake", "7F2FC1EA445FC81F67CAA25EC9089C08"); + put("Data Lake", "9D2CEF0D61B02848C528A070ED75C570"); + put("Col1", "EC0FFE21E704DE2A580661C59A81D453"); + put("A", "7FC56270E7A70FA81A5935B72EACBE29"); + put("2021/10/28/", "1BAE8F04F44CB7ACF2458EF5219742DC"); + } + }; + expectedValuesMap.put(HashID.Size.BITS_128, hash128ExpectedValues); + + for (Map.Entry> allSizeEntries : expectedValuesMap.entrySet()) { + for (Map.Entry sizeEntry : allSizeEntries.getValue().entrySet()) { + final byte[] actualHashBytes = HashID.hash(sizeEntry.getKey(), allSizeEntries.getKey()); + final byte[] expectedHashBytes = DatatypeConverter.parseHexBinary(sizeEntry.getValue()); + assertTrue(Arrays.equals(expectedHashBytes, actualHashBytes)); + } + } + } +}