diff --git a/hudi-common/pom.xml b/hudi-common/pom.xml
index 71326e750..bbf2b2972 100644
--- a/hudi-common/pom.xml
+++ b/hudi-common/pom.xml
@@ -248,5 +248,13 @@
+
+
+
+ org.lz4
+ lz4-java
+ 1.8.0
+
+
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java b/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java
index a86879ad6..97e9133cf 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/Base64CodecUtil.java
@@ -26,11 +26,11 @@ public final class Base64CodecUtil {
/**
* Decodes data from the input string into using the encoding scheme.
*
- * @param serString
+ * @param encodedString - Base64 encoded string to decode
* @return A newly-allocated byte array containing the decoded bytes.
*/
- public static byte[] decode(String serString) {
- return Base64.getDecoder().decode(serString.getBytes(StandardCharsets.UTF_8));
+ public static byte[] decode(String encodedString) {
+ return Base64.getDecoder().decode(encodedString.getBytes(StandardCharsets.UTF_8));
}
/**
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java
new file mode 100644
index 000000000..be4db44ec
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/ColumnID.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.common.util.hash;
+
+import org.apache.hudi.common.util.Base64CodecUtil;
+
+/**
+ * A stateful Hoodie object ID representing any table column.
+ */
+public class ColumnID extends HoodieID {
+
+ private static final Type TYPE = Type.COLUMN;
+ private static final HashID.Size ID_COLUMN_HASH_SIZE = HashID.Size.BITS_64;
+ private final byte[] hash;
+
+ public ColumnID(final String message) {
+ this.hash = HashID.hash(message, ID_COLUMN_HASH_SIZE);
+ }
+
+ @Override
+ public int bits() {
+ return ID_COLUMN_HASH_SIZE.byteSize();
+ }
+
+ @Override
+ public byte[] asBytes() {
+ return this.hash;
+ }
+
+ @Override
+ public String asBase64EncodedString() {
+ return Base64CodecUtil.encode(this.hash);
+ }
+
+ @Override
+ public String toString() {
+ return new String(this.hash);
+ }
+
+ @Override
+ protected Type getType() {
+ return TYPE;
+ }
+}
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java
new file mode 100644
index 000000000..0cb73c5ab
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/FileID.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.common.util.hash;
+
+import org.apache.hudi.common.util.Base64CodecUtil;
+
+/**
+ * Hoodie object ID representing any file.
+ */
+public class FileID extends HoodieID {
+
+ private static final Type TYPE = Type.FILE;
+ private static final HashID.Size ID_FILE_HASH_SIZE = HashID.Size.BITS_128;
+ private final byte[] hash;
+
+ public FileID(final String message) {
+ this.hash = HashID.hash(message, ID_FILE_HASH_SIZE);
+ }
+
+ @Override
+ public int bits() {
+ return ID_FILE_HASH_SIZE.byteSize();
+ }
+
+ @Override
+ public byte[] asBytes() {
+ return this.hash;
+ }
+
+ @Override
+ public String asBase64EncodedString() {
+ return Base64CodecUtil.encode(this.hash);
+ }
+
+ @Override
+ public String toString() {
+ return new String(this.hash);
+ }
+
+ @Override
+ protected Type getType() {
+ return TYPE;
+ }
+}
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java
new file mode 100644
index 000000000..c56d76097
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HashID.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.common.util.hash;
+
+import net.jpountz.xxhash.XXHash32;
+import net.jpountz.xxhash.XXHash64;
+import net.jpountz.xxhash.XXHashFactory;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hudi.exception.HoodieIOException;
+
+import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+/**
+ * A stateless Hash class which generates ID for the desired bit count.
+ */
+public class HashID implements Serializable {
+
+ private static final String MD5_ALGORITHM_NAME = "MD5";
+ private static final int HASH_SEED = 0xdabadaba;
+
+ /**
+ * Represents HashID size in bits.
+ */
+ public enum Size {
+ BITS_32(32),
+ BITS_64(64),
+ BITS_128(128);
+
+ private final int bits;
+
+ Size(int bitCount) {
+ this.bits = bitCount;
+ }
+
+ /**
+ * Get this Hash size in bytes.
+ *
+ * @return Bytes needed to represent this size
+ */
+ public int byteSize() {
+ return (((this.bits - 1) / Byte.SIZE) + 1);
+ }
+
+ /**
+ * Get this Hash size in bits.
+ *
+ * @return bits needed to represent the size
+ */
+ public int bits() {
+ return this.bits;
+ }
+
+ @Override
+ public String toString() {
+ return "HashSize{" + bits + "}";
+ }
+ }
+
+ /**
+ * Get the hash value for a string message and for the desired @{@link Size}.
+ *
+ * @param message - String message to get the hash value for
+ * @param bits - @{@link Size} of the hash value
+ * @return Hash value for the message as byte array
+ */
+ public static byte[] hash(final String message, final Size bits) {
+ return hash(message.getBytes(StandardCharsets.UTF_8), bits);
+ }
+
+ /**
+ * Get the hash value for a byte array and for the desired @{@link Size}.
+ *
+ * @param messageBytes - Byte array message to get the hash value for
+ * @param bits - @{@link Size} of the hash value
+ * @return Hash value for the message as byte array
+ */
+ public static byte[] hash(final byte[] messageBytes, final Size bits) {
+ switch (bits) {
+ case BITS_32:
+ case BITS_64:
+ return getXXHash(messageBytes, bits);
+ case BITS_128:
+ return getMD5Hash(messageBytes);
+ default:
+ throw new IllegalArgumentException("Unexpected Hash size bits: " + bits);
+ }
+ }
+
+ private static byte[] getXXHash(final byte[] message, final Size bits) {
+ XXHashFactory factory = XXHashFactory.fastestInstance();
+ switch (bits) {
+ case BITS_32:
+ XXHash32 hash32 = factory.hash32();
+ return Bytes.toBytes(hash32.hash(message, 0, message.length, HASH_SEED));
+ case BITS_64:
+ XXHash64 hash64 = factory.hash64();
+ return Bytes.toBytes(hash64.hash(message, 0, message.length, HASH_SEED));
+ default:
+ throw new HoodieIOException("XX" + bits + " hash is unsupported!");
+ }
+ }
+
+ private static byte[] getMD5Hash(final byte[] message) throws HoodieIOException {
+ try {
+ MessageDigest messageDigest = MessageDigest.getInstance(MD5_ALGORITHM_NAME);
+ messageDigest.update(message);
+ return messageDigest.digest();
+ } catch (NoSuchAlgorithmException e) {
+ throw new HoodieIOException("Failed to create MD5 Hash: " + e);
+ }
+ }
+}
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java
new file mode 100644
index 000000000..e08e254b0
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/HoodieID.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.common.util.hash;
+
+import org.apache.hudi.exception.HoodieNotSupportedException;
+
+import java.io.Serializable;
+
+/**
+ * A serializable ID that can be used to identify any Hoodie table fields and resources.
+ */
+public abstract class HoodieID implements Serializable {
+
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * Supported ID types.
+ */
+ public enum Type {
+ COLUMN("HoodieColumnID"),
+ PARTITION("HoodiePartitionID"),
+ FILE("HoodieFileID");
+
+ private final String name;
+
+ Type(final String name) {
+ this.name = name;
+ }
+
+ @Override
+ public String toString() {
+ return "Type{name='" + name + "'}";
+ }
+ }
+
+ /**
+ * Get the number of bits representing this ID in memory.
+ *
+ * Note: Will be in multiples of 8 only.
+ *
+ * @return The number of bits in this ID
+ */
+ public abstract int bits();
+
+ /**
+ * Get this ID as a byte array.
+ *
+ * @return A byte array representing this ID
+ */
+ public abstract byte[] asBytes();
+
+ /**
+ * Get the String version of this ID.
+ *
+ * @return String version of this ID.
+ */
+ public abstract String toString();
+
+ /**
+ *
+ */
+ public String asBase64EncodedString() {
+ throw new HoodieNotSupportedException("Unsupported hash for " + getType());
+ }
+
+ /**
+ * Get the ID type.
+ *
+ * @return This ID type
+ */
+ protected abstract Type getType();
+
+ /**
+ * Is this ID a ColumnID type ?
+ *
+ * @return True if this ID of ColumnID type
+ */
+ public final boolean isColumnID() {
+ return (getType() == Type.COLUMN);
+ }
+
+ /**
+ * Is this ID a Partition type ?
+ *
+ * @return True if this ID of PartitionID type
+ */
+ public final boolean isPartition() {
+ return (getType() == Type.PARTITION);
+ }
+
+ /**
+ * Is this ID a FileID type ?
+ *
+ * @return True if this ID of FileID type
+ */
+ public final boolean isFileID() {
+ return (getType() == Type.FILE);
+ }
+
+}
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java
new file mode 100644
index 000000000..f31159faa
--- /dev/null
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/hash/PartitionID.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.common.util.hash;
+
+import org.apache.hudi.common.util.Base64CodecUtil;
+
+/**
+ * Hoodie object ID representing any partition.
+ */
+public class PartitionID extends HoodieID {
+
+ private static final Type TYPE = Type.PARTITION;
+ private static final HashID.Size ID_PARTITION_HASH_SIZE = HashID.Size.BITS_64;
+ private final byte[] hash;
+
+ public PartitionID(final String message) {
+ this.hash = HashID.hash(message, ID_PARTITION_HASH_SIZE);
+ }
+
+ @Override
+ public int bits() {
+ return ID_PARTITION_HASH_SIZE.byteSize();
+ }
+
+ @Override
+ public byte[] asBytes() {
+ return this.hash;
+ }
+
+ @Override
+ public String asBase64EncodedString() {
+ return Base64CodecUtil.encode(this.hash);
+ }
+
+ @Override
+ public String toString() {
+ return new String(this.hash);
+ }
+
+ @Override
+ protected Type getType() {
+ return TYPE;
+ }
+
+}
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java b/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java
new file mode 100644
index 000000000..de0424f42
--- /dev/null
+++ b/hudi-common/src/test/java/org/apache/hudi/common/util/hash/TestHashID.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hudi.common.util.hash;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.EnumSource;
+
+import javax.xml.bind.DatatypeConverter;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TestHashID {
+
+ /**
+ * Test HashID of all sizes for ByteArray type input message.
+ */
+ @ParameterizedTest
+ @EnumSource(HashID.Size.class)
+ public void testHashForByteInput(HashID.Size size) {
+ final int count = 8;
+ Random random = new Random();
+ for (int i = 0; i < count; i++) {
+ final String message = random.ints(50, 120)
+ .filter(j -> (j <= 57 || j >= 65) && (j <= 90 || j >= 97))
+ .limit((32 + (i * 4)))
+ .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
+ .toString();
+ final byte[] originalData = message.getBytes(StandardCharsets.UTF_8);
+ final byte[] hashBytes = HashID.hash(originalData, size);
+ assertEquals(hashBytes.length, size.byteSize());
+ }
+ }
+
+ /**
+ * Test HashID of all sizes for String type input message.
+ */
+ @ParameterizedTest
+ @EnumSource(HashID.Size.class)
+ public void testHashForStringInput(HashID.Size size) {
+ final int count = 8;
+ Random random = new Random();
+ for (int i = 0; i < count; i++) {
+ final String message = random.ints(50, 120)
+ .filter(j -> (j <= 57 || j >= 65) && (j <= 90 || j >= 97))
+ .limit((32 + (i * 4)))
+ .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
+ .toString();
+ final byte[] hashBytes = HashID.hash(message, size);
+ assertEquals(hashBytes.length, size.byteSize());
+ }
+ }
+
+ /**
+ * Test expected hash values for all bit sizes.
+ */
+ @Test
+ public void testHashValues() {
+ Map> expectedValuesMap = new HashMap>();
+ Map hash32ExpectedValues = new HashMap() {
+ {
+ put("Hudi", "FB6A3F92");
+ put("Data lake", "99913A4D");
+ put("Data Lake", "6F7DAD6A");
+ put("Col1", "B4393B9A");
+ put("A", "CDD946CE");
+ put("2021/10/28/", "BBD4FDB2");
+ }
+ };
+ expectedValuesMap.put(HashID.Size.BITS_32, hash32ExpectedValues);
+
+ Map hash64ExpectedValues = new HashMap() {
+ {
+ put("Hudi", "F7727B9A28379071");
+ put("Data lake", "52BC72D592EBCAE5");
+ put("Data Lake", "5ED19AF9FD746E3E");
+ put("Col1", "22FB1DD2F4784D31");
+ put("A", "EBF88350484B5AA7");
+ put("2021/10/28/", "2A9399AF6E7C8B12");
+ }
+ };
+ expectedValuesMap.put(HashID.Size.BITS_128, hash64ExpectedValues);
+
+ Map hash128ExpectedValues = new HashMap() {
+ {
+ put("Hudi", "09DAB749F255311C1C9EF6DD7B790170");
+ put("Data lake", "7F2FC1EA445FC81F67CAA25EC9089C08");
+ put("Data Lake", "9D2CEF0D61B02848C528A070ED75C570");
+ put("Col1", "EC0FFE21E704DE2A580661C59A81D453");
+ put("A", "7FC56270E7A70FA81A5935B72EACBE29");
+ put("2021/10/28/", "1BAE8F04F44CB7ACF2458EF5219742DC");
+ }
+ };
+ expectedValuesMap.put(HashID.Size.BITS_128, hash128ExpectedValues);
+
+ for (Map.Entry> allSizeEntries : expectedValuesMap.entrySet()) {
+ for (Map.Entry sizeEntry : allSizeEntries.getValue().entrySet()) {
+ final byte[] actualHashBytes = HashID.hash(sizeEntry.getKey(), allSizeEntries.getKey());
+ final byte[] expectedHashBytes = DatatypeConverter.parseHexBinary(sizeEntry.getValue());
+ assertTrue(Arrays.equals(expectedHashBytes, actualHashBytes));
+ }
+ }
+ }
+}