[HUDI-1295] Hash ID generator util for Hudi table columns, partition and files (#3884)
* [HUDI-1295] Hash ID generator util for Hudi table columns, partition and files - Adding a new utility class HashID to generate 32,64,128 bits hashes for any given message of string or byte array type. This class internally uses MessageDigest and xxhash libraries. - Adding stateful hash holders for Hudi table columns, partition and files to pass around for metaindex and to convert to base64encoded strings whenever needed
This commit is contained in:
committed by
GitHub
parent
5b1992a92d
commit
f6326693ef
@@ -248,5 +248,13 @@
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<!-- LZ4 Hash Utils -->
|
||||
<dependency>
|
||||
<groupId>org.lz4</groupId>
|
||||
<artifactId>lz4-java</artifactId>
|
||||
<version>1.8.0</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</project>
|
||||
|
||||
@@ -26,11 +26,11 @@ public final class Base64CodecUtil {
|
||||
/**
|
||||
* Decodes data from the input string into using the encoding scheme.
|
||||
*
|
||||
* @param serString
|
||||
* @param encodedString - Base64 encoded string to decode
|
||||
* @return A newly-allocated byte array containing the decoded bytes.
|
||||
*/
|
||||
public static byte[] decode(String serString) {
|
||||
return Base64.getDecoder().decode(serString.getBytes(StandardCharsets.UTF_8));
|
||||
public static byte[] decode(String encodedString) {
|
||||
return Base64.getDecoder().decode(encodedString.getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.util.hash;
|
||||
|
||||
import org.apache.hudi.common.util.Base64CodecUtil;
|
||||
|
||||
/**
|
||||
* A stateful Hoodie object ID representing any table column.
|
||||
*/
|
||||
public class ColumnID extends HoodieID {
|
||||
|
||||
private static final Type TYPE = Type.COLUMN;
|
||||
private static final HashID.Size ID_COLUMN_HASH_SIZE = HashID.Size.BITS_64;
|
||||
private final byte[] hash;
|
||||
|
||||
public ColumnID(final String message) {
|
||||
this.hash = HashID.hash(message, ID_COLUMN_HASH_SIZE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int bits() {
|
||||
return ID_COLUMN_HASH_SIZE.byteSize();
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] asBytes() {
|
||||
return this.hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String asBase64EncodedString() {
|
||||
return Base64CodecUtil.encode(this.hash);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new String(this.hash);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Type getType() {
|
||||
return TYPE;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.util.hash;
|
||||
|
||||
import org.apache.hudi.common.util.Base64CodecUtil;
|
||||
|
||||
/**
|
||||
* Hoodie object ID representing any file.
|
||||
*/
|
||||
public class FileID extends HoodieID {
|
||||
|
||||
private static final Type TYPE = Type.FILE;
|
||||
private static final HashID.Size ID_FILE_HASH_SIZE = HashID.Size.BITS_128;
|
||||
private final byte[] hash;
|
||||
|
||||
public FileID(final String message) {
|
||||
this.hash = HashID.hash(message, ID_FILE_HASH_SIZE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int bits() {
|
||||
return ID_FILE_HASH_SIZE.byteSize();
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] asBytes() {
|
||||
return this.hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String asBase64EncodedString() {
|
||||
return Base64CodecUtil.encode(this.hash);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new String(this.hash);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Type getType() {
|
||||
return TYPE;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,132 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.util.hash;
|
||||
|
||||
import net.jpountz.xxhash.XXHash32;
|
||||
import net.jpountz.xxhash.XXHash64;
|
||||
import net.jpountz.xxhash.XXHashFactory;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
|
||||
/**
|
||||
* A stateless Hash class which generates ID for the desired bit count.
|
||||
*/
|
||||
public class HashID implements Serializable {
|
||||
|
||||
private static final String MD5_ALGORITHM_NAME = "MD5";
|
||||
private static final int HASH_SEED = 0xdabadaba;
|
||||
|
||||
/**
|
||||
* Represents HashID size in bits.
|
||||
*/
|
||||
public enum Size {
|
||||
BITS_32(32),
|
||||
BITS_64(64),
|
||||
BITS_128(128);
|
||||
|
||||
private final int bits;
|
||||
|
||||
Size(int bitCount) {
|
||||
this.bits = bitCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get this Hash size in bytes.
|
||||
*
|
||||
* @return Bytes needed to represent this size
|
||||
*/
|
||||
public int byteSize() {
|
||||
return (((this.bits - 1) / Byte.SIZE) + 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get this Hash size in bits.
|
||||
*
|
||||
* @return bits needed to represent the size
|
||||
*/
|
||||
public int bits() {
|
||||
return this.bits;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "HashSize{" + bits + "}";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the hash value for a string message and for the desired @{@link Size}.
|
||||
*
|
||||
* @param message - String message to get the hash value for
|
||||
* @param bits - @{@link Size} of the hash value
|
||||
* @return Hash value for the message as byte array
|
||||
*/
|
||||
public static byte[] hash(final String message, final Size bits) {
|
||||
return hash(message.getBytes(StandardCharsets.UTF_8), bits);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the hash value for a byte array and for the desired @{@link Size}.
|
||||
*
|
||||
* @param messageBytes - Byte array message to get the hash value for
|
||||
* @param bits - @{@link Size} of the hash value
|
||||
* @return Hash value for the message as byte array
|
||||
*/
|
||||
public static byte[] hash(final byte[] messageBytes, final Size bits) {
|
||||
switch (bits) {
|
||||
case BITS_32:
|
||||
case BITS_64:
|
||||
return getXXHash(messageBytes, bits);
|
||||
case BITS_128:
|
||||
return getMD5Hash(messageBytes);
|
||||
default:
|
||||
throw new IllegalArgumentException("Unexpected Hash size bits: " + bits);
|
||||
}
|
||||
}
|
||||
|
||||
private static byte[] getXXHash(final byte[] message, final Size bits) {
|
||||
XXHashFactory factory = XXHashFactory.fastestInstance();
|
||||
switch (bits) {
|
||||
case BITS_32:
|
||||
XXHash32 hash32 = factory.hash32();
|
||||
return Bytes.toBytes(hash32.hash(message, 0, message.length, HASH_SEED));
|
||||
case BITS_64:
|
||||
XXHash64 hash64 = factory.hash64();
|
||||
return Bytes.toBytes(hash64.hash(message, 0, message.length, HASH_SEED));
|
||||
default:
|
||||
throw new HoodieIOException("XX" + bits + " hash is unsupported!");
|
||||
}
|
||||
}
|
||||
|
||||
private static byte[] getMD5Hash(final byte[] message) throws HoodieIOException {
|
||||
try {
|
||||
MessageDigest messageDigest = MessageDigest.getInstance(MD5_ALGORITHM_NAME);
|
||||
messageDigest.update(message);
|
||||
return messageDigest.digest();
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new HoodieIOException("Failed to create MD5 Hash: " + e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.util.hash;
|
||||
|
||||
import org.apache.hudi.exception.HoodieNotSupportedException;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* A serializable ID that can be used to identify any Hoodie table fields and resources.
|
||||
*/
|
||||
public abstract class HoodieID implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
/**
|
||||
* Supported ID types.
|
||||
*/
|
||||
public enum Type {
|
||||
COLUMN("HoodieColumnID"),
|
||||
PARTITION("HoodiePartitionID"),
|
||||
FILE("HoodieFileID");
|
||||
|
||||
private final String name;
|
||||
|
||||
Type(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Type{name='" + name + "'}";
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of bits representing this ID in memory.
|
||||
* <p>
|
||||
* Note: Will be in multiples of 8 only.
|
||||
*
|
||||
* @return The number of bits in this ID
|
||||
*/
|
||||
public abstract int bits();
|
||||
|
||||
/**
|
||||
* Get this ID as a byte array.
|
||||
*
|
||||
* @return A byte array representing this ID
|
||||
*/
|
||||
public abstract byte[] asBytes();
|
||||
|
||||
/**
|
||||
* Get the String version of this ID.
|
||||
*
|
||||
* @return String version of this ID.
|
||||
*/
|
||||
public abstract String toString();
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public String asBase64EncodedString() {
|
||||
throw new HoodieNotSupportedException("Unsupported hash for " + getType());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the ID type.
|
||||
*
|
||||
* @return This ID type
|
||||
*/
|
||||
protected abstract Type getType();
|
||||
|
||||
/**
|
||||
* Is this ID a ColumnID type ?
|
||||
*
|
||||
* @return True if this ID of ColumnID type
|
||||
*/
|
||||
public final boolean isColumnID() {
|
||||
return (getType() == Type.COLUMN);
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this ID a Partition type ?
|
||||
*
|
||||
* @return True if this ID of PartitionID type
|
||||
*/
|
||||
public final boolean isPartition() {
|
||||
return (getType() == Type.PARTITION);
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this ID a FileID type ?
|
||||
*
|
||||
* @return True if this ID of FileID type
|
||||
*/
|
||||
public final boolean isFileID() {
|
||||
return (getType() == Type.FILE);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.util.hash;
|
||||
|
||||
import org.apache.hudi.common.util.Base64CodecUtil;
|
||||
|
||||
/**
|
||||
* Hoodie object ID representing any partition.
|
||||
*/
|
||||
public class PartitionID extends HoodieID {
|
||||
|
||||
private static final Type TYPE = Type.PARTITION;
|
||||
private static final HashID.Size ID_PARTITION_HASH_SIZE = HashID.Size.BITS_64;
|
||||
private final byte[] hash;
|
||||
|
||||
public PartitionID(final String message) {
|
||||
this.hash = HashID.hash(message, ID_PARTITION_HASH_SIZE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int bits() {
|
||||
return ID_PARTITION_HASH_SIZE.byteSize();
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] asBytes() {
|
||||
return this.hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String asBase64EncodedString() {
|
||||
return Base64CodecUtil.encode(this.hash);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new String(this.hash);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Type getType() {
|
||||
return TYPE;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,127 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.util.hash;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.EnumSource;
|
||||
|
||||
import javax.xml.bind.DatatypeConverter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
public class TestHashID {
|
||||
|
||||
/**
|
||||
* Test HashID of all sizes for ByteArray type input message.
|
||||
*/
|
||||
@ParameterizedTest
|
||||
@EnumSource(HashID.Size.class)
|
||||
public void testHashForByteInput(HashID.Size size) {
|
||||
final int count = 8;
|
||||
Random random = new Random();
|
||||
for (int i = 0; i < count; i++) {
|
||||
final String message = random.ints(50, 120)
|
||||
.filter(j -> (j <= 57 || j >= 65) && (j <= 90 || j >= 97))
|
||||
.limit((32 + (i * 4)))
|
||||
.collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
|
||||
.toString();
|
||||
final byte[] originalData = message.getBytes(StandardCharsets.UTF_8);
|
||||
final byte[] hashBytes = HashID.hash(originalData, size);
|
||||
assertEquals(hashBytes.length, size.byteSize());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test HashID of all sizes for String type input message.
|
||||
*/
|
||||
@ParameterizedTest
|
||||
@EnumSource(HashID.Size.class)
|
||||
public void testHashForStringInput(HashID.Size size) {
|
||||
final int count = 8;
|
||||
Random random = new Random();
|
||||
for (int i = 0; i < count; i++) {
|
||||
final String message = random.ints(50, 120)
|
||||
.filter(j -> (j <= 57 || j >= 65) && (j <= 90 || j >= 97))
|
||||
.limit((32 + (i * 4)))
|
||||
.collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
|
||||
.toString();
|
||||
final byte[] hashBytes = HashID.hash(message, size);
|
||||
assertEquals(hashBytes.length, size.byteSize());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test expected hash values for all bit sizes.
|
||||
*/
|
||||
@Test
|
||||
public void testHashValues() {
|
||||
Map<HashID.Size, Map<String, String>> expectedValuesMap = new HashMap<HashID.Size, Map<String, String>>();
|
||||
Map<String, String> hash32ExpectedValues = new HashMap<String, String>() {
|
||||
{
|
||||
put("Hudi", "FB6A3F92");
|
||||
put("Data lake", "99913A4D");
|
||||
put("Data Lake", "6F7DAD6A");
|
||||
put("Col1", "B4393B9A");
|
||||
put("A", "CDD946CE");
|
||||
put("2021/10/28/", "BBD4FDB2");
|
||||
}
|
||||
};
|
||||
expectedValuesMap.put(HashID.Size.BITS_32, hash32ExpectedValues);
|
||||
|
||||
Map<String, String> hash64ExpectedValues = new HashMap<String, String>() {
|
||||
{
|
||||
put("Hudi", "F7727B9A28379071");
|
||||
put("Data lake", "52BC72D592EBCAE5");
|
||||
put("Data Lake", "5ED19AF9FD746E3E");
|
||||
put("Col1", "22FB1DD2F4784D31");
|
||||
put("A", "EBF88350484B5AA7");
|
||||
put("2021/10/28/", "2A9399AF6E7C8B12");
|
||||
}
|
||||
};
|
||||
expectedValuesMap.put(HashID.Size.BITS_128, hash64ExpectedValues);
|
||||
|
||||
Map<String, String> hash128ExpectedValues = new HashMap<String, String>() {
|
||||
{
|
||||
put("Hudi", "09DAB749F255311C1C9EF6DD7B790170");
|
||||
put("Data lake", "7F2FC1EA445FC81F67CAA25EC9089C08");
|
||||
put("Data Lake", "9D2CEF0D61B02848C528A070ED75C570");
|
||||
put("Col1", "EC0FFE21E704DE2A580661C59A81D453");
|
||||
put("A", "7FC56270E7A70FA81A5935B72EACBE29");
|
||||
put("2021/10/28/", "1BAE8F04F44CB7ACF2458EF5219742DC");
|
||||
}
|
||||
};
|
||||
expectedValuesMap.put(HashID.Size.BITS_128, hash128ExpectedValues);
|
||||
|
||||
for (Map.Entry<HashID.Size, Map<String, String>> allSizeEntries : expectedValuesMap.entrySet()) {
|
||||
for (Map.Entry<String, String> sizeEntry : allSizeEntries.getValue().entrySet()) {
|
||||
final byte[] actualHashBytes = HashID.hash(sizeEntry.getKey(), allSizeEntries.getKey());
|
||||
final byte[] expectedHashBytes = DatatypeConverter.parseHexBinary(sizeEntry.getValue());
|
||||
assertTrue(Arrays.equals(expectedHashBytes, actualHashBytes));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user