1
0

[HUDI-1295] Hash ID generator util for Hudi table columns, partition and files (#3884)

* [HUDI-1295] Hash ID generator util for Hudi table columns, partition and files

- Adding a new utility class HashID to generate 32-, 64-, and 128-bit hashes for
  any given message of string or byte array type. This class internally uses
  the MessageDigest and xxhash libraries.

- Adding stateful hash holders for Hudi table columns, partition and files to
  pass around for metaindex and to convert to base64encoded strings whenever
  needed
This commit is contained in:
Manoj Govindassamy
2021-10-29 16:19:38 -07:00
committed by GitHub
parent 5b1992a92d
commit f6326693ef
8 changed files with 571 additions and 3 deletions

View File

@@ -26,11 +26,11 @@ public final class Base64CodecUtil {
/**
* Decodes data from the input string using the Base64 encoding scheme.
*
* @param serString
* @param encodedString - Base64 encoded string to decode
* @return A newly-allocated byte array containing the decoded bytes.
*/
public static byte[] decode(String serString) {
return Base64.getDecoder().decode(serString.getBytes(StandardCharsets.UTF_8));
public static byte[] decode(String encodedString) {
return Base64.getDecoder().decode(encodedString.getBytes(StandardCharsets.UTF_8));
}
/**

View File

@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.util.hash;
import org.apache.hudi.common.util.Base64CodecUtil;
/**
 * A stateful Hoodie object ID representing any table column.
 * <p>
 * The ID is the {@link HashID.Size#BITS_64} hash of the message supplied at construction
 * time. The hash is computed once in the constructor and kept for the lifetime of the object.
 */
public class ColumnID extends HoodieID {

  private static final Type TYPE = Type.COLUMN;
  private static final HashID.Size ID_COLUMN_HASH_SIZE = HashID.Size.BITS_64;

  private final byte[] hash;

  /**
   * Creates a column ID by hashing the given message.
   *
   * @param message - String to derive the ID from
   */
  public ColumnID(final String message) {
    this.hash = HashID.hash(message, ID_COLUMN_HASH_SIZE);
  }

  /**
   * Get the number of bits in this ID.
   *
   * @return The hash width in bits
   */
  @Override
  public int bits() {
    // The HoodieID contract asks for the size in bits; byteSize() would report bytes here.
    return ID_COLUMN_HASH_SIZE.bits();
  }

  /**
   * Get this ID as a byte array.
   * <p>
   * The backing array itself is returned (no copy is made), so callers must not modify it.
   *
   * @return The raw hash bytes of this ID
   */
  @Override
  public byte[] asBytes() {
    return this.hash;
  }

  /**
   * Get this ID as a Base64 encoded string.
   *
   * @return Base64 encoding of the raw hash bytes
   */
  @Override
  public String asBase64EncodedString() {
    return Base64CodecUtil.encode(this.hash);
  }

  /**
   * Get the String version of this ID.
   * <p>
   * The raw hash bytes are decoded as UTF-8 so that the result does not depend on the
   * platform default charset. Hash bytes are not text, so the result may contain
   * replacement characters; use {@link #asBase64EncodedString()} for a printable form.
   *
   * @return The hash bytes decoded as a UTF-8 string
   */
  @Override
  public String toString() {
    return new String(this.hash, java.nio.charset.StandardCharsets.UTF_8);
  }

  @Override
  protected Type getType() {
    return TYPE;
  }
}

View File

@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.util.hash;
import org.apache.hudi.common.util.Base64CodecUtil;
/**
 * Hoodie object ID representing any file.
 * <p>
 * The ID is the {@link HashID.Size#BITS_128} hash of the message supplied at construction
 * time. The hash is computed once in the constructor and kept for the lifetime of the object.
 */
public class FileID extends HoodieID {

  private static final Type TYPE = Type.FILE;
  private static final HashID.Size ID_FILE_HASH_SIZE = HashID.Size.BITS_128;

  private final byte[] hash;

  /**
   * Creates a file ID by hashing the given message.
   *
   * @param message - String to derive the ID from
   */
  public FileID(final String message) {
    this.hash = HashID.hash(message, ID_FILE_HASH_SIZE);
  }

  /**
   * Get the number of bits in this ID.
   *
   * @return The hash width in bits
   */
  @Override
  public int bits() {
    // The HoodieID contract asks for the size in bits; byteSize() would report bytes here.
    return ID_FILE_HASH_SIZE.bits();
  }

  /**
   * Get this ID as a byte array.
   * <p>
   * The backing array itself is returned (no copy is made), so callers must not modify it.
   *
   * @return The raw hash bytes of this ID
   */
  @Override
  public byte[] asBytes() {
    return this.hash;
  }

  /**
   * Get this ID as a Base64 encoded string.
   *
   * @return Base64 encoding of the raw hash bytes
   */
  @Override
  public String asBase64EncodedString() {
    return Base64CodecUtil.encode(this.hash);
  }

  /**
   * Get the String version of this ID.
   * <p>
   * The raw hash bytes are decoded as UTF-8 so that the result does not depend on the
   * platform default charset. Hash bytes are not text, so the result may contain
   * replacement characters; use {@link #asBase64EncodedString()} for a printable form.
   *
   * @return The hash bytes decoded as a UTF-8 string
   */
  @Override
  public String toString() {
    return new String(this.hash, java.nio.charset.StandardCharsets.UTF_8);
  }

  @Override
  protected Type getType() {
    return TYPE;
  }
}

View File

@@ -0,0 +1,132 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.util.hash;
import net.jpountz.xxhash.XXHash32;
import net.jpountz.xxhash.XXHash64;
import net.jpountz.xxhash.XXHashFactory;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hudi.exception.HoodieIOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
/**
 * Stateless hashing helper that produces an ID of a requested width for a string or
 * byte array message. 32 and 64 bit IDs are computed with xxHash, 128 bit IDs with MD5.
 */
public class HashID implements Serializable {

  private static final String MD5_ALGORITHM_NAME = "MD5";
  private static final int HASH_SEED = 0xdabadaba;

  /**
   * The supported hash widths, expressed in bits.
   */
  public enum Size {
    BITS_32(32),
    BITS_64(64),
    BITS_128(128);

    private final int bits;

    Size(int bitCount) {
      this.bits = bitCount;
    }

    /**
     * Get the number of bytes required to hold a hash of this size.
     *
     * @return Byte count, rounded up to a whole byte
     */
    public int byteSize() {
      final int wholeBytesBeforeLastBit = (this.bits - 1) / Byte.SIZE;
      return wholeBytesBeforeLastBit + 1;
    }

    /**
     * Get the width of this hash size in bits.
     *
     * @return Bit count of this size
     */
    public int bits() {
      return this.bits;
    }

    @Override
    public String toString() {
      return "HashSize{" + bits + "}";
    }
  }

  /**
   * Hash a string message to the requested {@link Size}. The string is converted to its
   * UTF-8 bytes before hashing.
   *
   * @param message - String message to hash
   * @param bits    - {@link Size} of the hash to produce
   * @return The hash of the message as a byte array
   */
  public static byte[] hash(final String message, final Size bits) {
    final byte[] utf8Bytes = message.getBytes(StandardCharsets.UTF_8);
    return hash(utf8Bytes, bits);
  }

  /**
   * Hash a byte array message to the requested {@link Size}.
   *
   * @param messageBytes - Bytes to hash
   * @param bits         - {@link Size} of the hash to produce
   * @return The hash of the message as a byte array
   * @throws IllegalArgumentException if the size is not one of the supported values
   */
  public static byte[] hash(final byte[] messageBytes, final Size bits) {
    switch (bits) {
      case BITS_128:
        return getMD5Hash(messageBytes);
      case BITS_32:
      case BITS_64:
        return getXXHash(messageBytes, bits);
      default:
        throw new IllegalArgumentException("Unexpected Hash size bits: " + bits);
    }
  }

  /**
   * Compute the seeded 32 or 64 bit xxHash of the message and return it as bytes.
   */
  private static byte[] getXXHash(final byte[] message, final Size bits) {
    final XXHashFactory xxFactory = XXHashFactory.fastestInstance();
    switch (bits) {
      case BITS_64: {
        final XXHash64 hasher64 = xxFactory.hash64();
        final long digest64 = hasher64.hash(message, 0, message.length, HASH_SEED);
        return Bytes.toBytes(digest64);
      }
      case BITS_32: {
        final XXHash32 hasher32 = xxFactory.hash32();
        final int digest32 = hasher32.hash(message, 0, message.length, HASH_SEED);
        return Bytes.toBytes(digest32);
      }
      default:
        throw new HoodieIOException("XX" + bits + " hash is unsupported!");
    }
  }

  /**
   * Compute the MD5 digest of the message.
   */
  private static byte[] getMD5Hash(final byte[] message) throws HoodieIOException {
    try {
      // digest(byte[]) feeds the input to the digest and then finalizes it.
      return MessageDigest.getInstance(MD5_ALGORITHM_NAME).digest(message);
    } catch (NoSuchAlgorithmException e) {
      throw new HoodieIOException("Failed to create MD5 Hash: " + e);
    }
  }
}

View File

@@ -0,0 +1,117 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.util.hash;
import org.apache.hudi.exception.HoodieNotSupportedException;
import java.io.Serializable;
/**
 * Base class for serializable IDs that identify Hoodie table fields and resources
 * (columns, partitions and files).
 */
public abstract class HoodieID implements Serializable {

  private static final long serialVersionUID = 1L;

  /**
   * The kinds of ID that are supported.
   */
  public enum Type {
    COLUMN("HoodieColumnID"),
    PARTITION("HoodiePartitionID"),
    FILE("HoodieFileID");

    private final String label;

    Type(final String label) {
      this.label = label;
    }

    @Override
    public String toString() {
      return "Type{name='" + label + "'}";
    }
  }

  /**
   * Get the number of bits representing this ID in memory.
   * <p>
   * Note: Will be in multiples of 8 only.
   *
   * @return The number of bits in this ID
   */
  public abstract int bits();

  /**
   * Get the raw bytes of this ID.
   *
   * @return A byte array representing this ID
   */
  public abstract byte[] asBytes();

  /**
   * Get the String form of this ID.
   *
   * @return String version of this ID.
   */
  @Override
  public abstract String toString();

  /**
   * Get this ID as a Base64 encoded string.
   * <p>
   * This base implementation always throws; subclasses that support a Base64 form
   * override it.
   *
   * @return Base64 encoded string form of this ID
   * @throws HoodieNotSupportedException always, unless overridden
   */
  public String asBase64EncodedString() {
    final String reason = "Unsupported hash for " + getType();
    throw new HoodieNotSupportedException(reason);
  }

  /**
   * Get the type of this ID.
   *
   * @return This ID type
   */
  protected abstract Type getType();

  /**
   * Is this ID a ColumnID type ?
   *
   * @return True if this ID is of ColumnID type
   */
  public final boolean isColumnID() {
    return Type.COLUMN == getType();
  }

  /**
   * Is this ID a Partition type ?
   *
   * @return True if this ID is of PartitionID type
   */
  public final boolean isPartition() {
    return Type.PARTITION == getType();
  }

  /**
   * Is this ID a FileID type ?
   *
   * @return True if this ID is of FileID type
   */
  public final boolean isFileID() {
    return Type.FILE == getType();
  }
}

View File

@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.util.hash;
import org.apache.hudi.common.util.Base64CodecUtil;
/**
 * Hoodie object ID representing any partition.
 * <p>
 * The ID is the {@link HashID.Size#BITS_64} hash of the message supplied at construction
 * time. The hash is computed once in the constructor and kept for the lifetime of the object.
 */
public class PartitionID extends HoodieID {

  private static final Type TYPE = Type.PARTITION;
  private static final HashID.Size ID_PARTITION_HASH_SIZE = HashID.Size.BITS_64;

  private final byte[] hash;

  /**
   * Creates a partition ID by hashing the given message.
   *
   * @param message - String to derive the ID from
   */
  public PartitionID(final String message) {
    this.hash = HashID.hash(message, ID_PARTITION_HASH_SIZE);
  }

  /**
   * Get the number of bits in this ID.
   *
   * @return The hash width in bits
   */
  @Override
  public int bits() {
    // The HoodieID contract asks for the size in bits; byteSize() would report bytes here.
    return ID_PARTITION_HASH_SIZE.bits();
  }

  /**
   * Get this ID as a byte array.
   * <p>
   * The backing array itself is returned (no copy is made), so callers must not modify it.
   *
   * @return The raw hash bytes of this ID
   */
  @Override
  public byte[] asBytes() {
    return this.hash;
  }

  /**
   * Get this ID as a Base64 encoded string.
   *
   * @return Base64 encoding of the raw hash bytes
   */
  @Override
  public String asBase64EncodedString() {
    return Base64CodecUtil.encode(this.hash);
  }

  /**
   * Get the String version of this ID.
   * <p>
   * The raw hash bytes are decoded as UTF-8 so that the result does not depend on the
   * platform default charset. Hash bytes are not text, so the result may contain
   * replacement characters; use {@link #asBase64EncodedString()} for a printable form.
   *
   * @return The hash bytes decoded as a UTF-8 string
   */
  @Override
  public String toString() {
    return new String(this.hash, java.nio.charset.StandardCharsets.UTF_8);
  }

  @Override
  protected Type getType() {
    return TYPE;
  }
}