1
0

[HUDI-2814] Make Z-index more generic Column-Stats Index (#4106)

This commit is contained in:
Alexey Kudinkin
2021-12-10 14:56:09 -08:00
committed by GitHub
parent 72901a33a1
commit 2d864f7524
23 changed files with 892 additions and 1244 deletions

View File

@@ -79,7 +79,7 @@ public class HoodieTableMetaClient implements Serializable {
public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".aux";
public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap";
public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat";
public static final String ZINDEX_NAME = ".zindex";
public static final String COLUMN_STATISTICS_INDEX_NAME = ".colstatsindex";
public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH
+ Path.SEPARATOR + ".partitions";
public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR
@@ -178,10 +178,10 @@ public class HoodieTableMetaClient implements Serializable {
}
/**
* @return z-index path
* @return Column Statistics index path
*/
public String getZindexPath() {
return new Path(metaPath, ZINDEX_NAME).toString();
public String getColumnStatsIndexPath() {
  // Resolve the Column Statistics index folder as a child of the table's meta path
  Path columnStatsIndexDir = new Path(metaPath, COLUMN_STATISTICS_INDEX_NAME);
  return columnStatsIndexDir.toString();
}
/**

View File

@@ -0,0 +1,191 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import java.nio.charset.Charset;
/**
 * Byte-level helpers: unsigned lexicographic comparison of byte ranges, bit interleaving of
 * several byte arrays, and conversions of primitives/strings into fixed-width (8 byte)
 * big-endian representations.
 */
public class BinaryUtil {

  /** Charset used to encode strings; resolved once instead of on every call. */
  private static final Charset UTF_8 = Charset.forName("utf-8");

  /**
   * Lexicographically compares two byte ranges, treating every byte as an unsigned value.
   * Adapted from HBase.
   *
   * @param buffer1 left operand
   * @param offset1 where to start comparing in the left buffer
   * @param length1 how many bytes to compare from the left buffer
   * @param buffer2 right operand
   * @param offset2 where to start comparing in the right buffer
   * @param length2 how many bytes to compare from the right buffer
   * @return 0 if equal, a negative value if left is less than right, a positive value otherwise;
   *         when one range is a prefix of the other, the shorter range is the smaller one
   */
  public static int compareTo(byte[] buffer1, int offset1, int length1,
                              byte[] buffer2, int offset2, int length2) {
    // Short circuit: the very same range of the very same array
    if (buffer1 == buffer2 && offset1 == offset2 && length1 == length2) {
      return 0;
    }
    int end1 = offset1 + length1;
    int end2 = offset2 + length2;
    for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) {
      // Mask to compare as unsigned values in [0, 255]; the difference cannot overflow
      int a = buffer1[i] & 0xff;
      int b = buffer2[j] & 0xff;
      if (a != b) {
        return a - b;
      }
    }
    return length1 - length2;
  }

  /**
   * Normalizes the provided array to exactly 8 bytes.
   *
   * <ul>
   *   <li>An array of exactly 8 bytes is returned as is (same instance, no copy)</li>
   *   <li>A longer array is truncated to its first 8 bytes</li>
   *   <li>A shorter array is left-padded with zero bytes</li>
   * </ul>
   *
   * @param a source bytes
   * @return array of exactly 8 bytes
   */
  public static byte[] paddingTo8Byte(byte[] a) {
    if (a.length == 8) {
      return a;
    }
    // A freshly allocated array is already zero-filled, so only the payload has to be copied
    byte[] result = new byte[8];
    if (a.length > 8) {
      System.arraycopy(a, 0, result, 0, 8);
    } else {
      System.arraycopy(a, 0, result, 8 - a.length, a.length);
    }
    return result;
  }

  /**
   * Interleaves the bits of the provided byte arrays.
   *
   * Interleaving takes the first (most significant) bit of the first element, then the first
   * bit of the second element, and so on; then the second bit of every element, all the way
   * to the last bit of the last element. The bits are written to the output in that order.
   *
   * @param buffer elements to interleave; every element is read up to {@code size} bytes
   * @param size   number of leading bytes of every element taking part in the interleaving
   * @return array of {@code size * buffer.length} bytes holding the interleaved bits
   */
  public static byte[] interleaving(byte[][] buffer, int size) {
    int candidateSize = buffer.length;
    byte[] result = new byte[size * candidateSize];
    int resBitPos = 0;
    int totalBits = size * 8;
    for (int bitStep = 0; bitStep < totalBits; bitStep++) {
      int currentBytePos = bitStep / 8;
      int currentBitPos = bitStep % 8;
      for (int i = 0; i < candidateSize; i++) {
        int resBytePos = resBitPos / 8;
        result[resBytePos] = updatePos(result[resBytePos], resBitPos % 8, buffer[i][currentBytePos], currentBitPos);
        resBitPos++;
      }
    }
    return result;
  }

  /**
   * Copies a single bit of {@code b} into {@code a}. Bit positions are counted from the most
   * significant bit (position 0) to the least significant one (position 7).
   *
   * @param a    byte receiving the bit
   * @param apos position of the bit in {@code a} to overwrite
   * @param b    byte providing the bit
   * @param bpos position of the bit in {@code b} to read
   * @return {@code a} with the bit at {@code apos} equal to the bit of {@code b} at {@code bpos}
   */
  public static byte updatePos(byte a, int apos, byte b, int bpos) {
    // Test and set the bit through masks rather than shifting the isolated bit around: a
    // right shift of a byte holding only the top bit sign-extends and corrupts the result
    int targetMask = 1 << (7 - apos);
    boolean sourceBitSet = (b & (1 << (7 - bpos))) != 0;
    return (byte) (sourceBitSet ? (a | targetMask) : (a & ~targetMask));
  }

  /**
   * @param val value to encode
   * @return 4 byte big-endian representation of {@code val}
   */
  public static byte[] toBytes(int val) {
    byte[] b = new byte[4];
    for (int i = 3; i > 0; i--) {
      b[i] = (byte) val;
      val >>>= 8;
    }
    b[0] = (byte) val;
    return b;
  }

  /**
   * @param val value to encode
   * @return 8 byte big-endian representation of {@code val}
   */
  public static byte[] toBytes(long val) {
    long temp = val;
    byte[] b = new byte[8];
    for (int i = 7; i > 0; i--) {
      b[i] = (byte) temp;
      temp >>>= 8;
    }
    b[0] = (byte) temp;
    return b;
  }

  /**
   * @param d value to encode
   * @return 8 byte big-endian representation of the raw IEEE 754 bits of {@code d}
   */
  public static byte[] toBytes(final double d) {
    return toBytes(Double.doubleToRawLongBits(d));
  }

  /**
   * Encodes an int as 8 bytes: the sign bit is flipped (so that negative values compare lower
   * than non-negative ones byte-wise) and the resulting 4 bytes are left-padded with zeros.
   */
  public static byte[] intTo8Byte(int a) {
    return paddingTo8Byte(toBytes(a ^ (1 << 31)));
  }

  /**
   * Encodes a single byte as 8 bytes by left-padding it with zeros. The byte itself is stored
   * unchanged (its sign bit is not flipped).
   */
  public static byte[] byteTo8Byte(byte a) {
    return paddingTo8Byte(new byte[] { a });
  }

  /**
   * Encodes a long as 8 big-endian bytes with the sign bit flipped, so that negative values
   * compare lower than non-negative ones byte-wise.
   */
  public static byte[] longTo8Byte(long a) {
    return toBytes(a ^ (1L << 63));
  }

  /**
   * Encodes a double as 8 bytes whose unsigned lexicographic order follows the numeric order
   * of the source values.
   *
   * Values with the sign bit clear (including {@code +0.0}) get the sign bit set; values with
   * the sign bit set (including {@code -0.0}) get all of their bits inverted. The decision is
   * taken on the raw sign bit rather than on comparison against zero, so that zero is
   * transformed as well and sorts between negative and positive values.
   */
  public static byte[] doubleTo8Byte(double a) {
    byte[] temp = toBytes(a);
    if ((temp[0] & 0x80) == 0) {
      temp[0] = (byte) (temp[0] ^ (1 << 7));
    } else {
      for (int i = 0; i < temp.length; i++) {
        temp[i] = (byte) ~temp[i];
      }
    }
    return temp;
  }

  /**
   * Encodes a string as 8 bytes: its UTF-8 bytes normalized by {@link #paddingTo8Byte(byte[])}
   * (truncated to the first 8 bytes, or left-padded with zeros when shorter).
   */
  public static byte[] utf8To8Byte(String a) {
    return paddingTo8Byte(a.getBytes(UTF_8));
  }

  /**
   * Converts a string to a long by interpreting its 8 byte UTF-8 form
   * (see {@link #utf8To8Byte(String)}) as a big-endian value.
   */
  public static Long convertStringToLong(String a) {
    byte[] bytes = utf8To8Byte(a);
    return convertBytesToLong(bytes);
  }

  /**
   * Interprets the provided bytes, normalized to 8 bytes by {@link #paddingTo8Byte(byte[])},
   * as a big-endian long.
   */
  public static long convertBytesToLong(byte[] bytes) {
    byte[] paddedBytes = paddingTo8Byte(bytes);
    long temp = 0L;
    for (byte paddedByte : paddedBytes) {
      temp = (temp << 8) | (paddedByte & 0xffL);
    }
    return temp;
  }
}

View File

@@ -18,15 +18,17 @@
package org.apache.hudi.common.util;
import java.util.Properties;
import org.apache.hudi.common.util.collection.Pair;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@@ -35,6 +37,36 @@ public class CollectionUtils {
public static final Properties EMPTY_PROPERTIES = new Properties();
/**
 * Concatenates the two provided {@link List}s into a new list: all elements of {@code one}
 * first, followed by all elements of {@code another}
 */
public static <E> List<E> combine(List<E> one, List<E> another) {
  final List<E> merged = new ArrayList<>(one);
  // Append the second list after the copied elements of the first
  merged.addAll(another);
  return merged;
}
/**
 * Returns a new {@link Set} holding the elements of {@code one} that are not
 * contained in {@code another}
 */
public static <E> Set<E> diff(Set<E> one, Set<E> another) {
  final Set<E> remaining = new HashSet<>(one);
  // Drop everything that is also present in the other set
  remaining.removeAll(another);
  return remaining;
}
/**
 * Returns a new {@link List} holding the elements of {@code one} that are not
 * contained in {@code another}, in their original order
 *
 * NOTE: This is a less optimal counterpart to {@link #diff(Set, Set)}, accepting {@link List}
 *       as a holding collection to support use-cases with duplicate elements
 */
public static <E> List<E> diff(List<E> one, List<E> another) {
  final List<E> remaining = new ArrayList<>(one);
  // Removes every occurrence (duplicates included) of the elements found in the other list
  remaining.removeAll(another);
  return remaining;
}
/**
* Determines whether two iterators contain equal elements in the same order. More specifically,
* this method returns {@code true} if {@code iterator1} and {@code iterator2} contain the same

View File

@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import javax.annotation.Nonnull;
import java.util.Arrays;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
 * Static helpers for working with types; not meant to be instantiated.
 */
public final class TypeUtils {

  private TypeUtils() {}

  /**
   * Builds a lookup {@link Map} for the constants of the provided Enum {@link Class}: every
   * constant is keyed by the value that {@code valueMapper} extracts from it
   *
   * @param klass       enum class whose constants are being indexed
   * @param valueMapper extracts the lookup key from an enum constant
   * @return map from extracted key to the corresponding enum constant
   */
  public static <EnumT extends Enum<EnumT>> Map<String, EnumT> getValueToEnumMap(
      @Nonnull Class<EnumT> klass,
      @Nonnull Function<EnumT, String> valueMapper
  ) {
    EnumT[] constants = klass.getEnumConstants();
    return Arrays.stream(constants)
        .collect(Collectors.toMap(valueMapper, constant -> constant));
  }
}