1
0

[HUDI-2814] Make Z-index more generic Column-Stats Index (#4106)

This commit is contained in:
Alexey Kudinkin
2021-12-10 14:56:09 -08:00
committed by GitHub
parent 72901a33a1
commit 2d864f7524
23 changed files with 892 additions and 1244 deletions

View File

@@ -24,13 +24,15 @@ import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.EngineType;
import org.apache.hudi.common.util.TypeUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieNotSupportedException;
import javax.annotation.Nonnull;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
/**
@@ -520,11 +522,15 @@ public class HoodieClusteringConfig extends HoodieConfig {
}
/**
* strategy types for build z-ordering/space-filling curves.
* Type of a strategy for building Z-order/Hilbert space-filling curves.
*/
public enum BuildCurveStrategyType {
DIRECT("direct"),
SAMPLE("sample");
private static final Map<String, BuildCurveStrategyType> VALUE_TO_ENUM_MAP =
TypeUtils.getValueToEnumMap(BuildCurveStrategyType.class, e -> e.value);
private final String value;
BuildCurveStrategyType(String value) {
@@ -532,42 +538,39 @@ public class HoodieClusteringConfig extends HoodieConfig {
}
public static BuildCurveStrategyType fromValue(String value) {
switch (value.toLowerCase(Locale.ROOT)) {
case "direct":
return DIRECT;
case "sample":
return SAMPLE;
default:
throw new HoodieException("Invalid value of Type.");
BuildCurveStrategyType enumValue = VALUE_TO_ENUM_MAP.get(value);
if (enumValue == null) {
throw new HoodieException(String.format("Invalid value (%s)", value));
}
return enumValue;
}
}
/**
* strategy types for optimize layout for hudi data.
* Layout optimization strategies such as Z-order/Hilbert space-curves, etc
*/
public enum BuildLayoutOptimizationStrategy {
public enum LayoutOptimizationStrategy {
ZORDER("z-order"),
HILBERT("hilbert");
private static final Map<String, LayoutOptimizationStrategy> VALUE_TO_ENUM_MAP =
TypeUtils.getValueToEnumMap(LayoutOptimizationStrategy.class, e -> e.value);
private final String value;
BuildLayoutOptimizationStrategy(String value) {
LayoutOptimizationStrategy(String value) {
this.value = value;
}
public String toCustomString() {
return value;
}
public static BuildLayoutOptimizationStrategy fromValue(String value) {
switch (value.toLowerCase(Locale.ROOT)) {
case "z-order":
return ZORDER;
case "hilbert":
return HILBERT;
default:
throw new HoodieException("Invalid value of Type.");
@Nonnull
public static LayoutOptimizationStrategy fromValue(String value) {
LayoutOptimizationStrategy enumValue = VALUE_TO_ENUM_MAP.get(value);
if (enumValue == null) {
throw new HoodieException(String.format("Invalid value (%s)", value));
}
return enumValue;
}
}
}

View File

@@ -1,191 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.optimize;
import java.nio.charset.Charset;
/**
 * Helpers for building Z-order keys: converts primitive values into fixed-width
 * 8-byte arrays and interleaves the bits of several such arrays into a single key.
 *
 * <p>The byte arrays are meant to be ordered with
 * {@link #compareTo(byte[], int, int, byte[], int, int)}, which compares bytes as
 * unsigned values.
 */
public class ZOrderingUtil {

  /** Width, in bytes, of the arrays produced by the {@code *To8Byte} helpers. */
  private static final int KEY_WIDTH = 8;

  /** Charset used to encode strings; resolved once instead of on every call. */
  private static final Charset UTF_8 = Charset.forName("utf-8");

  /**
   * Lexicographically compares two byte ranges, treating every byte as unsigned.
   * Copied from HBase.
   *
   * @param buffer1 left operand
   * @param offset1 where to start comparing in the left buffer
   * @param length1 how many bytes to compare from the left buffer
   * @param buffer2 right operand
   * @param offset2 where to start comparing in the right buffer
   * @param length2 how many bytes to compare from the right buffer
   * @return 0 if equal, &lt; 0 if left is less than right, &gt; 0 otherwise; when one
   *         range is a prefix of the other, the shorter range is the smaller one
   */
  public static int compareTo(byte[] buffer1, int offset1, int length1,
                              byte[] buffer2, int offset2, int length2) {
    // Short circuit: the very same range of the very same array.
    if (buffer1 == buffer2 && offset1 == offset2 && length1 == length2) {
      return 0;
    }
    int end1 = offset1 + length1;
    int end2 = offset2 + length2;
    for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) {
      // Mask to 0..255 so that the subtraction below can neither overflow nor see a negative byte.
      int a = buffer1[i] & 0xff;
      int b = buffer2[j] & 0xff;
      if (a != b) {
        return a - b;
      }
    }
    return length1 - length2;
  }

  /**
   * Normalizes an array to exactly 8 bytes.
   *
   * <ul>
   *   <li>length == 8: the input array itself is returned (no copy is made);</li>
   *   <li>length &gt; 8: a new array holding the first 8 bytes is returned;</li>
   *   <li>length &lt; 8: a new array is returned with the input right-aligned and
   *       the leading bytes set to zero.</li>
   * </ul>
   *
   * @param a bytes to normalize
   * @return an 8-byte array as described above
   */
  public static byte[] paddingTo8Byte(byte[] a) {
    if (a.length == KEY_WIDTH) {
      return a;
    }
    // A freshly allocated array is already zero-filled, so no explicit padding loop is needed.
    byte[] result = new byte[KEY_WIDTH];
    if (a.length > KEY_WIDTH) {
      System.arraycopy(a, 0, result, 0, KEY_WIDTH);
    } else {
      System.arraycopy(a, 0, result, KEY_WIDTH - a.length, a.length);
    }
    return result;
  }

  /**
   * Interleaves the bits of several equally sized byte arrays.
   *
   * <p>Takes the first (most significant) bit of the first element, then the first
   * bit of the second element, and so on; then the second bit of every element,
   * all the way down to the last bit of the last element. The bits are written in
   * that order, most significant bit first, into the returned array.
   *
   * @param buffer candidate elements to interleave; the first {@code size} bytes of
   *               every element are read, so each must hold at least that many
   * @param size   number of bytes taken from every element
   * @return a new array of {@code size * buffer.length} bytes holding the interleaved bits
   */
  public static byte[] interleaving(byte[][] buffer, int size) {
    int candidateSize = buffer.length;
    byte[] result = new byte[size * candidateSize];
    int resBitPos = 0;
    int totalBits = size * 8;
    for (int bitStep = 0; bitStep < totalBits; bitStep++) {
      // Integer division/modulo already yield the byte index and the bit index within it.
      int currentBytePos = bitStep / 8;
      int currentBitPos = bitStep % 8;
      for (int i = 0; i < candidateSize; i++) {
        int resBytePos = resBitPos / 8;
        result[resBytePos] = updatePos(result[resBytePos], resBitPos % 8, buffer[i][currentBytePos], currentBitPos);
        resBitPos++;
      }
    }
    return result;
  }

  /**
   * Copies a single bit from {@code b} into {@code a}.
   *
   * <p>Bit positions count from the most significant bit: position 0 is the top
   * bit ({@code 0x80}), position 7 is the lowest bit ({@code 0x01}).
   *
   * @param a    byte to update
   * @param apos position (0..7) of the bit in {@code a} to overwrite
   * @param b    byte to read from
   * @param bpos position (0..7) of the bit in {@code b} to read
   * @return {@code a} with the bit at {@code apos} set to the value of the bit at
   *         {@code bpos} of {@code b}; all other bits of {@code a} are unchanged
   */
  public static byte updatePos(byte a, int apos, byte b, int bpos) {
    // Extract the source bit as 0/1. Masking with 1 discards the sign extension that
    // promoting a negative byte to int introduces; shifting the byte itself (as was done
    // before) smeared the sign bit and cleared an already-set target bit when bpos == 0.
    int bit = (b >> (7 - bpos)) & 1;
    int mask = 1 << (7 - apos);
    return (byte) (bit == 1 ? (a | mask) : (a & ~mask));
  }

  /**
   * Encodes an int as 4 bytes, most significant byte first.
   *
   * @param val value to encode
   * @return a new 4-byte big-endian array
   */
  public static byte[] toBytes(int val) {
    byte[] b = new byte[4];
    for (int i = 3; i > 0; i--) {
      b[i] = (byte) val;
      val >>>= 8;
    }
    b[0] = (byte) val;
    return b;
  }

  /**
   * Encodes a long as 8 bytes, most significant byte first.
   *
   * @param val value to encode
   * @return a new 8-byte big-endian array
   */
  public static byte[] toBytes(long val) {
    long temp = val;
    byte[] b = new byte[8];
    for (int i = 7; i > 0; i--) {
      b[i] = (byte) temp;
      temp >>>= 8;
    }
    b[0] = (byte) temp;
    return b;
  }

  /**
   * Encodes a double as the 8 big-endian bytes of its raw IEEE-754 bit pattern.
   *
   * @param d value to encode
   * @return a new 8-byte array
   */
  public static byte[] toBytes(final double d) {
    return toBytes(Double.doubleToRawLongBits(d));
  }

  /**
   * Encodes an int as 8 bytes whose unsigned byte order matches the signed order of
   * the ints.
   *
   * @param a value to encode
   * @return a new 8-byte array: 4 zero bytes followed by the sign-flipped big-endian value
   */
  public static byte[] intTo8Byte(int a) {
    // Flipping the sign bit maps the signed range onto the unsigned range in order.
    return paddingTo8Byte(toBytes(a ^ (1 << 31)));
  }

  /**
   * Places a single byte, unmodified, in the last position of an otherwise zero
   * 8-byte array.
   *
   * <p>NOTE(review): unlike the int/long variants the sign bit is not flipped, so
   * under the unsigned {@link #compareTo} negative bytes order after non-negative
   * ones - confirm this is intended.
   *
   * @param a value to encode
   * @return a new 8-byte array
   */
  public static byte[] byteTo8Byte(byte a) {
    return paddingTo8Byte(new byte[] { a });
  }

  /**
   * Encodes a long as 8 bytes whose unsigned byte order matches the signed order of
   * the longs.
   *
   * @param a value to encode
   * @return a new 8-byte array holding the sign-flipped big-endian value
   */
  public static byte[] longTo8Byte(long a) {
    return toBytes(a ^ (1L << 63));
  }

  /**
   * Encodes a double as 8 bytes whose unsigned byte order matches the numeric order
   * of the doubles.
   *
   * <p>Values with the sign bit set (negative numbers and {@code -0.0}) have all
   * bits inverted; all other values (including {@code +0.0}) have only the sign bit
   * flipped. Zero therefore sorts between the negative and the positive numbers;
   * previously it was left untransformed and sorted below every negative value.
   *
   * @param a value to encode
   * @return a new 8-byte array
   */
  public static byte[] doubleTo8Byte(double a) {
    long bits = Double.doubleToRawLongBits(a);
    // The raw bits are negative as a long exactly when the IEEE-754 sign bit is set.
    long ordered = bits < 0 ? ~bits : bits ^ (1L << 63);
    return toBytes(ordered);
  }

  /**
   * Encodes a string as 8 bytes taken from its UTF-8 representation, truncated or
   * left-padded with zeros by {@link #paddingTo8Byte(byte[])}.
   *
   * <p>NOTE(review): because short strings are left-padded, the resulting byte order
   * is not the lexicographic order of the strings (e.g. "ab" sorts after "b") -
   * confirm whether callers rely on this.
   *
   * @param a string to encode
   * @return an 8-byte array
   */
  public static byte[] utf8To8Byte(String a) {
    return paddingTo8Byte(a.getBytes(UTF_8));
  }

  /**
   * Converts a string to a long via {@link #utf8To8Byte(String)} and
   * {@link #convertBytesToLong(byte[])}.
   *
   * @param a string to convert
   * @return the big-endian long value of the string's 8-byte encoding
   */
  public static Long convertStringToLong(String a) {
    return convertBytesToLong(utf8To8Byte(a));
  }

  /**
   * Reads a byte array as a big-endian long after normalizing it to 8 bytes with
   * {@link #paddingTo8Byte(byte[])}.
   *
   * @param bytes bytes to convert
   * @return the long whose big-endian representation equals the normalized bytes
   */
  public static long convertBytesToLong(byte[] bytes) {
    byte[] paddedBytes = paddingTo8Byte(bytes);
    long value = 0L;
    for (int i = 0; i < KEY_WIDTH; i++) {
      // Mask to avoid sign extension of negative bytes.
      value = (value << 8) | (paddedBytes[i] & 0xffL);
    }
    return value;
  }
}

View File

@@ -246,7 +246,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
public abstract HoodieWriteMetadata<O> insertOverwriteTable(HoodieEngineContext context, String instantTime, I records);
/**
* Updates Metadata Indexes (like Z-Index)
* Updates Metadata Indexes (like Column Stats index)
* TODO rebase onto metadata table (post RFC-27)
*
* @param context instance of {@link HoodieEngineContext}

View File

@@ -74,13 +74,17 @@ public abstract class PartitionAwareClusteringPlanStrategy<T extends HoodieRecor
return Option.empty();
}
List<HoodieClusteringGroup> clusteringGroups = getEngineContext().flatMap(partitionPaths,
partitionPath -> {
List<FileSlice> fileSlicesEligible = getFileSlicesEligibleForClustering(partitionPath).collect(Collectors.toList());
return buildClusteringGroupsForPartition(partitionPath, fileSlicesEligible).limit(getWriteConfig().getClusteringMaxNumGroups());
},
partitionPaths.size())
.stream().limit(getWriteConfig().getClusteringMaxNumGroups()).collect(Collectors.toList());
List<HoodieClusteringGroup> clusteringGroups = getEngineContext()
.flatMap(
partitionPaths,
partitionPath -> {
List<FileSlice> fileSlicesEligible = getFileSlicesEligibleForClustering(partitionPath).collect(Collectors.toList());
return buildClusteringGroupsForPartition(partitionPath, fileSlicesEligible).limit(getWriteConfig().getClusteringMaxNumGroups());
},
partitionPaths.size())
.stream()
.limit(getWriteConfig().getClusteringMaxNumGroups())
.collect(Collectors.toList());
if (clusteringGroups.isEmpty()) {
LOG.info("No data available to cluster");

View File

@@ -1,154 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.optimize;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
 * Tests for {@link ZOrderingUtil}: verifies that the fixed-width byte encodings
 * sort (under {@link ZOrderingUtil#compareTo}) in the same order as the values
 * they were produced from, and that bytes round-trip to longs.
 */
public class TestZOrderingUtil {

  @Test
  public void testIntConvert() {
    int[] testInt = new int[] {-1, 1, -2, 10000, -100000, 2, Integer.MAX_VALUE, Integer.MIN_VALUE};
    List<OriginValueWrapper<Integer>> valueWrappers = new ArrayList<>();
    List<ConvertResultWrapper> convertResultWrappers = new ArrayList<>();
    for (int i = 0; i < testInt.length; i++) {
      valueWrappers.add(new OriginValueWrapper<>(i, testInt[i]));
      convertResultWrappers.add(new ConvertResultWrapper(i, ZOrderingUtil.intTo8Byte(testInt[i])));
    }
    assertSameOrdering(valueWrappers, convertResultWrappers);
  }

  @Test
  public void testLongConvert() {
    long[] testLong = new long[] {-1L, 1L, -2L, 10000L, -100000L, 2L, Long.MAX_VALUE, Long.MIN_VALUE};
    List<OriginValueWrapper<Long>> valueWrappers = new ArrayList<>();
    List<ConvertResultWrapper> convertResultWrappers = new ArrayList<>();
    for (int i = 0; i < testLong.length; i++) {
      valueWrappers.add(new OriginValueWrapper<>(i, testLong[i]));
      convertResultWrappers.add(new ConvertResultWrapper(i, ZOrderingUtil.longTo8Byte(testLong[i])));
    }
    assertSameOrdering(valueWrappers, convertResultWrappers);
  }

  @Test
  public void testDoubleConvert() {
    double[] testDouble = new double[] {-1.00d, 1.05d, -2.3d, 10000.002d, -100000.7d, 2.9d, Double.MAX_VALUE};
    List<OriginValueWrapper<Double>> valueWrappers = new ArrayList<>();
    List<ConvertResultWrapper> convertResultWrappers = new ArrayList<>();
    for (int i = 0; i < testDouble.length; i++) {
      valueWrappers.add(new OriginValueWrapper<>(i, testDouble[i]));
      convertResultWrappers.add(new ConvertResultWrapper(i, ZOrderingUtil.doubleTo8Byte(testDouble[i])));
    }
    assertSameOrdering(valueWrappers, convertResultWrappers);
  }

  @Test
  public void testFloatConvert() {
    float[] testFloat = new float[] {-1.00f, 1.05f, -2.3f, 10000.002f, -100000.7f, 2.9f, Float.MAX_VALUE, Float.MIN_VALUE};
    List<OriginValueWrapper<Float>> valueWrappers = new ArrayList<>();
    List<ConvertResultWrapper> convertResultWrappers = new ArrayList<>();
    for (int i = 0; i < testFloat.length; i++) {
      valueWrappers.add(new OriginValueWrapper<>(i, testFloat[i]));
      // Floats are encoded through the double path after widening.
      convertResultWrappers.add(new ConvertResultWrapper(i, ZOrderingUtil.doubleTo8Byte((double) testFloat[i])));
    }
    assertSameOrdering(valueWrappers, convertResultWrappers);
  }

  @Test
  public void testConvertBytesToLong() {
    long[] tests = new long[] {Long.MIN_VALUE, -1L, 0, 1L, Long.MAX_VALUE};
    for (int i = 0; i < tests.length; i++) {
      assertEquals(tests[i], ZOrderingUtil.convertBytesToLong(convertLongToBytes(tests[i])));
    }
  }

  @Test
  public void testConvertBytesToLongWithPadding() {
    byte[] bytes = new byte[2];
    bytes[0] = 2;
    bytes[1] = 127;
    assertEquals(2L * 256 + 127, ZOrderingUtil.convertBytesToLong(bytes));
  }

  /**
   * Sorts the original values by natural order and the encoded values by unsigned
   * byte order, then asserts that both lists end up with the same sequence of
   * input positions.
   */
  private static <T extends Comparable<T>> void assertSameOrdering(
      List<OriginValueWrapper<T>> valueWrappers, List<ConvertResultWrapper> convertResultWrappers) {
    Collections.sort(valueWrappers, (o1, o2) -> o1.originValue.compareTo(o2.originValue));
    Collections.sort(convertResultWrappers,
        (o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length));
    assertEquals(valueWrappers.size(), convertResultWrappers.size());
    for (int i = 0; i < valueWrappers.size(); i++) {
      assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index);
    }
  }

  /** Big-endian encoding of a long, written independently of the class under test. */
  private static byte[] convertLongToBytes(long num) {
    byte[] byteNum = new byte[8];
    for (int i = 0; i < 8; i++) {
      int offset = 64 - (i + 1) * 8;
      byteNum[i] = (byte) ((num >> offset) & 0xff);
    }
    return byteNum;
  }

  /** Encoded bytes of a test value together with its position in the input array. */
  private static final class ConvertResultWrapper {
    final int index;
    final byte[] result;

    ConvertResultWrapper(int index, byte[] result) {
      this.index = index;
      this.result = result;
    }
  }

  /** A test value together with its position in the input array. */
  private static final class OriginValueWrapper<T> {
    final int index;
    final T originValue;

    OriginValueWrapper(int index, T originValue) {
      this.index = index;
      this.originValue = originValue;
    }
  }
}