[HUDI-2814] Make Z-index more generic Column-Stats Index (#4106)
This commit is contained in:
@@ -24,13 +24,15 @@ import org.apache.hudi.common.config.ConfigProperty;
|
||||
import org.apache.hudi.common.config.HoodieConfig;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.engine.EngineType;
|
||||
import org.apache.hudi.common.util.TypeUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieNotSupportedException;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
|
||||
/**
|
||||
@@ -520,11 +522,15 @@ public class HoodieClusteringConfig extends HoodieConfig {
|
||||
}
|
||||
|
||||
/**
|
||||
* strategy types for build z-ordering/space-filling curves.
|
||||
* Type of a strategy for building Z-order/Hilbert space-filling curves.
|
||||
*/
|
||||
public enum BuildCurveStrategyType {
|
||||
DIRECT("direct"),
|
||||
SAMPLE("sample");
|
||||
|
||||
private static final Map<String, BuildCurveStrategyType> VALUE_TO_ENUM_MAP =
|
||||
TypeUtils.getValueToEnumMap(BuildCurveStrategyType.class, e -> e.value);
|
||||
|
||||
private final String value;
|
||||
|
||||
BuildCurveStrategyType(String value) {
|
||||
@@ -532,42 +538,39 @@ public class HoodieClusteringConfig extends HoodieConfig {
|
||||
}
|
||||
|
||||
public static BuildCurveStrategyType fromValue(String value) {
|
||||
switch (value.toLowerCase(Locale.ROOT)) {
|
||||
case "direct":
|
||||
return DIRECT;
|
||||
case "sample":
|
||||
return SAMPLE;
|
||||
default:
|
||||
throw new HoodieException("Invalid value of Type.");
|
||||
BuildCurveStrategyType enumValue = VALUE_TO_ENUM_MAP.get(value);
|
||||
if (enumValue == null) {
|
||||
throw new HoodieException(String.format("Invalid value (%s)", value));
|
||||
}
|
||||
|
||||
return enumValue;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* strategy types for optimize layout for hudi data.
|
||||
* Layout optimization strategies such as Z-order/Hilbert space-curves, etc
|
||||
*/
|
||||
public enum BuildLayoutOptimizationStrategy {
|
||||
public enum LayoutOptimizationStrategy {
|
||||
ZORDER("z-order"),
|
||||
HILBERT("hilbert");
|
||||
|
||||
private static final Map<String, LayoutOptimizationStrategy> VALUE_TO_ENUM_MAP =
|
||||
TypeUtils.getValueToEnumMap(LayoutOptimizationStrategy.class, e -> e.value);
|
||||
|
||||
private final String value;
|
||||
|
||||
BuildLayoutOptimizationStrategy(String value) {
|
||||
LayoutOptimizationStrategy(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public String toCustomString() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public static BuildLayoutOptimizationStrategy fromValue(String value) {
|
||||
switch (value.toLowerCase(Locale.ROOT)) {
|
||||
case "z-order":
|
||||
return ZORDER;
|
||||
case "hilbert":
|
||||
return HILBERT;
|
||||
default:
|
||||
throw new HoodieException("Invalid value of Type.");
|
||||
@Nonnull
|
||||
public static LayoutOptimizationStrategy fromValue(String value) {
|
||||
LayoutOptimizationStrategy enumValue = VALUE_TO_ENUM_MAP.get(value);
|
||||
if (enumValue == null) {
|
||||
throw new HoodieException(String.format("Invalid value (%s)", value));
|
||||
}
|
||||
|
||||
return enumValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,191 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.optimize;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
public class ZOrderingUtil {

  /**
   * Lexicographically compares two byte ranges, treating every byte as an unsigned value.
   * Adapted from HBase.
   *
   * @param buffer1 left operand
   * @param offset1 where to start comparing in the left buffer
   * @param length1 how many bytes to compare from the left buffer
   * @param buffer2 right operand
   * @param offset2 where to start comparing in the right buffer
   * @param length2 how many bytes to compare from the right buffer
   * @return 0 if equal, a negative value if left is less than right, a positive value otherwise
   */
  public static int compareTo(byte[] buffer1, int offset1, int length1,
                              byte[] buffer2, int offset2, int length2) {
    // Short circuit: the very same range of the very same array
    if (buffer1 == buffer2
        && offset1 == offset2
        && length1 == length2) {
      return 0;
    }
    int end1 = offset1 + length1;
    int end2 = offset2 + length2;
    for (int i = offset1, j = offset2; i < end1 && j < end2; i++, j++) {
      // Mask to 0..255 so that bytes compare as unsigned; the difference cannot overflow
      int a = (buffer1[i] & 0xff);
      int b = (buffer2[j] & 0xff);
      if (a != b) {
        return a - b;
      }
    }
    // Common prefix is identical: the shorter range sorts first
    return length1 - length2;
  }

  /**
   * Normalizes a byte array to exactly 8 bytes.
   *
   * <p>An 8-byte input is returned as is (same instance, no copy), a longer input is truncated
   * to its first 8 bytes, and a shorter input is left-padded with zero bytes.
   *
   * @param a bytes to normalize
   * @return an 8-byte array
   */
  public static byte[] paddingTo8Byte(byte[] a) {
    if (a.length == 8) {
      return a;
    }
    // A freshly allocated array is already zero-filled, so no explicit padding loop is needed
    byte[] result = new byte[8];
    if (a.length > 8) {
      System.arraycopy(a, 0, result, 0, 8);
    } else {
      System.arraycopy(a, 0, result, 8 - a.length, a.length);
    }
    return result;
  }

  /**
   * Interleaves the bits of the given byte arrays.
   *
   * <p>Interleaving takes the first (most significant) bit of the first element, then the first
   * bit of the next element, and so on; then the second bit of the first element, the second bit
   * of the second element, all the way to the last bit of the last element. The bits are written
   * to the result in that order.
   *
   * @param buffer candidate elements to interleave; each must hold at least {@code size} bytes
   * @param size   number of bytes read from each candidate element
   * @return the interleaved bits as an array of {@code size * buffer.length} bytes
   */
  public static byte[] interleaving(byte[][] buffer, int size) {
    int candidateSize = buffer.length;
    byte[] result = new byte[size * candidateSize];
    int resBitPos = 0;
    int totalBits = size * 8;
    for (int bitStep = 0; bitStep < totalBits; bitStep++) {
      int currentBytePos = bitStep / 8;
      int currentBitPos = bitStep % 8;

      for (int i = 0; i < candidateSize; i++) {
        int tempResBytePos = resBitPos / 8;
        int tempResBitPos = resBitPos % 8;
        result[tempResBytePos] = updatePos(result[tempResBytePos], tempResBitPos, buffer[i][currentBytePos], currentBitPos);
        resBitPos++;
      }
    }
    return result;
  }

  /**
   * Copies a single bit of {@code b} into {@code a}. Bit positions are counted from the most
   * significant bit (position 0) to the least significant bit (position 7).
   *
   * @param a    byte to update
   * @param apos position of the bit of {@code a} to overwrite
   * @param b    byte to read the bit from
   * @param bpos position of the bit of {@code b} to read
   * @return {@code a} with the bit at {@code apos} set to the value of bit {@code bpos} of {@code b}
   */
  public static byte updatePos(byte a, int apos, byte b, int bpos) {
    int targetMask = 1 << (7 - apos);
    // Test the source bit as a boolean instead of shifting it into place: shifting a byte that
    // has its top bit set to the right sign-extends and would corrupt the comparison
    boolean sourceBitSet = (b & (1 << (7 - bpos))) != 0;
    return (byte) (sourceBitSet ? (a | targetMask) : (a & ~targetMask));
  }

  /**
   * Serializes an int into 4 bytes, most significant byte first.
   *
   * @param val value to serialize
   * @return 4-byte big-endian representation of {@code val}
   */
  public static byte[] toBytes(int val) {
    byte[] b = new byte[4];
    for (int i = 3; i > 0; i--) {
      b[i] = (byte) val;
      val >>>= 8;
    }
    b[0] = (byte) val;
    return b;
  }

  /**
   * Serializes a long into 8 bytes, most significant byte first.
   *
   * @param val value to serialize
   * @return 8-byte big-endian representation of {@code val}
   */
  public static byte[] toBytes(long val) {
    long temp = val;
    byte[] b = new byte[8];
    for (int i = 7; i > 0; i--) {
      b[i] = (byte) temp;
      temp >>>= 8;
    }
    b[0] = (byte) temp;
    return b;
  }

  /**
   * Serializes the raw IEEE 754 bit pattern of a double into 8 bytes, most significant byte first.
   *
   * @param d value to serialize
   * @return 8-byte big-endian representation of the raw bits of {@code d}
   */
  public static byte[] toBytes(final double d) {
    return toBytes(Double.doubleToRawLongBits(d));
  }

  /**
   * Encodes an int into 8 bytes such that the unsigned lexicographic order of the encodings
   * matches the numeric order of the ints.
   *
   * @param a value to encode
   * @return 8-byte encoding: 4 zero bytes followed by the sign-flipped big-endian int
   */
  public static byte[] intTo8Byte(int a) {
    // Flipping the sign bit maps the signed range onto the unsigned range in the same order
    int temp = a ^ (1 << 31);
    return paddingTo8Byte(toBytes(temp));
  }

  /**
   * Widens a single byte to 8 bytes by left-padding it with zero bytes. The byte itself is
   * stored unchanged (no sign-bit flip is applied).
   *
   * @param a value to encode
   * @return 8-byte array whose last byte is {@code a}
   */
  public static byte[] byteTo8Byte(byte a) {
    return paddingTo8Byte(new byte[] { a });
  }

  /**
   * Encodes a long into 8 bytes such that the unsigned lexicographic order of the encodings
   * matches the numeric order of the longs.
   *
   * @param a value to encode
   * @return sign-flipped big-endian representation of {@code a}
   */
  public static byte[] longTo8Byte(long a) {
    // Flipping the sign bit maps the signed range onto the unsigned range in the same order
    long temp = a ^ (1L << 63);
    return toBytes(temp);
  }

  /**
   * Encodes a double into 8 bytes such that the unsigned lexicographic order of the encodings
   * matches the numeric order of the doubles.
   *
   * <p>Values with the sign bit set have all their bits inverted (which reverses their order and
   * places them below everything else); all other values only have their sign bit flipped.
   * The decision is taken on the sign bit rather than on a comparison with zero, so that
   * {@code 0.0} is placed between the negative and the positive values and {@code -0.0} sorts
   * right before {@code 0.0}.
   *
   * @param a value to encode
   * @return 8-byte order-preserving encoding of {@code a}
   */
  public static byte[] doubleTo8Byte(double a) {
    long bits = Double.doubleToRawLongBits(a);
    // The sign bit of the IEEE 754 pattern is the sign bit of the long holding it
    long ordered = bits < 0 ? ~bits : bits ^ Long.MIN_VALUE;
    return toBytes(ordered);
  }

  /**
   * Converts the UTF-8 bytes of a string to exactly 8 bytes using {@link #paddingTo8Byte(byte[])}:
   * longer strings are truncated to their first 8 bytes, shorter ones are left-padded with zeros.
   *
   * @param a string to encode
   * @return 8-byte array derived from the UTF-8 encoding of {@code a}
   */
  public static byte[] utf8To8Byte(String a) {
    return paddingTo8Byte(a.getBytes(Charset.forName("utf-8")));
  }

  /**
   * Converts a string to a long built from the 8-byte form of its UTF-8 encoding.
   *
   * @param a string to convert
   * @return big-endian long read from {@link #utf8To8Byte(String)}
   */
  public static Long convertStringToLong(String a) {
    byte[] bytes = utf8To8Byte(a);
    return convertBytesToLong(bytes);
  }

  /**
   * Reads a big-endian long from the given bytes after normalizing them to 8 bytes with
   * {@link #paddingTo8Byte(byte[])}.
   *
   * @param bytes bytes to convert
   * @return long whose most significant byte is the first of the 8 normalized bytes
   */
  public static long convertBytesToLong(byte[] bytes) {
    byte[] paddedBytes = paddingTo8Byte(bytes);
    long temp = 0L;
    for (int i = 7; i >= 0; i--) {
      // Byte i contributes at bit offset (7 - i) * 8; mask it to avoid sign extension
      temp = temp | (((long) paddedBytes[i] & 0xff) << (7 - i) * 8);
    }
    return temp;
  }
}
|
||||
|
||||
@@ -246,7 +246,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
|
||||
public abstract HoodieWriteMetadata<O> insertOverwriteTable(HoodieEngineContext context, String instantTime, I records);
|
||||
|
||||
/**
|
||||
* Updates Metadata Indexes (like Z-Index)
|
||||
* Updates Metadata Indexes (like Column Stats index)
|
||||
* TODO rebase onto metadata table (post RFC-27)
|
||||
*
|
||||
* @param context instance of {@link HoodieEngineContext}
|
||||
|
||||
@@ -74,13 +74,17 @@ public abstract class PartitionAwareClusteringPlanStrategy<T extends HoodieRecor
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
List<HoodieClusteringGroup> clusteringGroups = getEngineContext().flatMap(partitionPaths,
|
||||
partitionPath -> {
|
||||
List<FileSlice> fileSlicesEligible = getFileSlicesEligibleForClustering(partitionPath).collect(Collectors.toList());
|
||||
return buildClusteringGroupsForPartition(partitionPath, fileSlicesEligible).limit(getWriteConfig().getClusteringMaxNumGroups());
|
||||
},
|
||||
partitionPaths.size())
|
||||
.stream().limit(getWriteConfig().getClusteringMaxNumGroups()).collect(Collectors.toList());
|
||||
List<HoodieClusteringGroup> clusteringGroups = getEngineContext()
|
||||
.flatMap(
|
||||
partitionPaths,
|
||||
partitionPath -> {
|
||||
List<FileSlice> fileSlicesEligible = getFileSlicesEligibleForClustering(partitionPath).collect(Collectors.toList());
|
||||
return buildClusteringGroupsForPartition(partitionPath, fileSlicesEligible).limit(getWriteConfig().getClusteringMaxNumGroups());
|
||||
},
|
||||
partitionPaths.size())
|
||||
.stream()
|
||||
.limit(getWriteConfig().getClusteringMaxNumGroups())
|
||||
.collect(Collectors.toList());
|
||||
|
||||
if (clusteringGroups.isEmpty()) {
|
||||
LOG.info("No data available to cluster");
|
||||
|
||||
@@ -1,154 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.optimize;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
public class TestZOrderingUtil {
|
||||
|
||||
@Test
|
||||
public void testIntConvert() {
|
||||
// test Int
|
||||
int[] testInt = new int[] {-1, 1, -2, 10000, -100000, 2, Integer.MAX_VALUE, Integer.MIN_VALUE};
|
||||
List<OrginValueWrapper<Integer>> valueWrappers = new ArrayList<>();
|
||||
List<ConvertResultWrapper<Integer>> convertResultWrappers = new ArrayList<>();
|
||||
for (int i = 0; i < testInt.length; i++) {
|
||||
valueWrappers.add(new OrginValueWrapper<>(i, testInt[i]));
|
||||
convertResultWrappers.add(new ConvertResultWrapper<>(i, ZOrderingUtil.intTo8Byte(testInt[i])));
|
||||
}
|
||||
|
||||
Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue)));
|
||||
|
||||
Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length)));
|
||||
|
||||
for (int i = 0; i < testInt.length; i++) {
|
||||
assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLongConvert() {
|
||||
// test Long
|
||||
long[] testLong = new long[] {-1L, 1L, -2L, 10000L, -100000L, 2L, Long.MAX_VALUE, Long.MIN_VALUE};
|
||||
List<OrginValueWrapper<Long>> valueWrappers = new ArrayList<>();
|
||||
List<ConvertResultWrapper<Long>> convertResultWrappers = new ArrayList<>();
|
||||
for (int i = 0; i < testLong.length; i++) {
|
||||
valueWrappers.add(new OrginValueWrapper<>((long)i, testLong[i]));
|
||||
convertResultWrappers.add(new ConvertResultWrapper<>((long)i, ZOrderingUtil.longTo8Byte(testLong[i])));
|
||||
}
|
||||
|
||||
Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue)));
|
||||
|
||||
Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length)));
|
||||
|
||||
for (int i = 0; i < testLong.length; i++) {
|
||||
assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDoubleConvert() {
|
||||
// test Long
|
||||
double[] testDouble = new double[] {-1.00d, 1.05d, -2.3d, 10000.002d, -100000.7d, 2.9d, Double.MAX_VALUE};
|
||||
List<OrginValueWrapper<Double>> valueWrappers = new ArrayList<>();
|
||||
List<ConvertResultWrapper<Double>> convertResultWrappers = new ArrayList<>();
|
||||
for (int i = 0; i < testDouble.length; i++) {
|
||||
valueWrappers.add(new OrginValueWrapper<>((Double)(i * 1.0), testDouble[i]));
|
||||
convertResultWrappers.add(new ConvertResultWrapper<>((Double)(i * 1.0), ZOrderingUtil.doubleTo8Byte(testDouble[i])));
|
||||
}
|
||||
|
||||
Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue)));
|
||||
|
||||
Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length)));
|
||||
|
||||
for (int i = 0; i < testDouble.length; i++) {
|
||||
assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFloatConvert() {
|
||||
// test Long
|
||||
float[] testDouble = new float[] {-1.00f, 1.05f, -2.3f, 10000.002f, -100000.7f, 2.9f, Float.MAX_VALUE, Float.MIN_VALUE};
|
||||
List<OrginValueWrapper<Float>> valueWrappers = new ArrayList<>();
|
||||
List<ConvertResultWrapper<Float>> convertResultWrappers = new ArrayList<>();
|
||||
for (int i = 0; i < testDouble.length; i++) {
|
||||
valueWrappers.add(new OrginValueWrapper<>((float)(i * 1.0), testDouble[i]));
|
||||
convertResultWrappers.add(new ConvertResultWrapper<>((float)(i * 1.0), ZOrderingUtil.doubleTo8Byte((double) testDouble[i])));
|
||||
}
|
||||
|
||||
Collections.sort(valueWrappers, ((o1, o2) -> o1.originValue.compareTo(o2.originValue)));
|
||||
|
||||
Collections.sort(convertResultWrappers, ((o1, o2) -> ZOrderingUtil.compareTo(o1.result, 0, o1.result.length, o2.result, 0, o2.result.length)));
|
||||
|
||||
for (int i = 0; i < testDouble.length; i++) {
|
||||
assertEquals(valueWrappers.get(i).index, convertResultWrappers.get(i).index);
|
||||
}
|
||||
}
|
||||
|
||||
private class ConvertResultWrapper<T> {
|
||||
T index;
|
||||
byte[] result;
|
||||
public ConvertResultWrapper(T index, byte[] result) {
|
||||
this.index = index;
|
||||
this.result = result;
|
||||
}
|
||||
}
|
||||
|
||||
private class OrginValueWrapper<T> {
|
||||
T index;
|
||||
T originValue;
|
||||
public OrginValueWrapper(T index, T originValue) {
|
||||
this.index = index;
|
||||
this.originValue = originValue;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConvertBytesToLong() {
|
||||
long[] tests = new long[] {Long.MIN_VALUE, -1L, 0, 1L, Long.MAX_VALUE};
|
||||
for (int i = 0; i < tests.length; i++) {
|
||||
assertEquals(ZOrderingUtil.convertBytesToLong(convertLongToBytes(tests[i])), tests[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testConvertBytesToLongWithPadding() {
|
||||
byte[] bytes = new byte[2];
|
||||
bytes[0] = 2;
|
||||
bytes[1] = 127;
|
||||
assertEquals(ZOrderingUtil.convertBytesToLong(bytes), 2 * 256 + 127);
|
||||
}
|
||||
|
||||
private byte[] convertLongToBytes(long num) {
|
||||
byte[] byteNum = new byte[8];
|
||||
for (int i = 0; i < 8; i++) {
|
||||
int offset = 64 - (i + 1) * 8;
|
||||
byteNum[i] = (byte) ((num >> offset) & 0xff);
|
||||
}
|
||||
return byteNum;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user