1
0

[HUDI-2101][RFC-28] support z-order for hudi (#3330)

* [HUDI-2101] Support z-order for Hudi

* Renaming some configs for consistency/simplicity.

* Minor code cleanups

Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
xiarixiaoyao
2021-11-03 00:31:57 +08:00
committed by GitHub
parent f9bc3e03e5
commit d194643b49
22 changed files with 2140 additions and 10 deletions

View File

@@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.model;
import org.apache.parquet.schema.PrimitiveStringifier;
import java.util.Objects;
/**
 * Immutable holder for the value range of one column within one file:
 * the minimum value, the maximum value and the number of nulls, together
 * with the {@link PrimitiveStringifier} supplied by the creator.
 *
 * <p>Equality and hashing are based on the file path, column name, min value,
 * max value and null count. The stringifier is deliberately left out of both.
 *
 * @param <T> type of the min/max values
 */
public class HoodieColumnRangeMetadata<T> {
  private final String filePath;
  private final String columnName;
  private final T minValue;
  private final T maxValue;
  private final long numNulls;
  private final PrimitiveStringifier stringifier;

  /**
   * @param filePath    file the statistics belong to
   * @param columnName  column the statistics belong to
   * @param minValue    smallest value of the column, may be {@code null}
   * @param maxValue    largest value of the column, may be {@code null}
   * @param numNulls    number of null values of the column
   * @param stringifier stringifier associated with the column values
   */
  public HoodieColumnRangeMetadata(final String filePath, final String columnName, final T minValue, final T maxValue, final long numNulls, final PrimitiveStringifier stringifier) {
    this.filePath = filePath;
    this.columnName = columnName;
    this.minValue = minValue;
    this.maxValue = maxValue;
    this.numNulls = numNulls;
    this.stringifier = stringifier;
  }

  /** @return file the statistics belong to */
  public String getFilePath() {
    return this.filePath;
  }

  /** @return column the statistics belong to */
  public String getColumnName() {
    return this.columnName;
  }

  /** @return smallest value of the column, possibly {@code null} */
  public T getMinValue() {
    return this.minValue;
  }

  /** @return largest value of the column, possibly {@code null} */
  public T getMaxValue() {
    return this.maxValue;
  }

  /** @return stringifier passed in at construction time */
  public PrimitiveStringifier getStringifier() {
    return stringifier;
  }

  /** @return number of null values of the column */
  public long getNumNulls() {
    return numNulls;
  }

  @Override
  public boolean equals(final Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    final HoodieColumnRangeMetadata<?> that = (HoodieColumnRangeMetadata<?>) o;
    // numNulls is a primitive: compare it directly instead of boxing through Objects.equals.
    return getNumNulls() == that.getNumNulls()
        && Objects.equals(getFilePath(), that.getFilePath())
        && Objects.equals(getColumnName(), that.getColumnName())
        && Objects.equals(getMinValue(), that.getMinValue())
        && Objects.equals(getMaxValue(), that.getMaxValue());
  }

  @Override
  public int hashCode() {
    // Hash exactly the fields compared in equals(), including filePath, so that
    // ranges of the same column in different files do not systematically collide.
    return Objects.hash(getFilePath(), getColumnName(), getMinValue(), getMaxValue(), getNumNulls());
  }

  @Override
  public String toString() {
    return "HoodieColumnRangeMetadata{"
        + "filePath='" + filePath + '\''
        + ", columnName='" + columnName + '\''
        + ", minValue=" + minValue
        + ", maxValue=" + maxValue
        + ", numNulls=" + numNulls + '}';
  }
}

View File

@@ -79,6 +79,7 @@ public class HoodieTableMetaClient implements Serializable {
// ".aux" folder nested under the meta folder.
public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".aux";
// ".bootstrap" folder nested under the auxiliary folder.
public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap";
// ".heartbeat" folder nested under the meta folder.
public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat";
// Bare folder name of the z-index; unlike the constants above it carries no parent prefix
// and is resolved against the meta path by getZindexPath().
public static final String ZINDEX_NAME = ".zindex";
// ".partitions" folder nested under the bootstrap index root folder.
public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH
+ Path.SEPARATOR + ".partitions";
public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR
@@ -176,6 +177,13 @@ public class HoodieTableMetaClient implements Serializable {
return metaPath;
}
/**
 * Resolves {@link #ZINDEX_NAME} against the meta path.
 *
 * @return string form of the resulting z-index path
 */
public String getZindexPath() {
  final Path zindexLocation = new Path(metaPath, ZINDEX_NAME);
  return zindexLocation.toString();
}
/**
* @return Temp Folder path
*/

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.common.util;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.exception.HoodieIOException;
@@ -41,12 +42,14 @@ import org.apache.parquet.schema.MessageType;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* Utility functions involving with parquet.
@@ -277,4 +280,59 @@ public class ParquetUtils extends BaseFileUtils {
return candidateKeys.contains(recordKey);
}
}
/**
 * Parse min/max statistics stored in the parquet footer for the requested columns.
 *
 * <p>Statistics are read per row group (block) and then merged, so the result
 * holds one file-level range per requested column that is present in the file.
 *
 * @param conf            configuration used to read the parquet metadata
 * @param parquetFilePath parquet file whose footer is read; only its file name is
 *                        recorded in the returned ranges
 * @param cols            dot-separated paths of the columns to collect statistics for
 * @return file-level column ranges, one entry per matched column
 */
public Collection<HoodieColumnRangeMetadata<Comparable>> readRangeFromParquetMetadata(Configuration conf, Path parquetFilePath, List<String> cols) {
  ParquetMetadata metadata = readMetadata(conf, parquetFilePath);
  // Hash-based lookup: the membership test runs once per column chunk of every block,
  // so avoid a linear scan of the requested column list each time.
  final Set<String> requestedColumns = new HashSet<>(cols);
  final String fileName = parquetFilePath.getName();
  // collect stats from all parquet blocks
  Map<String, List<HoodieColumnRangeMetadata<Comparable>>> columnToStatsListMap = metadata.getBlocks().stream()
      .flatMap(blockMetaData -> blockMetaData.getColumns().stream())
      .filter(columnChunkMetaData -> requestedColumns.contains(columnChunkMetaData.getPath().toDotString()))
      .map(columnChunkMetaData -> new HoodieColumnRangeMetadata<Comparable>(fileName,
          columnChunkMetaData.getPath().toDotString(),
          columnChunkMetaData.getStatistics().genericGetMin(),
          columnChunkMetaData.getStatistics().genericGetMax(),
          columnChunkMetaData.getStatistics().getNumNulls(),
          columnChunkMetaData.getPrimitiveType().stringifier()))
      .collect(Collectors.groupingBy(range -> range.getColumnName()));

  // we only intend to keep file level statistics.
  // Collect straight into an ArrayList instead of copying an intermediate list.
  return columnToStatsListMap.values().stream()
      .map(blocks -> getColumnRangeInFile(blocks))
      .collect(Collectors.toCollection(ArrayList::new));
}
/**
 * Collapses the per-block ranges of one column into a single file-level range.
 *
 * @param blockRanges ranges of the same column, one per parquet block
 * @return the only range when there is exactly one block, otherwise the
 *         pairwise combination of all block ranges
 */
private HoodieColumnRangeMetadata<Comparable> getColumnRangeInFile(final List<HoodieColumnRangeMetadata<Comparable>> blockRanges) {
  if (blockRanges.size() != 1) {
    // several blocks: fold them into min(block_mins) / max(block_maxs)
    return blockRanges.stream().reduce(this::combineRanges).get();
  }
  // single block: its range already is the file-level range
  return blockRanges.get(0);
}
/**
 * Merges two ranges of the same column into one.
 *
 * <p>The merged minimum is the smaller of the two minimums and the merged maximum
 * the larger of the two maximums; a {@code null} bound on one side yields the
 * bound of the other side. Null counts are added up. File path, column name and
 * stringifier are taken from {@code range1}.
 *
 * @param range1 first range
 * @param range2 second range
 * @return new range covering both inputs
 */
private HoodieColumnRangeMetadata<Comparable> combineRanges(HoodieColumnRangeMetadata<Comparable> range1,
                                                            HoodieColumnRangeMetadata<Comparable> range2) {
  final Comparable leftMin = range1.getMinValue();
  final Comparable rightMin = range2.getMinValue();
  final Comparable minValue;
  if (leftMin == null) {
    minValue = rightMin;
  } else if (rightMin == null) {
    minValue = leftMin;
  } else {
    // on ties the right-hand value is kept
    minValue = leftMin.compareTo(rightMin) < 0 ? leftMin : rightMin;
  }

  final Comparable leftMax = range1.getMaxValue();
  final Comparable rightMax = range2.getMaxValue();
  final Comparable maxValue;
  if (leftMax == null) {
    maxValue = rightMax;
  } else if (rightMax == null) {
    maxValue = leftMax;
  } else {
    // on ties the left-hand value is kept
    maxValue = leftMax.compareTo(rightMax) < 0 ? rightMax : leftMax;
  }

  final long totalNulls = range1.getNumNulls() + range2.getNumNulls();
  return new HoodieColumnRangeMetadata<>(range1.getFilePath(),
      range1.getColumnName(), minValue, maxValue, totalNulls, range1.getStringifier());
}
}