[HUDI-2101][RFC-28] support z-order for hudi (#3330)
* [HUDI-2101]support z-order for hudi * Renaming some configs for consistency/simplicity. * Minor code cleanups Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
@@ -0,0 +1,99 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.model;
|
||||
|
||||
import org.apache.parquet.schema.PrimitiveStringifier;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Hoodie column range metadata: the min value, max value and null count of a single column within a single file.
|
||||
*/
|
||||
public class HoodieColumnRangeMetadata<T> {
|
||||
private final String filePath;
|
||||
private final String columnName;
|
||||
private final T minValue;
|
||||
private final T maxValue;
|
||||
private final long numNulls;
|
||||
private final PrimitiveStringifier stringifier;
|
||||
|
||||
public HoodieColumnRangeMetadata(final String filePath, final String columnName, final T minValue, final T maxValue, final long numNulls, final PrimitiveStringifier stringifier) {
|
||||
this.filePath = filePath;
|
||||
this.columnName = columnName;
|
||||
this.minValue = minValue;
|
||||
this.maxValue = maxValue;
|
||||
this.numNulls = numNulls;
|
||||
this.stringifier = stringifier;
|
||||
}
|
||||
|
||||
public String getFilePath() {
|
||||
return this.filePath;
|
||||
}
|
||||
|
||||
public String getColumnName() {
|
||||
return this.columnName;
|
||||
}
|
||||
|
||||
public T getMinValue() {
|
||||
return this.minValue;
|
||||
}
|
||||
|
||||
public T getMaxValue() {
|
||||
return this.maxValue;
|
||||
}
|
||||
|
||||
public PrimitiveStringifier getStringifier() {
|
||||
return stringifier;
|
||||
}
|
||||
|
||||
public long getNumNulls() {
|
||||
return numNulls;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object o) {
|
||||
if (this == o) {
|
||||
return true;
|
||||
}
|
||||
if (o == null || getClass() != o.getClass()) {
|
||||
return false;
|
||||
}
|
||||
final HoodieColumnRangeMetadata<?> that = (HoodieColumnRangeMetadata<?>) o;
|
||||
return Objects.equals(getFilePath(), that.getFilePath())
|
||||
&& Objects.equals(getColumnName(), that.getColumnName())
|
||||
&& Objects.equals(getMinValue(), that.getMinValue())
|
||||
&& Objects.equals(getMaxValue(), that.getMaxValue())
|
||||
&& Objects.equals(getNumNulls(), that.getNumNulls());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(getColumnName(), getMinValue(), getMaxValue(), getNumNulls());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "HoodieColumnRangeMetadata{"
|
||||
+ "filePath ='" + filePath + '\''
|
||||
+ "columnName='" + columnName + '\''
|
||||
+ ", minValue=" + minValue
|
||||
+ ", maxValue=" + maxValue
|
||||
+ ", numNulls=" + numNulls + '}';
|
||||
}
|
||||
}
|
||||
@@ -79,6 +79,7 @@ public class HoodieTableMetaClient implements Serializable {
|
||||
public static final String AUXILIARYFOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".aux";
|
||||
public static final String BOOTSTRAP_INDEX_ROOT_FOLDER_PATH = AUXILIARYFOLDER_NAME + Path.SEPARATOR + ".bootstrap";
|
||||
public static final String HEARTBEAT_FOLDER_NAME = METAFOLDER_NAME + Path.SEPARATOR + ".heartbeat";
|
||||
// Name of the z-index location; getZindexPath() resolves it as a child of metaPath.
public static final String ZINDEX_NAME = ".zindex";
|
||||
public static final String BOOTSTRAP_INDEX_BY_PARTITION_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH
|
||||
+ Path.SEPARATOR + ".partitions";
|
||||
public static final String BOOTSTRAP_INDEX_BY_FILE_ID_FOLDER_PATH = BOOTSTRAP_INDEX_ROOT_FOLDER_PATH + Path.SEPARATOR
|
||||
@@ -176,6 +177,13 @@ public class HoodieTableMetaClient implements Serializable {
|
||||
return metaPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return z-index path
|
||||
*/
|
||||
public String getZindexPath() {
|
||||
return new Path(metaPath, ZINDEX_NAME).toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Temp Folder path
|
||||
*/
|
||||
|
||||
@@ -20,6 +20,7 @@ package org.apache.hudi.common.util;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieColumnRangeMetadata;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
@@ -41,12 +42,14 @@ import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Utility functions for working with parquet files.
|
||||
@@ -277,4 +280,59 @@ public class ParquetUtils extends BaseFileUtils {
|
||||
return candidateKeys.contains(recordKey);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse min/max statistics and null counts stored in parquet footers for the requested columns.
|
||||
*/
|
||||
public Collection<HoodieColumnRangeMetadata<Comparable>> readRangeFromParquetMetadata(Configuration conf, Path parquetFilePath, List<String> cols) {
|
||||
ParquetMetadata metadata = readMetadata(conf, parquetFilePath);
|
||||
// collect stats from all parquet blocks
|
||||
Map<String, List<HoodieColumnRangeMetadata<Comparable>>> columnToStatsListMap = metadata.getBlocks().stream().flatMap(blockMetaData -> {
|
||||
return blockMetaData.getColumns().stream().filter(f -> cols.contains(f.getPath().toDotString())).map(columnChunkMetaData ->
|
||||
new HoodieColumnRangeMetadata<>(parquetFilePath.getName(), columnChunkMetaData.getPath().toDotString(),
|
||||
columnChunkMetaData.getStatistics().genericGetMin(),
|
||||
columnChunkMetaData.getStatistics().genericGetMax(),
|
||||
columnChunkMetaData.getStatistics().getNumNulls(),
|
||||
columnChunkMetaData.getPrimitiveType().stringifier()));
|
||||
}).collect(Collectors.groupingBy(e -> e.getColumnName()));
|
||||
|
||||
// we only intend to keep file level statistics.
|
||||
return new ArrayList<>(columnToStatsListMap.values().stream()
|
||||
.map(blocks -> getColumnRangeInFile(blocks))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
private HoodieColumnRangeMetadata<Comparable> getColumnRangeInFile(final List<HoodieColumnRangeMetadata<Comparable>> blockRanges) {
|
||||
if (blockRanges.size() == 1) {
|
||||
// only one block in parquet file. we can just return that range.
|
||||
return blockRanges.get(0);
|
||||
} else {
|
||||
// there are multiple blocks. Compute min(block_mins) and max(block_maxs)
|
||||
return blockRanges.stream().reduce((b1, b2) -> combineRanges(b1, b2)).get();
|
||||
}
|
||||
}
|
||||
|
||||
private HoodieColumnRangeMetadata<Comparable> combineRanges(HoodieColumnRangeMetadata<Comparable> range1,
|
||||
HoodieColumnRangeMetadata<Comparable> range2) {
|
||||
final Comparable minValue;
|
||||
final Comparable maxValue;
|
||||
if (range1.getMinValue() != null && range2.getMinValue() != null) {
|
||||
minValue = range1.getMinValue().compareTo(range2.getMinValue()) < 0 ? range1.getMinValue() : range2.getMinValue();
|
||||
} else if (range1.getMinValue() == null) {
|
||||
minValue = range2.getMinValue();
|
||||
} else {
|
||||
minValue = range1.getMinValue();
|
||||
}
|
||||
|
||||
if (range1.getMaxValue() != null && range2.getMaxValue() != null) {
|
||||
maxValue = range1.getMaxValue().compareTo(range2.getMaxValue()) < 0 ? range2.getMaxValue() : range1.getMaxValue();
|
||||
} else if (range1.getMaxValue() == null) {
|
||||
maxValue = range2.getMaxValue();
|
||||
} else {
|
||||
maxValue = range1.getMaxValue();
|
||||
}
|
||||
|
||||
return new HoodieColumnRangeMetadata<>(range1.getFilePath(),
|
||||
range1.getColumnName(), minValue, maxValue, range1.getNumNulls() + range2.getNumNulls(), range1.getStringifier());
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user