1
0

[HUDI-4353] Column stats data skipping for flink (#6026)

This commit is contained in:
Danny Chan
2022-07-03 08:29:31 +08:00
committed by GitHub
parent bdf73b2650
commit 47792a3186
30 changed files with 1930 additions and 81 deletions

View File

@@ -18,11 +18,10 @@
package org.apache.hudi.sink.utils;
import org.apache.flink.configuration.Configuration;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.flink.configuration.Configuration;
import org.junit.jupiter.api.Test;
import java.lang.reflect.Method;

View File

@@ -58,7 +58,7 @@ public class TestFileIndex {
conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);
conf.setBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING, hiveStylePartitioning);
TestData.writeData(TestData.DATA_SET_INSERT, conf);
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf);
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf, TestConfigurations.ROW_TYPE);
List<String> partitionKeys = Collections.singletonList("partition");
List<Map<String, String>> partitions = fileIndex.getPartitions(partitionKeys, "default", hiveStylePartitioning);
assertTrue(partitions.stream().allMatch(m -> m.size() == 1));
@@ -79,7 +79,7 @@ public class TestFileIndex {
conf.setString(FlinkOptions.KEYGEN_CLASS_NAME, NonpartitionedAvroKeyGenerator.class.getName());
conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);
TestData.writeData(TestData.DATA_SET_INSERT, conf);
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf);
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf, TestConfigurations.ROW_TYPE);
List<String> partitionKeys = Collections.singletonList("");
List<Map<String, String>> partitions = fileIndex.getPartitions(partitionKeys, "default", false);
assertThat(partitions.size(), is(0));
@@ -94,7 +94,7 @@ public class TestFileIndex {
void testFileListingEmptyTable(boolean enableMetadata) {
Configuration conf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath());
conf.setBoolean(FlinkOptions.METADATA_ENABLED, enableMetadata);
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf);
FileIndex fileIndex = FileIndex.instance(new Path(tempFile.getAbsolutePath()), conf, TestConfigurations.ROW_TYPE);
List<String> partitionKeys = Collections.singletonList("partition");
List<Map<String, String>> partitions = fileIndex.getPartitions(partitionKeys, "default", false);
assertThat(partitions.size(), is(0));

View File

@@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.source.stats;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.utils.TestConfigurations;
import org.apache.hudi.utils.TestData;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import java.io.File;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.jupiter.api.Assertions.assertThrows;
/**
 * Test cases for {@link ColumnStatsIndices}.
 */
public class TestColumnStatsIndices {
  @TempDir
  File tempFile;

  @Test
  void testTransposeColumnStatsIndex() throws Exception {
    final String basePath = tempFile.getAbsolutePath();
    // Enable the metadata table plus the column stats partition so that
    // stats records get generated by the write below.
    Configuration conf = TestConfigurations.getDefaultConf(basePath);
    conf.setBoolean(FlinkOptions.METADATA_ENABLED, true);
    conf.setBoolean(FlinkOptions.READ_DATA_SKIPPING_ENABLED, true);
    conf.setString("hoodie.metadata.index.column.stats.enable", "true");

    HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder()
        .enable(true)
        .withMetadataIndexColumnStats(true)
        .build();
    TestData.writeData(TestData.DATA_SET_INSERT, conf);

    // explicit query columns
    String[] queryColumns = {"uuid", "age"};
    List<RowData> rawIndexRows = ColumnStatsIndices.readColumnStatsIndex(basePath, metadataConfig, queryColumns);
    Pair<List<RowData>, String[]> transposed = ColumnStatsIndices
        .transposeColumnStatsIndex(rawIndexRows, queryColumns, TestConfigurations.ROW_TYPE);
    assertThat("The schema columns should sort by natural order",
        Arrays.toString(transposed.getRight()), is("[age, uuid]"));

    List<RowData> statsRows = filterOutFileNames(transposed.getLeft());
    assertThat(statsRows.size(), is(4));
    final String expected = "["
        + "+I(2,18,20,0,id5,id6,0), "
        + "+I(2,23,33,0,id1,id2,0), "
        + "+I(2,31,53,0,id3,id4,0), "
        + "+I(2,44,56,0,id7,id8,0)]";
    assertThat(statsRows.toString(), is(expected));

    // no query columns, only for tests
    assertThrows(IllegalArgumentException.class,
        () -> ColumnStatsIndices.readColumnStatsIndex(basePath, metadataConfig, new String[0]));
  }

  /**
   * Drops the leading file-name field from every index row, then sorts the
   * rows for a deterministic comparison in the assertions above.
   */
  private static List<RowData> filterOutFileNames(List<RowData> indexRows) {
    return indexRows.stream()
        .map(TestColumnStatsIndices::dropFileName)
        // sort by age min values
        .sorted(Comparator.comparingInt(r -> r.getInt(1)))
        .collect(Collectors.toList());
  }

  /** Returns a copy of {@code row} without its first (file name) field. */
  private static RowData dropFileName(RowData row) {
    GenericRowData original = (GenericRowData) row;
    GenericRowData stripped = new GenericRowData(original.getArity() - 1);
    for (int pos = 1; pos < original.getArity(); pos++) {
      stripped.setField(pos - 1, original.getField(pos));
    }
    return stripped;
  }
}

View File

@@ -0,0 +1,374 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.source.stats;
import org.apache.hudi.utils.TestData;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.flink.table.expressions.FieldReferenceExpression;
import org.apache.flink.table.expressions.ValueLiteralExpression;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.RowType;
import org.junit.jupiter.api.Test;
import java.util.Arrays;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Test cases for {@link ExpressionEvaluator}.
 *
 * <p>Each test obtains an evaluator through {@code getInstance()} and re-binds
 * it (field reference, literal value, column stats row) before every
 * {@code eval()} call. The evaluators are stateful, so the bind/eval ordering
 * within each test is significant and must not be rearranged.
 */
public class TestExpressionEvaluator {
  // Schema of the data table; "f_int" sits at position 2 and is the column
  // referenced by all the predicates below.
  private static final DataType ROW_DATA_TYPE = DataTypes.ROW(
      DataTypes.FIELD("f_tinyint", DataTypes.TINYINT()),
      DataTypes.FIELD("f_smallint", DataTypes.SMALLINT()),
      DataTypes.FIELD("f_int", DataTypes.INT()),
      DataTypes.FIELD("f_long", DataTypes.BIGINT()),
      DataTypes.FIELD("f_float", DataTypes.FLOAT()),
      DataTypes.FIELD("f_double", DataTypes.DOUBLE()),
      DataTypes.FIELD("f_boolean", DataTypes.BOOLEAN()),
      DataTypes.FIELD("f_decimal", DataTypes.DECIMAL(10, 2)),
      DataTypes.FIELD("f_bytes", DataTypes.VARBINARY(10)),
      DataTypes.FIELD("f_string", DataTypes.VARCHAR(10)),
      DataTypes.FIELD("f_time", DataTypes.TIME(3)),
      DataTypes.FIELD("f_date", DataTypes.DATE()),
      DataTypes.FIELD("f_timestamp", DataTypes.TIMESTAMP(3))
  ).notNull();

  // Schema of one column-stats index row: file name, total value count, then
  // (min, max, null count) triples for the f_int, f_string and f_timestamp columns.
  private static final DataType INDEX_ROW_DATA_TYPE = DataTypes.ROW(
      DataTypes.FIELD("file_name", DataTypes.STRING()),
      DataTypes.FIELD("value_cnt", DataTypes.BIGINT()),
      DataTypes.FIELD("f_int_min", DataTypes.INT()),
      DataTypes.FIELD("f_int_max", DataTypes.INT()),
      DataTypes.FIELD("f_int_null_cnt", DataTypes.BIGINT()),
      DataTypes.FIELD("f_string_min", DataTypes.VARCHAR(10)),
      DataTypes.FIELD("f_string_max", DataTypes.VARCHAR(10)),
      DataTypes.FIELD("f_string_null_cnt", DataTypes.BIGINT()),
      DataTypes.FIELD("f_timestamp_min", DataTypes.TIMESTAMP(3)),
      DataTypes.FIELD("f_timestamp_max", DataTypes.TIMESTAMP(3)),
      DataTypes.FIELD("f_timestamp_null_cnt", DataTypes.BIGINT())
  ).notNull();

  private static final RowType INDEX_ROW_TYPE = (RowType) INDEX_ROW_DATA_TYPE.getLogicalType();

  @Test
  void testEqualTo() {
    // "f_int = 12": a file can only match when 12 falls inside [min, max].
    ExpressionEvaluator.EqualTo equalTo = ExpressionEvaluator.EqualTo.getInstance();
    FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
    ValueLiteralExpression vExpr = new ValueLiteralExpression(12);

    RowData indexRow1 = intIndexRow(11, 13);
    equalTo.bindFieldReference(rExpr)
        .bindVal(vExpr)
        .bindColStats(indexRow1, queryFields(2), rExpr);
    assertTrue(equalTo.eval(), "11 < 12 < 13");

    RowData indexRow2 = intIndexRow(12, 13);
    equalTo.bindColStats(indexRow2, queryFields(2), rExpr);
    assertTrue(equalTo.eval(), "12 <= 12 < 13");

    RowData indexRow3 = intIndexRow(11, 12);
    equalTo.bindColStats(indexRow3, queryFields(2), rExpr);
    assertTrue(equalTo.eval(), "11 < 12 <= 12");

    // literal above the max -> file pruned
    RowData indexRow4 = intIndexRow(10, 11);
    equalTo.bindColStats(indexRow4, queryFields(2), rExpr);
    assertFalse(equalTo.eval(), "11 < 12");

    // literal below the min -> file pruned
    RowData indexRow5 = intIndexRow(13, 14);
    equalTo.bindColStats(indexRow5, queryFields(2), rExpr);
    assertFalse(equalTo.eval(), "12 < 13");

    // null stats: equality can never be proven
    RowData indexRow6 = intIndexRow(null, null);
    equalTo.bindColStats(indexRow6, queryFields(2), rExpr);
    assertFalse(equalTo.eval(), "12 <> null");

    // null literal: "= null" matches nothing
    equalTo.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
    assertFalse(equalTo.eval(), "null <> null");
  }

  @Test
  void testNotEqualTo() {
    // "f_int <> 12": only an exact [12, 12] range could be pruned, so every
    // range here (and the null-stats row) is kept.
    ExpressionEvaluator.NotEqualTo notEqualTo = ExpressionEvaluator.NotEqualTo.getInstance();
    FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
    ValueLiteralExpression vExpr = new ValueLiteralExpression(12);

    RowData indexRow1 = intIndexRow(11, 13);
    notEqualTo.bindFieldReference(rExpr)
        .bindVal(vExpr)
        .bindColStats(indexRow1, queryFields(2), rExpr);
    assertTrue(notEqualTo.eval(), "11 <> 12 && 12 <> 13");

    RowData indexRow2 = intIndexRow(12, 13);
    notEqualTo.bindColStats(indexRow2, queryFields(2), rExpr);
    assertTrue(notEqualTo.eval(), "12 <> 13");

    RowData indexRow3 = intIndexRow(11, 12);
    notEqualTo.bindColStats(indexRow3, queryFields(2), rExpr);
    assertTrue(notEqualTo.eval(), "11 <> 12");

    RowData indexRow4 = intIndexRow(10, 11);
    notEqualTo.bindColStats(indexRow4, queryFields(2), rExpr);
    assertTrue(notEqualTo.eval(), "10 <> 12 and 11 < 12");

    RowData indexRow5 = intIndexRow(13, 14);
    notEqualTo.bindColStats(indexRow5, queryFields(2), rExpr);
    assertTrue(notEqualTo.eval(), "12 <> 13 and 12 <> 14");

    RowData indexRow6 = intIndexRow(null, null);
    notEqualTo.bindColStats(indexRow6, queryFields(2), rExpr);
    assertTrue(notEqualTo.eval(), "12 <> null");

    notEqualTo.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
    assertTrue(notEqualTo.eval(), "null <> null");
  }

  @Test
  void testIsNull() {
    // "f_int IS NULL": decided purely by the null count in the stats row
    // (intIndexRow defaults the null count to 2).
    ExpressionEvaluator.IsNull isNull = ExpressionEvaluator.IsNull.getInstance();
    FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);

    RowData indexRow1 = intIndexRow(11, 13);
    isNull.bindFieldReference(rExpr)
        .bindColStats(indexRow1, queryFields(2), rExpr);
    assertTrue(isNull.eval(), "2 nulls");

    RowData indexRow2 = intIndexRow(12, 13, 0L);
    isNull.bindColStats(indexRow2, queryFields(2), rExpr);
    assertFalse(isNull.eval(), "0 nulls");
  }

  @Test
  void testIsNotNull() {
    ExpressionEvaluator.IsNotNull isNotNull = ExpressionEvaluator.IsNotNull.getInstance();
    FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);

    RowData indexRow1 = intIndexRow(11, 13);
    isNotNull.bindFieldReference(rExpr)
        .bindColStats(indexRow1, queryFields(2), rExpr);
    assertTrue(isNotNull.eval(), "min 11 is not null");

    // NOTE(review): the file is kept even with null min/max and a zero null
    // count — presumably a conservative keep-the-file choice; confirm against
    // the evaluator implementation.
    RowData indexRow2 = intIndexRow(null, null, 0L);
    isNotNull.bindColStats(indexRow2, queryFields(2), rExpr);
    assertTrue(isNotNull.eval(), "min is null and 0 nulls");
  }

  @Test
  void testLessThan() {
    // "f_int < 12": a file matches only when min < 12.
    ExpressionEvaluator.LessThan lessThan = ExpressionEvaluator.LessThan.getInstance();
    FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
    ValueLiteralExpression vExpr = new ValueLiteralExpression(12);

    RowData indexRow1 = intIndexRow(11, 13);
    lessThan.bindFieldReference(rExpr)
        .bindVal(vExpr)
        .bindColStats(indexRow1, queryFields(2), rExpr);
    assertTrue(lessThan.eval(), "12 < 13");

    RowData indexRow2 = intIndexRow(12, 13);
    lessThan.bindColStats(indexRow2, queryFields(2), rExpr);
    assertFalse(lessThan.eval(), "min 12 = 12");

    RowData indexRow3 = intIndexRow(11, 12);
    lessThan.bindColStats(indexRow3, queryFields(2), rExpr);
    assertTrue(lessThan.eval(), "11 < 12");

    RowData indexRow4 = intIndexRow(10, 11);
    lessThan.bindColStats(indexRow4, queryFields(2), rExpr);
    assertTrue(lessThan.eval(), "11 < 12");

    RowData indexRow5 = intIndexRow(13, 14);
    lessThan.bindColStats(indexRow5, queryFields(2), rExpr);
    assertFalse(lessThan.eval(), "12 < min 13");

    RowData indexRow6 = intIndexRow(null, null);
    lessThan.bindColStats(indexRow6, queryFields(2), rExpr);
    assertFalse(lessThan.eval(), "12 <> null");

    lessThan.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
    assertFalse(lessThan.eval(), "null <> null");
  }

  @Test
  void testGreaterThan() {
    // "f_int > 12": a file matches only when max > 12.
    ExpressionEvaluator.GreaterThan greaterThan = ExpressionEvaluator.GreaterThan.getInstance();
    FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
    ValueLiteralExpression vExpr = new ValueLiteralExpression(12);

    RowData indexRow1 = intIndexRow(11, 13);
    greaterThan.bindFieldReference(rExpr)
        .bindVal(vExpr)
        .bindColStats(indexRow1, queryFields(2), rExpr);
    assertTrue(greaterThan.eval(), "12 < 13");

    RowData indexRow2 = intIndexRow(12, 13);
    greaterThan.bindColStats(indexRow2, queryFields(2), rExpr);
    assertTrue(greaterThan.eval(), "12 < 13");

    RowData indexRow3 = intIndexRow(11, 12);
    greaterThan.bindColStats(indexRow3, queryFields(2), rExpr);
    assertFalse(greaterThan.eval(), "max 12 = 12");

    RowData indexRow4 = intIndexRow(10, 11);
    greaterThan.bindColStats(indexRow4, queryFields(2), rExpr);
    assertFalse(greaterThan.eval(), "max 11 < 12");

    RowData indexRow5 = intIndexRow(13, 14);
    greaterThan.bindColStats(indexRow5, queryFields(2), rExpr);
    assertTrue(greaterThan.eval(), "12 < 13");

    RowData indexRow6 = intIndexRow(null, null);
    greaterThan.bindColStats(indexRow6, queryFields(2), rExpr);
    assertFalse(greaterThan.eval(), "12 <> null");

    greaterThan.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
    assertFalse(greaterThan.eval(), "null <> null");
  }

  @Test
  void testLessThanOrEqual() {
    // "f_int <= 12": a file matches only when min <= 12.
    ExpressionEvaluator.LessThanOrEqual lessThanOrEqual = ExpressionEvaluator.LessThanOrEqual.getInstance();
    FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
    ValueLiteralExpression vExpr = new ValueLiteralExpression(12);

    RowData indexRow1 = intIndexRow(11, 13);
    lessThanOrEqual.bindFieldReference(rExpr)
        .bindVal(vExpr)
        .bindColStats(indexRow1, queryFields(2), rExpr);
    assertTrue(lessThanOrEqual.eval(), "11 < 12");

    RowData indexRow2 = intIndexRow(12, 13);
    lessThanOrEqual.bindColStats(indexRow2, queryFields(2), rExpr);
    assertTrue(lessThanOrEqual.eval(), "min 12 = 12");

    RowData indexRow3 = intIndexRow(11, 12);
    lessThanOrEqual.bindColStats(indexRow3, queryFields(2), rExpr);
    assertTrue(lessThanOrEqual.eval(), "max 12 = 12");

    RowData indexRow4 = intIndexRow(10, 11);
    lessThanOrEqual.bindColStats(indexRow4, queryFields(2), rExpr);
    assertTrue(lessThanOrEqual.eval(), "max 11 < 12");

    RowData indexRow5 = intIndexRow(13, 14);
    lessThanOrEqual.bindColStats(indexRow5, queryFields(2), rExpr);
    assertFalse(lessThanOrEqual.eval(), "12 < 13");

    RowData indexRow6 = intIndexRow(null, null);
    lessThanOrEqual.bindColStats(indexRow6, queryFields(2), rExpr);
    assertFalse(lessThanOrEqual.eval(), "12 <> null");

    lessThanOrEqual.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
    assertFalse(lessThanOrEqual.eval(), "null <> null");
  }

  @Test
  void testGreaterThanOrEqual() {
    // "f_int >= 12": a file matches only when max >= 12.
    ExpressionEvaluator.GreaterThanOrEqual greaterThanOrEqual = ExpressionEvaluator.GreaterThanOrEqual.getInstance();
    FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);
    ValueLiteralExpression vExpr = new ValueLiteralExpression(12);

    RowData indexRow1 = intIndexRow(11, 13);
    greaterThanOrEqual.bindFieldReference(rExpr)
        .bindVal(vExpr)
        .bindColStats(indexRow1, queryFields(2), rExpr);
    assertTrue(greaterThanOrEqual.eval(), "12 < 13");

    RowData indexRow2 = intIndexRow(12, 13);
    greaterThanOrEqual.bindColStats(indexRow2, queryFields(2), rExpr);
    assertTrue(greaterThanOrEqual.eval(), "min 12 = 12");

    RowData indexRow3 = intIndexRow(11, 12);
    greaterThanOrEqual.bindColStats(indexRow3, queryFields(2), rExpr);
    assertTrue(greaterThanOrEqual.eval(), "max 12 = 12");

    RowData indexRow4 = intIndexRow(10, 11);
    greaterThanOrEqual.bindColStats(indexRow4, queryFields(2), rExpr);
    assertFalse(greaterThanOrEqual.eval(), "max 11 < 12");

    RowData indexRow5 = intIndexRow(13, 14);
    greaterThanOrEqual.bindColStats(indexRow5, queryFields(2), rExpr);
    assertTrue(greaterThanOrEqual.eval(), "12 < 13");

    RowData indexRow6 = intIndexRow(null, null);
    greaterThanOrEqual.bindColStats(indexRow6, queryFields(2), rExpr);
    assertFalse(greaterThanOrEqual.eval(), "12 <> null");

    greaterThanOrEqual.bindVal(new ValueLiteralExpression(null, DataTypes.INT()));
    assertFalse(greaterThanOrEqual.eval(), "null <> null");
  }

  @Test
  void testIn() {
    // "f_int IN (12)": same pruning semantics as equality for a single value.
    ExpressionEvaluator.In in = ExpressionEvaluator.In.getInstance();
    FieldReferenceExpression rExpr = new FieldReferenceExpression("f_int", DataTypes.INT(), 2, 2);

    RowData indexRow1 = intIndexRow(11, 13);
    in.bindFieldReference(rExpr)
        .bindColStats(indexRow1, queryFields(2), rExpr);
    in.bindVals(12);
    assertTrue(in.eval(), "11 < 12 < 13");

    RowData indexRow2 = intIndexRow(12, 13);
    in.bindColStats(indexRow2, queryFields(2), rExpr);
    assertTrue(in.eval(), "min 12 = 12");

    RowData indexRow3 = intIndexRow(11, 12);
    in.bindColStats(indexRow3, queryFields(2), rExpr);
    assertTrue(in.eval(), "max 12 = 12");

    RowData indexRow4 = intIndexRow(10, 11);
    in.bindColStats(indexRow4, queryFields(2), rExpr);
    assertFalse(in.eval(), "max 11 < 12");

    RowData indexRow5 = intIndexRow(13, 14);
    in.bindColStats(indexRow5, queryFields(2), rExpr);
    assertFalse(in.eval(), "12 < 13");

    RowData indexRow6 = intIndexRow(null, null);
    in.bindColStats(indexRow6, queryFields(2), rExpr);
    assertFalse(in.eval(), "12 <> null");

    // IN with a null element matches nothing
    in.bindVals((Object) null);
    assertFalse(in.eval(), "null <> null");
  }

  /** Index row with the given f_int min/max and a default null count of 2. */
  private static RowData intIndexRow(Integer minVal, Integer maxVal) {
    return intIndexRow(minVal, maxVal, 2L);
  }

  /**
   * Builds a full index row for the given f_int stats; the file name, value
   * count and the string/timestamp stats are fixed filler values.
   */
  private static RowData intIndexRow(Integer minVal, Integer maxVal, Long nullCnt) {
    return indexRow(StringData.fromString("f1"), 100L,
        minVal, maxVal, nullCnt,
        StringData.fromString("1"), StringData.fromString("100"), 5L,
        TimestampData.fromEpochMillis(1), TimestampData.fromEpochMillis(100), 3L);
  }

  /** Wraps the fields into a RowData shaped like {@link #INDEX_ROW_TYPE}. */
  private static RowData indexRow(Object... fields) {
    return TestData.insertRow(INDEX_ROW_TYPE, fields);
  }

  /** Returns the {@link #ROW_DATA_TYPE} fields at the given positions. */
  private static RowType.RowField[] queryFields(int... pos) {
    List<RowType.RowField> fields = ((RowType) ROW_DATA_TYPE.getLogicalType()).getFields();
    return Arrays.stream(pos).mapToObj(fields::get).toArray(RowType.RowField[]::new);
  }
}

View File

@@ -1256,6 +1256,37 @@ public class ITTestHoodieDataSource extends AbstractTestBase {
assertRowsEquals(partitionResult, "[+I[1, 2022-02-02]]");
}
@Test
void testWriteAndReadWithDataSkipping() {
  // End-to-end check of data skipping: the metadata table and its column
  // stats index are enabled at write time, then filtered reads are issued.
  TableEnvironment tableEnv = batchTableEnv;
  String hoodieTableDDL = sql("t1")
      .option(FlinkOptions.PATH, tempFile.getAbsolutePath())
      .option(FlinkOptions.METADATA_ENABLED, true)
      .option("hoodie.metadata.index.column.stats.enable", true)
      .option(FlinkOptions.READ_DATA_SKIPPING_ENABLED, true)
      .end();
  tableEnv.executeSql(hoodieTableDDL);
  execInsertSql(tableEnv, TestSQL.INSERT_T1);
  // full scan first: data skipping must not lose any rows
  List<Row> result1 = CollectionUtil.iterableToList(
      () -> tableEnv.sqlQuery("select * from t1").execute().collect());
  assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT);
  // apply filters
  List<Row> result2 = CollectionUtil.iterableToList(
      () -> tableEnv.sqlQuery("select * from t1 where uuid > 'id5' and age > 20").execute().collect());
  assertRowsEquals(result2, "["
      + "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], "
      + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]");
  // filter by timestamp
  List<Row> result3 = CollectionUtil.iterableToList(
      () -> tableEnv.sqlQuery("select * from t1 where ts > TIMESTAMP '1970-01-01 00:00:05'").execute().collect());
  assertRowsEquals(result3, "["
      + "+I[id6, Emma, 20, 1970-01-01T00:00:06, par3], "
      + "+I[id7, Bob, 44, 1970-01-01T00:00:07, par4], "
      + "+I[id8, Han, 56, 1970-01-01T00:00:08, par4]]");
}
// -------------------------------------------------------------------------
// Utilities
// -------------------------------------------------------------------------

View File

@@ -28,6 +28,7 @@ import org.apache.flink.api.common.io.FileInputFormat;
import org.apache.flink.api.common.io.InputFormat;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.data.RowData;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.function.ThrowingSupplier;
@@ -38,7 +39,6 @@ import org.slf4j.LoggerFactory;
import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;
@@ -76,22 +76,18 @@ public class TestHoodieTableSource {
Arrays.asList(conf.getString(FlinkOptions.PARTITION_PATH_FIELD).split(",")),
"default-par",
conf);
Path[] paths = tableSource.getReadPaths();
assertNotNull(paths);
String[] names = Arrays.stream(paths).map(Path::getName)
.sorted(Comparator.naturalOrder()).toArray(String[]::new);
assertThat(Arrays.toString(names), is("[par1, par2, par3, par4]"));
FileStatus[] fileStatuses = tableSource.getReadFiles();
assertNotNull(fileStatuses);
assertThat(fileStatuses.length, is(4));
// apply partition pruning
Map<String, String> partitions = new HashMap<>();
partitions.put("partition", "par1");
tableSource.applyPartitions(Collections.singletonList(partitions));
Path[] paths2 = tableSource.getReadPaths();
assertNotNull(paths2);
String[] names2 = Arrays.stream(paths2).map(Path::getName)
.sorted(Comparator.naturalOrder()).toArray(String[]::new);
assertThat(Arrays.toString(names2), is("[par1]"));
FileStatus[] fileStatuses2 = tableSource.getReadFiles();
assertNotNull(fileStatuses2);
assertThat(fileStatuses2.length, is(1));
}
@Test

View File

@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.utils;
import org.apache.hudi.avro.model.HoodieMetadataRecord;
import org.apache.hudi.metadata.HoodieMetadataPayload;
import org.apache.hudi.util.AvroSchemaConverter;
import org.apache.avro.Schema;
import org.apache.flink.table.types.DataType;
import org.junit.jupiter.api.Test;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
/**
 * Test cases for {@link org.apache.hudi.util.AvroSchemaConverter}.
 */
public class TestAvroSchemaConverter {
  @Test
  void testUnionSchemaWithMultipleRecordTypes() {
    // Convert the metadata record Avro schema (whose payload field is a union
    // over several record types, per this test's name) and verify that the
    // column-stats branch maps to the expected nested ROW type.
    Schema metadataSchema = HoodieMetadataRecord.SCHEMA$;
    int colStatsPos = HoodieMetadataRecord.SCHEMA$.getField(HoodieMetadataPayload.SCHEMA_FIELD_ID_COLUMN_STATS).pos();
    DataType converted = AvroSchemaConverter.convertToDataType(metadataSchema);
    final String expected = "ROW<"
        + "`fileName` STRING, "
        + "`columnName` STRING, "
        + "`minValue` ROW<`wrapper` RAW('java.lang.Object', ?) NOT NULL>, "
        + "`maxValue` ROW<`wrapper` RAW('java.lang.Object', ?) NOT NULL>, "
        + "`valueCount` BIGINT, "
        + "`nullCount` BIGINT, "
        + "`totalSize` BIGINT, "
        + "`totalUncompressedSize` BIGINT, "
        + "`isDeleted` BOOLEAN NOT NULL>";
    assertThat(converted.getChildren().get(colStatsPos).toString(), is(expected));
  }
}

View File

@@ -318,6 +318,9 @@ public class TestConfigurations {
}
}
/**
* Tool to construct the catalog DDL.
*/
public static class Catalog {
private final String catalogName;
private String catalogPath = ".";

View File

@@ -97,6 +97,6 @@ public class TestUtils {
public static StreamReadMonitoringFunction getMonitorFunc(Configuration conf) {
final String basePath = conf.getString(FlinkOptions.PATH);
return new StreamReadMonitoringFunction(conf, new Path(basePath), 1024 * 1024L, null);
return new StreamReadMonitoringFunction(conf, new Path(basePath), TestConfigurations.ROW_TYPE, 1024 * 1024L, null);
}
}

View File

@@ -18,11 +18,11 @@
package org.apache.hudi.utils;
import org.apache.flink.configuration.Configuration;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.table.view.FileSystemViewStorageType;
import org.apache.hudi.util.ViewStorageProperties;
import org.apache.flink.configuration.Configuration;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;