1
0

[HUDI-1591] Implement Spark's FileIndex for Hudi to support queries via Hudi DataSource using non-globbed table path and partition pruning (#2651)

This commit is contained in:
pengzhiwei
2021-04-02 02:12:28 +08:00
committed by GitHub
parent 9804662bc8
commit 684622c7c9
22 changed files with 1074 additions and 82 deletions

View File

@@ -44,7 +44,7 @@ import java.util.stream.Collectors;
public class CustomAvroKeyGenerator extends BaseKeyGenerator {
private static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/";
private static final String SPLIT_REGEX = ":";
public static final String SPLIT_REGEX = ":";
/**
* Used as a part of config in CustomKeyGenerator.java.
@@ -117,8 +117,4 @@ public class CustomAvroKeyGenerator extends BaseKeyGenerator {
public String getDefaultPartitionPathSeparator() {
return DEFAULT_PARTITION_PATH_SEPARATOR;
}
public String getSplitRegex() {
return SPLIT_REGEX;
}
}

View File

@@ -90,7 +90,7 @@ public class CustomKeyGenerator extends BuiltinKeyGenerator {
return "";
}
for (String field : getPartitionPathFields()) {
String[] fieldWithType = field.split(customAvroKeyGenerator.getSplitRegex());
String[] fieldWithType = field.split(customAvroKeyGenerator.SPLIT_REGEX);
if (fieldWithType.length != 2) {
throw new HoodieKeyGeneratorException("Unable to find field names for partition path in proper format");
}

View File

@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.datasources
import java.util.TimeZone
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.execution.datasources.PartitioningUtils.PartitionValues
import org.apache.spark.sql.types.DataType
/**
 * Abstraction for parsing a single partition path into partition-column values.
 * Exists so that Hudi can plug in Spark-version-specific implementations of
 * Spark's internal partition parsing (note the org.apache.spark.sql.execution.datasources
 * package, which grants access to the package-private PartitioningUtils).
 * Extends Serializable so implementations can be shipped inside Spark tasks.
 */
trait SparkParsePartitionUtil extends Serializable {
/**
 * Parse one partition directory path into its partition values.
 *
 * @param path the partition directory path to parse
 * @param typeInference presumably whether to infer partition value data types
 *                      from the path strings — confirm against implementations
 * @param basePaths NOTE(review): looks like the table base path(s) at which
 *                  parsing stops ascending, mirroring Spark's own
 *                  PartitioningUtils.parsePartition — confirm
 * @param userSpecifiedDataTypes explicit data types keyed by partition column
 *                               name, taking precedence over inference
 * @param timeZone time zone used when interpreting date/timestamp partition values
 * @return the parsed PartitionValues, or None when the path yields no
 *         partition values (per the Option return type)
 */
def parsePartition(
path: Path,
typeInference: Boolean,
basePaths: Set[Path],
userSpecifiedDataTypes: Map[String, DataType],
timeZone: TimeZone): Option[PartitionValues]
}