1
0

[HUDI-1591] Implement Spark's FileIndex for Hudi to support queries via Hudi DataSource using non-globbed table path and partition pruning (#2651)

This commit is contained in:
pengzhiwei
2021-04-02 02:12:28 +08:00
committed by GitHub
parent 9804662bc8
commit 684622c7c9
22 changed files with 1074 additions and 82 deletions

View File

@@ -44,7 +44,7 @@ import java.util.stream.Collectors;
public class CustomAvroKeyGenerator extends BaseKeyGenerator {
private static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/";
private static final String SPLIT_REGEX = ":";
public static final String SPLIT_REGEX = ":";
/**
* Used as a part of config in CustomKeyGenerator.java.
@@ -117,8 +117,4 @@ public class CustomAvroKeyGenerator extends BaseKeyGenerator {
public String getDefaultPartitionPathSeparator() {
return DEFAULT_PARTITION_PATH_SEPARATOR;
}
public String getSplitRegex() {
return SPLIT_REGEX;
}
}

View File

@@ -90,7 +90,7 @@ public class CustomKeyGenerator extends BuiltinKeyGenerator {
return "";
}
for (String field : getPartitionPathFields()) {
String[] fieldWithType = field.split(customAvroKeyGenerator.getSplitRegex());
String[] fieldWithType = field.split(customAvroKeyGenerator.SPLIT_REGEX);
if (fieldWithType.length != 2) {
throw new HoodieKeyGeneratorException("Unable to find field names for partition path in proper format");
}

View File

@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.execution.datasources
import java.util.TimeZone
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.execution.datasources.PartitioningUtils.PartitionValues
import org.apache.spark.sql.types.DataType
/**
 * Abstraction for parsing a single partition path into partition-column values.
 * Exists so that Hudi can plug in Spark-version-specific implementations of
 * Spark's internal partition parsing (note the org.apache.spark.sql.execution.datasources
 * package, which grants access to the package-private PartitioningUtils).
 * Extends Serializable so implementations can be shipped inside Spark tasks.
 */
trait SparkParsePartitionUtil extends Serializable {
/**
 * Parse one partition directory path into its partition values.
 *
 * @param path the partition directory path to parse
 * @param typeInference presumably whether to infer partition value data types
 *                      from the path strings — confirm against implementations
 * @param basePaths NOTE(review): looks like the table base path(s) at which
 *                  parsing stops ascending, mirroring Spark's own
 *                  PartitioningUtils.parsePartition — confirm
 * @param userSpecifiedDataTypes explicit data types keyed by partition column
 *                               name, taking precedence over inference
 * @param timeZone time zone used when interpreting date/timestamp partition values
 * @return the parsed PartitionValues, or None when the path yields no
 *         partition values (per the Option return type)
 */
def parsePartition(
path: Path,
typeInference: Boolean,
basePaths: Set[Path],
userSpecifiedDataTypes: Map[String, DataType],
timeZone: TimeZone): Option[PartitionValues]
}