
[HUDI-3985] Refactor DLASyncTool to support read hoodie table as spark datasource table (#5532)

huberylee
2022-05-20 22:25:32 +08:00
committed by GitHub
parent c7576f7613
commit 85b146d3d5
26 changed files with 1281 additions and 974 deletions

View File

@@ -27,9 +27,8 @@ import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
import org.apache.hudi.hive.util.ConfigUtils;
import org.apache.hudi.sync.common.util.ConfigUtils;
import org.apache.hudi.hive.util.HiveSchemaUtil;
import org.apache.hudi.hive.util.Parquet2SparkSchemaUtils;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
import org.apache.hudi.sync.common.AbstractSyncTool;
@@ -43,20 +42,13 @@ import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static org.apache.parquet.schema.OriginalType.UTF8;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
/**
* Tool to sync a hoodie HDFS table with a hive metastore table. Either use it as a api
* HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive-sync.jar HiveSyncTool [args]
@@ -248,8 +240,9 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
Map<String, String> tableProperties = ConfigUtils.toMap(hiveSyncConfig.tableProperties);
Map<String, String> serdeProperties = ConfigUtils.toMap(hiveSyncConfig.serdeProperties);
if (hiveSyncConfig.syncAsSparkDataSourceTable) {
Map<String, String> sparkTableProperties = getSparkTableProperties(hiveSyncConfig.sparkSchemaLengthThreshold, schema);
Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized);
Map<String, String> sparkTableProperties = getSparkTableProperties(hiveSyncConfig.partitionFields,
hiveSyncConfig.sparkVersion, hiveSyncConfig.sparkSchemaLengthThreshold, schema);
Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized, hiveSyncConfig.basePath);
tableProperties.putAll(sparkTableProperties);
serdeProperties.putAll(sparkSerdeProperties);
}
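
The two helper calls added above are the visible surface of the refactor: the Spark-specific property builders no longer read the sync config from inside HiveSyncTool but receive the partition fields, Spark version, schema-length threshold, and table base path as explicit arguments. The class that now hosts these helpers is not part of this excerpt, so the interface below is only a sketch of the signatures implied by the call sites; its name and placement are assumptions, not the commit's actual API.

import java.util.List;
import java.util.Map;

import org.apache.parquet.schema.MessageType;

// Hypothetical interface capturing the helper signatures implied by the new
// call sites above; the actual class hosting them is not shown in this diff.
interface SparkDataSourceTablePropertyBuilder {
  Map<String, String> getSparkTableProperties(List<String> partitionFields,
                                              String sparkVersion,
                                              int schemaLengthThreshold,
                                              MessageType schema);

  Map<String, String> getSparkSerdeProperties(boolean readAsOptimized, String basePath);
}
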
@@ -309,75 +302,6 @@ public class HiveSyncTool extends AbstractSyncTool implements AutoCloseable {
return schemaChanged;
}
/**
* Get Spark Sql related table properties. This is used for spark datasource table.
* @param schema The schema to write to the table.
* @return A new parameters added the spark's table properties.
*/
private Map<String, String> getSparkTableProperties(int schemaLengthThreshold, MessageType schema) {
// Convert the schema and partition info used by spark sql to hive table properties.
// The following code refers to the spark code in
// https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
GroupType originGroupType = schema.asGroupType();
List<String> partitionNames = hiveSyncConfig.partitionFields;
List<Type> partitionCols = new ArrayList<>();
List<Type> dataCols = new ArrayList<>();
Map<String, Type> column2Field = new HashMap<>();
for (Type field : originGroupType.getFields()) {
column2Field.put(field.getName(), field);
}
// Get partition columns and data columns.
for (String partitionName : partitionNames) {
// Default the unknown partition fields to be String.
// Keep the same logical with HiveSchemaUtil#getPartitionKeyType.
partitionCols.add(column2Field.getOrDefault(partitionName,
new PrimitiveType(Type.Repetition.REQUIRED, BINARY, partitionName, UTF8)));
}
for (Type field : originGroupType.getFields()) {
if (!partitionNames.contains(field.getName())) {
dataCols.add(field);
}
}
List<Type> reOrderedFields = new ArrayList<>();
reOrderedFields.addAll(dataCols);
reOrderedFields.addAll(partitionCols);
GroupType reOrderedType = new GroupType(originGroupType.getRepetition(), originGroupType.getName(), reOrderedFields);
Map<String, String> sparkProperties = new HashMap<>();
sparkProperties.put("spark.sql.sources.provider", "hudi");
if (!StringUtils.isNullOrEmpty(hiveSyncConfig.sparkVersion)) {
sparkProperties.put("spark.sql.create.version", hiveSyncConfig.sparkVersion);
}
// Split the schema string to multi-parts according the schemaLengthThreshold size.
String schemaString = Parquet2SparkSchemaUtils.convertToSparkSchemaJson(reOrderedType);
int numSchemaPart = (schemaString.length() + schemaLengthThreshold - 1) / schemaLengthThreshold;
sparkProperties.put("spark.sql.sources.schema.numParts", String.valueOf(numSchemaPart));
// Add each part of schema string to sparkProperties
for (int i = 0; i < numSchemaPart; i++) {
int start = i * schemaLengthThreshold;
int end = Math.min(start + schemaLengthThreshold, schemaString.length());
sparkProperties.put("spark.sql.sources.schema.part." + i, schemaString.substring(start, end));
}
// Add partition columns
if (!partitionNames.isEmpty()) {
sparkProperties.put("spark.sql.sources.schema.numPartCols", String.valueOf(partitionNames.size()));
for (int i = 0; i < partitionNames.size(); i++) {
sparkProperties.put("spark.sql.sources.schema.partCol." + i, partitionNames.get(i));
}
}
return sparkProperties;
}
private Map<String, String> getSparkSerdeProperties(boolean readAsOptimized) {
Map<String, String> sparkSerdeProperties = new HashMap<>();
sparkSerdeProperties.put("path", hiveSyncConfig.basePath);
sparkSerdeProperties.put(ConfigUtils.IS_QUERY_AS_RO_TABLE, String.valueOf(readAsOptimized));
return sparkSerdeProperties;
}
/**
* Syncs the list of storage partitions passed in (checks if the partition is in hive, if not adds it or if the
* partition path does not match, it updates the partition path).
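
The hunk above deletes getSparkTableProperties and getSparkSerdeProperties from HiveSyncTool itself; the keys they wrote are what make Hive expose the synced table as a Spark datasource (Hudi) table. A minimal, self-contained sketch of the properties the sync ends up attaching, for a hypothetical table at /tmp/hudi/trips partitioned by dt (key names come from the removed code, all values are illustrative only):

import java.util.HashMap;
import java.util.Map;

public class SparkSyncPropertiesExample {
  public static void main(String[] args) {
    // Table properties that mark the Hive table as a Spark datasource (Hudi) table.
    Map<String, String> tableProperties = new HashMap<>();
    tableProperties.put("spark.sql.sources.provider", "hudi");
    tableProperties.put("spark.sql.create.version", "3.2.1");          // illustrative Spark version
    tableProperties.put("spark.sql.sources.schema.numParts", "1");     // schema JSON fits in one part here
    tableProperties.put("spark.sql.sources.schema.part.0",
        "{\"type\":\"struct\",\"fields\":[/* data columns first, then partition columns */]}");
    tableProperties.put("spark.sql.sources.schema.numPartCols", "1");
    tableProperties.put("spark.sql.sources.schema.partCol.0", "dt");

    // Serde properties pointing Spark back at the table base path and query mode.
    Map<String, String> serdeProperties = new HashMap<>();
    serdeProperties.put("path", "/tmp/hudi/trips");                    // illustrative base path
    serdeProperties.put("hoodie.query.as.ro.table", "false");          // ConfigUtils.IS_QUERY_AS_RO_TABLE

    System.out.println(tableProperties);
    System.out.println(serdeProperties);
  }
}
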

View File

@@ -1,78 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hive.util;
import java.util.HashMap;
import java.util.Map;
import org.apache.hudi.common.util.StringUtils;
public class ConfigUtils {
/**
* Config stored in hive serde properties to tell query engine (spark/flink) to
* read the table as a read-optimized table when this config is true.
*/
public static final String IS_QUERY_AS_RO_TABLE = "hoodie.query.as.ro.table";
/**
* Convert the key-value config to a map.The format of the config
* is a key-value pair just like "k1=v1\nk2=v2\nk3=v3".
* @param keyValueConfig
* @return
*/
public static Map<String, String> toMap(String keyValueConfig) {
if (StringUtils.isNullOrEmpty(keyValueConfig)) {
return new HashMap<>();
}
String[] keyvalues = keyValueConfig.split("\n");
Map<String, String> tableProperties = new HashMap<>();
for (String keyValue : keyvalues) {
String[] keyValueArray = keyValue.split("=");
if (keyValueArray.length == 1 || keyValueArray.length == 2) {
String key = keyValueArray[0].trim();
String value = keyValueArray.length == 2 ? keyValueArray[1].trim() : "";
tableProperties.put(key, value);
} else {
throw new IllegalArgumentException("Bad key-value config: " + keyValue + ", must be the"
+ " format 'key = value'");
}
}
return tableProperties;
}
/**
* Convert map config to key-value string.The format of the config
* is a key-value pair just like "k1=v1\nk2=v2\nk3=v3".
* @param config
* @return
*/
public static String configToString(Map<String, String> config) {
if (config == null) {
return null;
}
StringBuilder sb = new StringBuilder();
for (Map.Entry<String, String> entry : config.entrySet()) {
if (sb.length() > 0) {
sb.append("\n");
}
sb.append(entry.getKey()).append("=").append(entry.getValue());
}
return sb.toString();
}
}
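
The file removed above is the hive-sync copy of ConfigUtils; per the import changes earlier in this commit, callers now use org.apache.hudi.sync.common.util.ConfigUtils instead. Assuming the class keeps the same behavior at its new location, a minimal usage sketch of the newline-separated key=value format described in the javadoc (the property values are examples only):

import java.util.Map;

import org.apache.hudi.sync.common.util.ConfigUtils;

public class ConfigUtilsExample {
  public static void main(String[] args) {
    // Parse a "k1=v1\nk2=v2" style config string into a map.
    Map<String, String> props =
        ConfigUtils.toMap("hoodie.table.name=trips\n" + ConfigUtils.IS_QUERY_AS_RO_TABLE + "=false");
    System.out.println(props.get("hoodie.table.name"));   // trips

    // Round-trip the map back to the newline-separated string form.
    System.out.println(ConfigUtils.configToString(props));
  }
}
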

View File

@@ -1,171 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.hive.util;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
/**
* Convert the parquet schema to spark schema' json string.
* This code is refer to org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter
* in spark project.
*/
public class Parquet2SparkSchemaUtils {
public static String convertToSparkSchemaJson(GroupType parquetSchema) {
String fieldsJsonString = parquetSchema.getFields().stream().map(field -> {
switch (field.getRepetition()) {
case OPTIONAL:
return "{\"name\":\"" + field.getName() + "\",\"type\":" + convertFieldType(field)
+ ",\"nullable\":true,\"metadata\":{}}";
case REQUIRED:
return "{\"name\":\"" + field.getName() + "\",\"type\":" + convertFieldType(field)
+ ",\"nullable\":false,\"metadata\":{}}";
case REPEATED:
String arrayType = arrayType(field, false);
return "{\"name\":\"" + field.getName() + "\",\"type\":" + arrayType
+ ",\"nullable\":false,\"metadata\":{}}";
default:
throw new UnsupportedOperationException("Unsupport convert " + field + " to spark sql type");
}
}).reduce((a, b) -> a + "," + b).orElse("");
return "{\"type\":\"struct\",\"fields\":[" + fieldsJsonString + "]}";
}
private static String convertFieldType(Type field) {
if (field instanceof PrimitiveType) {
return "\"" + convertPrimitiveType((PrimitiveType) field) + "\"";
} else {
assert field instanceof GroupType;
return convertGroupField((GroupType) field);
}
}
private static String convertPrimitiveType(PrimitiveType field) {
PrimitiveType.PrimitiveTypeName typeName = field.getPrimitiveTypeName();
OriginalType originalType = field.getOriginalType();
switch (typeName) {
case BOOLEAN: return "boolean";
case FLOAT: return "float";
case DOUBLE: return "double";
case INT32:
if (originalType == null) {
return "integer";
}
switch (originalType) {
case INT_8: return "byte";
case INT_16: return "short";
case INT_32: return "integer";
case DATE: return "date";
case DECIMAL:
return "decimal(" + field.getDecimalMetadata().getPrecision() + ","
+ field.getDecimalMetadata().getScale() + ")";
default: throw new UnsupportedOperationException("Unsupport convert " + typeName + " to spark sql type");
}
case INT64:
if (originalType == null) {
return "long";
}
switch (originalType) {
case INT_64: return "long";
case DECIMAL:
return "decimal(" + field.getDecimalMetadata().getPrecision() + ","
+ field.getDecimalMetadata().getScale() + ")";
case TIMESTAMP_MICROS:
case TIMESTAMP_MILLIS:
return "timestamp";
default:
throw new UnsupportedOperationException("Unsupport convert " + typeName + " to spark sql type");
}
case INT96: return "timestamp";
case BINARY:
if (originalType == null) {
return "binary";
}
switch (originalType) {
case UTF8:
case ENUM:
case JSON:
return "string";
case BSON: return "binary";
case DECIMAL:
return "decimal(" + field.getDecimalMetadata().getPrecision() + ","
+ field.getDecimalMetadata().getScale() + ")";
default:
throw new UnsupportedOperationException("Unsupport convert " + typeName + " to spark sql type");
}
case FIXED_LEN_BYTE_ARRAY:
switch (originalType) {
case DECIMAL:
return "decimal(" + field.getDecimalMetadata().getPrecision() + ","
+ field.getDecimalMetadata().getScale() + ")";
default:
throw new UnsupportedOperationException("Unsupport convert " + typeName + " to spark sql type");
}
default:
throw new UnsupportedOperationException("Unsupport convert " + typeName + " to spark sql type");
}
}
private static String convertGroupField(GroupType field) {
if (field.getOriginalType() == null) {
return convertToSparkSchemaJson(field);
}
switch (field.getOriginalType()) {
case LIST:
ValidationUtils.checkArgument(field.getFieldCount() == 1, "Illegal List type: " + field);
Type repeatedType = field.getType(0);
if (isElementType(repeatedType, field.getName())) {
return arrayType(repeatedType, false);
} else {
Type elementType = repeatedType.asGroupType().getType(0);
boolean optional = elementType.isRepetition(OPTIONAL);
return arrayType(elementType, optional);
}
case MAP:
case MAP_KEY_VALUE:
GroupType keyValueType = field.getType(0).asGroupType();
Type keyType = keyValueType.getType(0);
Type valueType = keyValueType.getType(1);
boolean valueOptional = valueType.isRepetition(OPTIONAL);
return "{\"type\":\"map\", \"keyType\":" + convertFieldType(keyType)
+ ",\"valueType\":" + convertFieldType(valueType)
+ ",\"valueContainsNull\":" + valueOptional + "}";
default:
throw new UnsupportedOperationException("Unsupport convert " + field + " to spark sql type");
}
}
private static String arrayType(Type elementType, boolean containsNull) {
return "{\"type\":\"array\", \"elementType\":" + convertFieldType(elementType) + ",\"containsNull\":" + containsNull + "}";
}
private static boolean isElementType(Type repeatedType, String parentName) {
return repeatedType.isPrimitive() || repeatedType.asGroupType().getFieldCount() > 1
|| repeatedType.getName().equals("array") || repeatedType.getName().equals(parentName + "_tuple");
}
}
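
The removed Parquet2SparkSchemaUtils likewise moves to org.apache.hudi.sync.common.util (see the test import change below). Assuming the conversion logic is unchanged by the move, a small sketch of what convertToSparkSchemaJson produces for a toy parquet schema, following the conversion rules above:

import org.apache.hudi.sync.common.util.Parquet2SparkSchemaUtils;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class SchemaJsonExample {
  public static void main(String[] args) {
    // Toy parquet schema: required long id, optional UTF8 (string) name.
    MessageType schema = Types.buildMessage()
        .required(PrimitiveTypeName.INT64).named("id")
        .optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("name")
        .named("record");

    // Expected output per the rules above, roughly:
    // {"type":"struct","fields":[
    //   {"name":"id","type":"long","nullable":false,"metadata":{}},
    //   {"name":"name","type":"string","nullable":true,"metadata":{}}]}
    System.out.println(Parquet2SparkSchemaUtils.convertToSparkSchemaJson(schema));
  }
}
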

View File

@@ -28,7 +28,7 @@ import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.hive.testutils.HiveTestUtil;
import org.apache.hudi.hive.util.ConfigUtils;
import org.apache.hudi.sync.common.util.ConfigUtils;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;

View File

@@ -18,7 +18,7 @@
package org.apache.hudi.hive;
import org.apache.hudi.hive.util.Parquet2SparkSchemaUtils;
import org.apache.hudi.sync.common.util.Parquet2SparkSchemaUtils;
import org.apache.spark.sql.execution.SparkSqlParser;
import org.apache.spark.sql.execution.datasources.parquet.SparkToParquetSchemaConverter;
import org.apache.spark.sql.internal.SQLConf;