1
0

[HUDI-3125] spark-sql writes timestamp directly (#4471)

This commit is contained in:
Yann Byron
2022-01-09 15:43:25 +08:00
committed by GitHub
parent 0d8ca8da4e
commit 36790709f7
5 changed files with 188 additions and 7 deletions

View File

@@ -27,6 +27,8 @@ import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.sql.Timestamp;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -106,7 +108,8 @@ public class RowKeyGeneratorHelper {
if (fieldPos == -1 || row.isNullAt(fieldPos)) {
val = HUDI_DEFAULT_PARTITION_PATH;
} else {
val = row.getAs(field).toString();
Object data = row.get(fieldPos);
val = convertToTimestampIfInstant(data).toString();
if (val.isEmpty()) {
val = HUDI_DEFAULT_PARTITION_PATH;
}
@@ -115,11 +118,12 @@ public class RowKeyGeneratorHelper {
val = field + "=" + val;
}
} else { // nested
Object nestedVal = getNestedFieldVal(row, partitionPathPositions.get(field));
if (nestedVal.toString().contains(NULL_RECORDKEY_PLACEHOLDER) || nestedVal.toString().contains(EMPTY_RECORDKEY_PLACEHOLDER)) {
Object data = getNestedFieldVal(row, partitionPathPositions.get(field));
data = convertToTimestampIfInstant(data);
if (data.toString().contains(NULL_RECORDKEY_PLACEHOLDER) || data.toString().contains(EMPTY_RECORDKEY_PLACEHOLDER)) {
val = hiveStylePartitioning ? field + "=" + HUDI_DEFAULT_PARTITION_PATH : HUDI_DEFAULT_PARTITION_PATH;
} else {
val = hiveStylePartitioning ? field + "=" + nestedVal.toString() : nestedVal.toString();
val = hiveStylePartitioning ? field + "=" + data.toString() : data.toString();
}
}
return val;
@@ -266,4 +270,11 @@ public class RowKeyGeneratorHelper {
}
return positions;
}
/**
 * Normalizes a partition/record-key value coming from a Spark Row: a
 * {@link java.time.Instant} is converted to {@link java.sql.Timestamp}
 * (so its toString() yields the JDBC timestamp format), while any other
 * value is returned unchanged.
 *
 * @param data raw field value read from the row (may be any type)
 * @return a Timestamp when the input is an Instant, otherwise the input itself
 */
private static Object convertToTimestampIfInstant(Object data) {
  return (data instanceof Instant) ? Timestamp.from((Instant) data) : data;
}
}

View File

@@ -20,6 +20,7 @@ package org.apache.hudi
import java.nio.ByteBuffer
import java.sql.{Date, Timestamp}
import java.time.Instant
import org.apache.avro.Conversions.DecimalConversion
import org.apache.avro.LogicalTypes.{TimestampMicros, TimestampMillis}
@@ -301,9 +302,17 @@ object AvroConversionHelper {
}.orNull
}
case TimestampType => (item: Any) =>
// Convert time to microseconds since spark-avro by default converts TimestampType to
// Avro Logical TimestampMicros
Option(item).map(_.asInstanceOf[Timestamp].getTime * 1000).orNull
if (item == null) {
null
} else {
val timestamp = item match {
case i: Instant => Timestamp.from(i)
case t: Timestamp => t
}
// Convert time to microseconds since spark-avro by default converts TimestampType to
// Avro Logical TimestampMicros
timestamp.getTime * 1000
}
case DateType => (item: Any) =>
Option(item).map(_.asInstanceOf[Date].toLocalDate.toEpochDay.toInt).orNull
case ArrayType(elementType, _) =>