
[HUDI-3206] Unify Hive's MOR implementations to avoid duplication (#4559)

Unify Hive's MOR implementations to avoid duplication across implementations for different file formats (Parquet, HFile, etc.)

- Extracted HoodieRealtimeFileInputFormatBase (extending the COW HoodieFileInputFormatBase base class); the resulting hierarchy is sketched below
- Rebased the Parquet and HFile implementations onto HoodieRealtimeFileInputFormatBase
- Tidying up
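
As a rough sketch of the hierarchy those bullets describe (Scala for brevity; the actual Hudi classes live in hudi-hadoop-mr and are Java, and every member below is illustrative rather than the real API):

// Illustrative only: real class bodies differ, only the shape is shown.
// COW base: listing/split logic shared by all Hudi input formats.
abstract class HoodieFileInputFormatBase {
  protected def listBaseFiles(): Seq[String] = Seq.empty // placeholder
}

// MOR base extracted by this commit: layers real-time (base-file + log-file
// merging) split handling on top of the COW base, independent of file format.
abstract class HoodieRealtimeFileInputFormatBase extends HoodieFileInputFormatBase {
  protected def toRealtimeSplits(baseFiles: Seq[String]): Seq[String] =
    baseFiles.map(f => s"$f (+ log files)") // placeholder for split pairing
}

// Format-specific classes now share that logic instead of duplicating it.
class HoodieParquetRealtimeInputFormat extends HoodieRealtimeFileInputFormatBase
class HoodieHFileRealtimeInputFormat extends HoodieRealtimeFileInputFormatBase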
Alexey Kudinkin
2022-02-07 11:06:28 -08:00
committed by GitHub
parent 773b317983
commit 3f263b82ce
10 changed files with 460 additions and 415 deletions


@@ -20,23 +20,20 @@ package org.apache.hudi.functional
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path}
import org.apache.hudi.common.fs.FSUtils
import org.apache.hudi.common.model.HoodieColumnRangeMetadata
import org.apache.hudi.common.util.ParquetUtils
import org.apache.hudi.index.columnstats.ColumnStatsIndexHelper
import org.apache.hudi.testutils.HoodieClientTestBase
import org.apache.spark.sql._
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.typedLit
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, RowFactory, SaveMode, SparkSession, functions}
import org.junit.jupiter.api.Assertions.{assertEquals, assertNotNull, assertTrue}
import org.junit.jupiter.api.{AfterEach, BeforeEach, Disabled, Test}
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import java.math.BigInteger
import java.nio.charset.StandardCharsets
import java.sql.{Date, Timestamp}
import scala.collection.JavaConverters._
import scala.util.{Random, Success}
import scala.util.Random
class TestColumnStatsIndex extends HoodieClientTestBase {
var spark: SparkSession = _
@@ -354,11 +351,10 @@ class TestColumnStatsIndex extends HoodieClientTestBase {
.distinct()
.collect()
.map(_.getString(0))
.sorted
val uuidToIdx: UserDefinedFunction = functions.udf((fileName: String) => {
val (uuid, idx) = uuids.zipWithIndex.find { case (uuid, _) => fileName.contains(uuid) }.get
fileName.replace(uuid, idx.toString)
val uuid = uuids.find(uuid => fileName.contains(uuid)).get
fileName.replace(uuid, "xxx")
})
ds.withColumn("file", uuidToIdx(ds("file")))
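
In isolation, the reworked UDF in this hunk just masks whichever known UUID appears in a file name, so expected test output no longer depends on randomly generated names. A minimal standalone sketch, with hypothetical UUIDs and file names standing in for the test's data:

import org.apache.spark.sql.{functions, SparkSession}

object MaskUuidsExample extends App {
  val spark = SparkSession.builder().master("local[1]").appName("mask-uuids").getOrCreate()
  import spark.implicits._

  // Hypothetical file-group UUIDs, standing in for the values collected in the test.
  val uuids = Seq("11111111-aaaa", "22222222-bbbb")

  // Same shape as the UDF above: find whichever known uuid the file name
  // contains and replace it with a fixed token.
  val maskUuid = functions.udf { fileName: String =>
    val uuid = uuids.find(u => fileName.contains(u)).get
    fileName.replace(uuid, "xxx")
  }

  val ds = Seq("11111111-aaaa_0-1-0.parquet", "22222222-bbbb_0-2-0.parquet").toDF("file")
  ds.withColumn("file", maskUuid($"file")).show(false) // both rows become xxx_..._.parquet
  spark.stop()
}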
@@ -409,12 +405,11 @@ class TestColumnStatsIndex extends HoodieClientTestBase {
.mkString("\n")
private def sort(df: DataFrame): DataFrame = {
// Since upon parsing JSON, Spark re-order columns in lexicographical order
// of their names, we have to shuffle new Z-index table columns order to match
// Rows are sorted by filename as well to avoid
val sortedCols = df.columns.sorted
// Sort dataset by the first 2 columns (to minimize non-determinism in case multiple files have the same
// value of the first column)
df.select(sortedCols.head, sortedCols.tail: _*)
.sort("file")
.sort("c1_maxValue", "c1_minValue")
}
}
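
The sort helper in the last hunk normalizes both column order and row order so two DataFrames can be compared textually. The same idea, stated standalone (the column names and arity are whatever the caller's schema provides; this sketch assumes at least two columns):

import org.apache.spark.sql.DataFrame

// Order columns lexicographically (Spark re-orders columns by name when
// parsing JSON), then order rows by the first two sorted columns so that
// rows sharing the first column's value still land in a stable order.
def normalize(df: DataFrame): DataFrame = {
  val sortedCols = df.columns.sorted
  df.select(sortedCols.head, sortedCols.tail: _*)
    .sort(sortedCols.head, sortedCols(1))
}

Comparing normalize(actual) against normalize(expected) then avoids spurious mismatches caused by Spark's nondeterministic row ordering.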