[HUDI-3979] Optimize out mandatory columns when no merging is performed (#5430)

For MOR (Merge-On-Read) tables, when no merging is performed there is no point in reading either the primary-key or the pre-combine-key values (unless the query references them). Skipping these columns can save substantial resources that would otherwise be wasted reading them.
2022-07-22 15:32:44 -07:00
parent 6b84384022
commit 39f2a06c85
8 changed files with 186 additions and 84 deletions
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieRelations.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/TestHoodieRelations.scala
@@ -51,6 +51,4 @@ class TestHoodieRelations {
      requiredStructSchema.fields.toSeq
    )
  }
-
-
 }
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestParquetColumnProjection.scala
@@ -54,7 +54,7 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
    DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key -> classOf[NonpartitionedKeyGenerator].getName
  )

-  @Disabled("HUDI-3896")
+  @Disabled("Currently disabled b/c of the fallback to HadoopFsRelation")
  @Test
  def testBaseFileOnlyViewRelation(): Unit = {
    val tablePath = s"$basePath/cow"
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/spark/sql/hudi/TestInsertTable.scala
@@ -24,6 +24,7 @@ import org.apache.hudi.config.HoodieWriteConfig
 import org.apache.hudi.exception.HoodieDuplicateKeyException
 import org.apache.hudi.keygen.ComplexKeyGenerator
 import org.apache.spark.sql.SaveMode
+import org.apache.spark.sql.internal.SQLConf

 import java.io.File