[HUDI-3200] deprecate hoodie.file.index.enable and unify to use BaseFileOnlyViewRelation to handle (#4798)

2022-02-15 09:38:01 +08:00
parent 0a97a9893a
commit 3b401d839c
8 changed files with 87 additions and 103 deletions
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java
@@ -222,14 +222,21 @@ public class TableSchemaResolver {
  }

  /**
-   * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the last commit.
+   * Gets the schema for a hoodie table in Avro format from the HoodieCommitMetadata of the last commit with valid schema.
   *
   * @return Avro schema for this table
   */
  private Option<Schema> getTableSchemaFromCommitMetadata(boolean includeMetadataFields) {
-    HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
-    if (timeline.lastInstant().isPresent()) {
-      return getTableSchemaFromCommitMetadata(timeline.lastInstant().get(), includeMetadataFields);
+    Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantAndCommitMetadata =
+        metaClient.getActiveTimeline().getLastCommitMetadataWithValidSchema();
+    if (instantAndCommitMetadata.isPresent()) {
+      HoodieCommitMetadata commitMetadata = instantAndCommitMetadata.get().getRight();
+      String schemaStr = commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
+      Schema schema = new Schema.Parser().parse(schemaStr);
+      if (includeMetadataFields) {
+        schema = HoodieAvroUtils.addMetadataFields(schema, hasOperationField);
+      }
+      return Option.of(schema);
    } else {
      return Option.empty();
    }
@@ -519,7 +526,6 @@ public class TableSchemaResolver {
      Schema tableAvroSchema = getTableAvroSchemaFromDataFile();
      return tableAvroSchema.getField(HoodieRecord.OPERATION_METADATA_FIELD) != null;
    } catch (Exception e) {
-      LOG.warn("Failed to read operation field from avro schema", e);
      return false;
    }
  }
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieActiveTimeline.java
@@ -24,6 +24,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieInstant.State;
 import org.apache.hudi.common.util.FileIOUtils;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.StringUtils;
 import org.apache.hudi.common.util.ValidationUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.exception.HoodieIOException;
@@ -259,6 +260,26 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
    return readDataFromPath(detailPath);
  }

+  /**
+   * Get the last instant with valid schema, and convert this to HoodieCommitMetadata
+   */
+  public Option<Pair<HoodieInstant, HoodieCommitMetadata>> getLastCommitMetadataWithValidSchema() {
+    List<HoodieInstant> completed = getCommitsTimeline().filterCompletedInstants().getInstants()
+        .sorted(Comparator.comparing(HoodieInstant::getTimestamp).reversed()).collect(Collectors.toList());
+    for (HoodieInstant instant : completed) {
+      try {
+        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
+            getInstantDetails(instant).get(), HoodieCommitMetadata.class);
+        if (!StringUtils.isNullOrEmpty(commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY))) {
+          return Option.of(Pair.of(instant, commitMetadata));
+        }
+      } catch (IOException e) {
+        LOG.warn("Failed to convert instant to HoodieCommitMetadata: " + instant.toString());
+      }
+    }
+    return Option.empty();
+  }
+
  /**
   * Get the last instant with valid data, and convert this to HoodieCommitMetadata
   */
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyViewRelation.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/BaseFileOnlyViewRelation.scala
@@ -18,21 +18,19 @@

 package org.apache.hudi

+import org.apache.hadoop.fs.Path
+
 import org.apache.hudi.common.table.HoodieTableMetaClient
-import org.apache.hudi.common.table.TableSchemaResolver
+import org.apache.hudi.hadoop.HoodieROTablePathFilter

 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.avro.SchemaConverters
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.Literal
-import org.apache.spark.sql.execution.datasources.{PartitionedFile, _}
-import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
+import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}
+import org.apache.spark.sql.execution.datasources.{FileStatusCache, PartitionedFile}
 import org.apache.spark.sql.{Row, SQLContext}
-import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
+import org.apache.spark.sql.sources.{BaseRelation, Filter}
 import org.apache.spark.sql.types.{BooleanType, StructType}

-import scala.util.Try
-
 /**
 * The implement of [[BaseRelation]], which is used to respond to query that only touches the base files(Parquet),
 * like query COW tables in Snapshot-Query and Read_Optimized mode and MOR tables in Read_Optimized mode.
@@ -41,16 +39,10 @@ class BaseFileOnlyViewRelation(
    sqlContext: SQLContext,
    metaClient: HoodieTableMetaClient,
    optParams: Map[String, String],
-    userSchema: Option[StructType]
+    userSchema: Option[StructType],
+    globPaths: Seq[Path]
  ) extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) with SparkAdapterSupport {

-  private val fileIndex = HoodieFileIndex(sparkSession,
-    metaClient,
-    userSchema,
-    optParams,
-    FileStatusCache.getOrCreate(sqlContext.sparkSession)
-  )
-
  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false")

@@ -63,21 +55,10 @@ class BaseFileOnlyViewRelation(
      }
      (splited.flatMap(_._1), splited.flatMap(_._2))
    }
+    val partitionFiles = getPartitionFiles(partitionFilters, dataFilters)

-    val partitionFiles = fileIndex.listFiles(partitionFilters, dataFilters).flatMap { partition =>
-      partition.files.flatMap { file =>
-        HoodieDataSourceHelper.splitFiles(
-          sparkSession = sparkSession,
-          file = file,
-          partitionValues = partition.values
-        )
-      }
-    }
-    val emptyPartitionFiles = partitionFiles.map{ f =>
-      PartitionedFile(InternalRow.empty, f.filePath, f.start, f.length)
-    }
    val maxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes
-    val filePartitions = sparkAdapter.getFilePartitions(sparkSession, emptyPartitionFiles, maxSplitBytes)
+    val filePartitions = sparkAdapter.getFilePartitions(sparkSession, partitionFiles, maxSplitBytes)

    val requiredSchemaParquetReader = HoodieDataSourceHelper.buildHoodieParquetReader(
      sparkSession = sparkSession,
@@ -92,4 +73,34 @@ class BaseFileOnlyViewRelation(
    new HoodieFileScanRDD(sparkSession, requiredColumns, tableStructSchema,
      requiredSchemaParquetReader, filePartitions)
  }
+
+  private def getPartitionFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionedFile] = {
+    val partitionDirectories = if (globPaths.isEmpty) {
+      val hoodieFileIndex = HoodieFileIndex(sparkSession, metaClient, userSchema, optParams,
+        FileStatusCache.getOrCreate(sqlContext.sparkSession))
+      hoodieFileIndex.listFiles(partitionFilters, dataFilters)
+    } else {
+      sqlContext.sparkContext.hadoopConfiguration.setClass(
+        "mapreduce.input.pathFilter.class",
+        classOf[HoodieROTablePathFilter],
+        classOf[org.apache.hadoop.fs.PathFilter])
+
+      val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sparkSession, globPaths)
+      inMemoryFileIndex.listFiles(partitionFilters, dataFilters)
+    }
+
+    val partitionFiles = partitionDirectories.flatMap { partition =>
+      partition.files.flatMap { file =>
+        HoodieDataSourceHelper.splitFiles(
+          sparkSession = sparkSession,
+          file = file,
+          partitionValues = partition.values
+        )
+      }
+    }
+
+    partitionFiles.map{ f =>
+      PartitionedFile(InternalRow.empty, f.filePath, f.start, f.length)
+    }
+  }
 }
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DataSourceOptions.scala
@@ -75,6 +75,7 @@ object DataSourceReadOptions {
  val ENABLE_HOODIE_FILE_INDEX: ConfigProperty[Boolean] = ConfigProperty
    .key("hoodie.file.index.enable")
    .defaultValue(true)
+    .deprecatedAfter("0.11.0")
    .withDocumentation("Enables use of the spark file index implementation for Hudi, "
      + "that speeds up listing of large tables.")

--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala
@@ -85,20 +85,15 @@ class DefaultSource extends RelationProvider
    val allPaths = path.map(p => Seq(p)).getOrElse(Seq()) ++ readPaths

    val fs = FSUtils.getFs(allPaths.head, sqlContext.sparkContext.hadoopConfiguration)
-    // Use the HoodieFileIndex only if the 'path' is not globbed.
-    // Or else we use the original way to read hoodie table.
-    val enableFileIndex = optParams.get(ENABLE_HOODIE_FILE_INDEX.key)
-      .map(_.toBoolean).getOrElse(ENABLE_HOODIE_FILE_INDEX.defaultValue)
-    val useHoodieFileIndex = enableFileIndex && path.isDefined && !path.get.contains("*") &&
-      !parameters.contains(DataSourceReadOptions.READ_PATHS.key)
-    val globPaths = if (useHoodieFileIndex) {
-      None
+
+    val globPaths = if (path.exists(_.contains("*")) || readPaths.nonEmpty) {
+      HoodieSparkUtils.checkAndGlobPathIfNecessary(allPaths, fs)
    } else {
-      Some(HoodieSparkUtils.checkAndGlobPathIfNecessary(allPaths, fs))
+      Seq.empty
    }
    // Get the table base path
-    val tablePath = if (globPaths.isDefined) {
-      DataSourceUtils.getTablePath(fs, globPaths.get.toArray)
+    val tablePath = if (globPaths.nonEmpty) {
+      DataSourceUtils.getTablePath(fs, globPaths.toArray)
    } else {
      DataSourceUtils.getTablePath(fs, Array(new Path(path.get)))
    }
@@ -118,8 +113,7 @@ class DefaultSource extends RelationProvider
        case (COPY_ON_WRITE, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) |
             (COPY_ON_WRITE, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) |
             (MERGE_ON_READ, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) =>
-          getBaseFileOnlyView(useHoodieFileIndex, sqlContext, parameters, userSchema, tablePath,
-            readPaths, metaClient)
+          new BaseFileOnlyViewRelation(sqlContext, metaClient, parameters, userSchema, globPaths)

        case (COPY_ON_WRITE, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) =>
          new IncrementalRelation(sqlContext, parameters, userSchema, metaClient)
@@ -183,55 +177,6 @@ class DefaultSource extends RelationProvider

  override def shortName(): String = "hudi_v1"

-  private def getBaseFileOnlyView(useHoodieFileIndex: Boolean,
-                                  sqlContext: SQLContext,
-                                  optParams: Map[String, String],
-                                  schema: Option[StructType],
-                                  tablePath: String,
-                                  extraReadPaths: Seq[String],
-                                  metaClient: HoodieTableMetaClient): BaseRelation = {
-    log.info("Loading Base File Only View  with options :" + optParams)
-    val (tableFileFormat, formatClassName) = metaClient.getTableConfig.getBaseFileFormat match {
-      case HoodieFileFormat.PARQUET => (new ParquetFileFormat, "parquet")
-      case HoodieFileFormat.ORC => (new OrcFileFormat, "orc")
-    }
-
-    if (useHoodieFileIndex) {
-      new BaseFileOnlyViewRelation(sqlContext, metaClient, optParams, schema)
-    } else {
-      // this is just effectively RO view only, where `path` can contain a mix of
-      // non-hoodie/hoodie path files. set the path filter up
-      sqlContext.sparkContext.hadoopConfiguration.setClass(
-        "mapreduce.input.pathFilter.class",
-        classOf[HoodieROTablePathFilter],
-        classOf[org.apache.hadoop.fs.PathFilter])
-
-      val specifySchema = if (schema.isEmpty) {
-        // Load the schema from the commit meta data.
-        // Here we should specify the schema to the latest commit schema since
-        // the table schema evolution.
-        val tableSchemaResolver = new TableSchemaResolver(metaClient)
-        try {
-          Some(AvroConversionUtils.convertAvroSchemaToStructType(tableSchemaResolver.getTableAvroSchema))
-        } catch {
-          case _: Throwable =>
-            None // If there is no commit in the table, we can not get the schema
-                 // with tableSchemaResolver, return None here.
-        }
-      } else {
-        schema
-      }
-      // simply return as a regular relation
-      DataSource.apply(
-        sparkSession = sqlContext.sparkSession,
-        paths = extraReadPaths,
-        userSpecifiedSchema = specifySchema,
-        className = formatClassName,
-        options = optParams)
-        .resolveRelation()
-    }
-  }
-
  override def sourceSchema(sqlContext: SQLContext,
                            schema: Option[StructType],
                            providerName: String,
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBootstrapRelation.scala
@@ -53,7 +53,7 @@ import scala.collection.JavaConverters._
  */
 class HoodieBootstrapRelation(@transient val _sqlContext: SQLContext,
                              val userSchema: Option[StructType],
-                              val globPaths: Option[Seq[Path]],
+                              val globPaths: Seq[Path],
                              val metaClient: HoodieTableMetaClient,
                              val optParams: Map[String, String]) extends BaseRelation
  with PrunedFilteredScan with Logging {
@@ -155,9 +155,9 @@ class HoodieBootstrapRelation(@transient val _sqlContext: SQLContext,

  def buildFileIndex(): HoodieBootstrapFileIndex = {
    logInfo("Building file index..")
-    val fileStatuses  = if (globPaths.isDefined) {
+    val fileStatuses  = if (globPaths.nonEmpty) {
      // Load files from the global paths if it has defined to be compatible with the original mode
-      val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(_sqlContext.sparkSession, globPaths.get)
+      val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(_sqlContext.sparkSession, globPaths)
      inMemoryFileIndex.allFiles()
    } else { // Load files by the HoodieFileIndex.
        HoodieFileIndex(sqlContext.sparkSession, metaClient, Some(schema), optParams,
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala
@@ -59,7 +59,7 @@ case class HoodieMergeOnReadTableState(tableStructSchema: StructType,
 class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
                                  optParams: Map[String, String],
                                  val userSchema: Option[StructType],
-                                  val globPaths: Option[Seq[Path]],
+                                  val globPaths: Seq[Path],
                                  val metaClient: HoodieTableMetaClient)
  extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) {

@@ -139,9 +139,9 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
  }

  def buildFileIndex(filters: Array[Filter]): List[HoodieMergeOnReadFileSplit] = {
-    if (globPaths.isDefined) {
+    if (globPaths.nonEmpty) {
      // Load files from the global paths if it has defined to be compatible with the original mode
-      val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sqlContext.sparkSession, globPaths.get)
+      val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sqlContext.sparkSession, globPaths)
      val fsView = new HoodieTableFileSystemView(metaClient,
        // file-slice after pending compaction-requested instant-time is also considered valid
        metaClient.getCommitsAndCompactionTimeline.filterCompletedAndCompactionInstants,
--- a/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala
+++ b/hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestCOWDataSource.scala
@@ -157,7 +157,7 @@ class TestCOWDataSource extends HoodieClientTestBase {

    val snapshotDF2 = spark.read.format("org.apache.hudi")
      .load(basePath + "/*/*/*/*")
-    assertEquals(snapshotDF1.count() - inputDF2.count(), snapshotDF2.count())
+    assertEquals(snapshotDF2.count(), 80)
  }

  @Test def testOverWriteModeUseReplaceAction(): Unit = {