1
0

[HUDI-4173] Fix wrong results when the user reads a Hudi table with no base files via glob paths (#5723)

This commit is contained in:
RexAn
2022-06-21 01:32:34 +08:00
committed by GitHub
parent 7601e9e4c7
commit 17ac5a4573
7 changed files with 118 additions and 22 deletions

View File

@@ -104,14 +104,22 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
val fileSlices = fileIndex.listFileSlices(convertedPartitionFilters)
buildSplits(fileSlices.values.flatten.toSeq)
} else {
// TODO refactor to avoid iterating over listed files multiple times
val partitions = listLatestBaseFiles(globPaths, convertedPartitionFilters, dataFilters)
val partitionPaths = partitions.keys.toSeq
val inMemoryFileIndex = HoodieInMemoryFileIndex.create(sparkSession, globPaths)
val partitionDirs = inMemoryFileIndex.listFiles(partitionFilters, dataFilters)
val fsView = new HoodieTableFileSystemView(metaClient, timeline, partitionDirs.flatMap(_.files).toArray)
val partitionPaths = fsView.getPartitionPaths.asScala
if (partitionPaths.isEmpty || latestInstant.isEmpty) {
// If this is an empty table OR it has no completed commits yet, return
List.empty[HoodieMergeOnReadFileSplit]
} else {
val fileSlices = listFileSlices(partitionPaths)
val queryTimestamp = this.queryTimestamp.get
val fileSlices = partitionPaths.flatMap { partitionPath =>
val relativePath = getRelativePartitionPath(new Path(basePath), partitionPath)
fsView.getLatestMergedFileSlicesBeforeOrOn(relativePath, queryTimestamp).iterator().asScala.toSeq
}
buildSplits(fileSlices)
}
}
@@ -130,20 +138,6 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
HoodieMergeOnReadFileSplit(partitionedBaseFile, logFiles)
}.toList
}
/**
 * Collects the latest merged file slices, as of the relation's query timestamp, for every
 * one of the supplied partition paths.
 *
 * @param partitionPaths partition paths to look up
 * @return file slices of all given partitions, concatenated in partition order
 */
private def listFileSlices(partitionPaths: Seq[Path]): Seq[FileSlice] = {
  // NOTE: Going through [[InMemoryFileIndex]] is deliberate: it lets us benefit from
  //       [[FileStatusCache]] rather than listing the whole table one more time
  val fileIndex = HoodieInMemoryFileIndex.create(sparkSession, partitionPaths)
  val listedFiles = fileIndex.allFiles.toArray
  val fileSystemView = new HoodieTableFileSystemView(metaClient, timeline, listedFiles)
  // Shadowing-free alias of the relation-level (optional) query timestamp
  val asOfInstant = this.queryTimestamp.get

  partitionPaths.flatMap { absolutePartitionPath =>
    // The file-system view is addressed by the partition path relative to the table base path
    val relativePartitionPath = getRelativePartitionPath(new Path(basePath), absolutePartitionPath)
    val mergedSlices = fileSystemView.getLatestMergedFileSlicesBeforeOrOn(relativePartitionPath, asOfInstant)
    mergedSlices.iterator().asScala.toSeq
  }
}
}
object MergeOnReadSnapshotRelation {

View File

@@ -20,9 +20,12 @@ package org.apache.spark.execution.datasources
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path, PathFilter}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
import org.apache.hudi.SparkAdapterSupport
import org.apache.spark.HoodieHadoopFSUtils
import org.apache.spark.metrics.source.HiveCatalogMetrics
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.{InternalRow, expressions}
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BoundReference, Expression, InterpretedPredicate}
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.types.StructType
@@ -34,7 +37,77 @@ class HoodieInMemoryFileIndex(sparkSession: SparkSession,
parameters: Map[String, String],
userSpecifiedSchema: Option[StructType],
fileStatusCache: FileStatusCache = NoopCache)
extends InMemoryFileIndex(sparkSession, rootPathsSpecified, parameters, userSpecifiedSchema, fileStatusCache) {
extends InMemoryFileIndex(sparkSession, rootPathsSpecified, parameters, userSpecifiedSchema, fileStatusCache)
with SparkAdapterSupport {
/**
 * Lists the files picked by the provided partition filters, grouped by partition.
 *
 * If the partition spec declares no partition columns, the result is a single
 * [[PartitionDirectory]] with empty partition values holding all files. Otherwise the
 * partitions are pruned first and every remaining partition is paired with the children
 * files recorded for its directory (or with no files when nothing is recorded for it).
 *
 * NOTE: This mirrors the overridden method, except that files are screened by the custom
 *       `isDataPath` check, which keeps files whose names start with "."
 *
 * @param partitionFilters predicates used for pruning partitions
 * @param dataFilters      data predicates; not consulted by this implementation
 * @return the selected partitions along with their files
 */
override def listFiles(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
  val selectedPartitions =
    if (partitionSpec().partitionColumns.nonEmpty) {
      prunePartitions(partitionFilters, partitionSpec()).map {
        case PartitionPath(partitionValues, partitionDir) =>
          // A directory w/o an entry either does not exist or holds no children files
          val dataFiles = leafDirToChildrenFiles.get(partitionDir)
            .fold[Seq[FileStatus]](Nil)(children => children.filter(child => isDataPath(child.getPath)))
          PartitionDirectory(partitionValues, dataFiles)
      }
    } else {
      // Non-partitioned layout: everything goes into a single partition w/o partition values
      val dataFiles = allFiles().filter(file => isDataPath(file.getPath))
      List(PartitionDirectory(InternalRow.empty, dataFiles))
    }

  logTrace("Selected files after partition pruning:\n\t" + selectedPartitions.mkString("\n\t"))
  selectedPartitions
}
/**
 * Tells whether the given path should be treated as a data path.
 *
 * A path is rejected only when its last component starts with "_" and carries no "=";
 * names starting with "." are accepted.
 */
private def isDataPath(path: Path): Boolean = {
  val fileName = path.getName
  // Equivalent (by De Morgan) to: not (starts with "_" and lacks "=")
  !fileName.startsWith("_") || fileName.contains("=")
}
/**
 * Narrows down the partitions of the given spec to those satisfying the predicates that
 * reference partition columns only.
 *
 * @param predicates    candidate predicates; the ones referencing anything other than
 *                      partition columns are ignored
 * @param partitionSpec partition columns along with the known partitions
 * @return partitions matching the conjunction of the applicable predicates, or all of the
 *         partitions when no predicate is applicable
 */
private def prunePartitions(
    predicates: Seq[Expression],
    partitionSpec: PartitionSpec): Seq[PartitionPath] = {
  val columns = partitionSpec.partitionColumns
  val allPartitions = partitionSpec.partitions
  val columnNames = columns.map(_.name).toSet

  // Only predicates expressed purely in terms of partition columns could be evaluated
  // against partition values
  val applicablePredicates = predicates.filter { candidate =>
    candidate.references.map(_.name).toSet.subsetOf(columnNames)
  }

  if (applicablePredicates.isEmpty) {
    allPartitions
  } else {
    val conjunction = applicablePredicates.reduce(expressions.And)
    // Re-write attribute references into ordinals of the partition-values row
    val boundConjunction = conjunction.transform {
      case attribute: AttributeReference =>
        val ordinal = columns.indexWhere(column => attribute.name == column.name)
        BoundReference(ordinal, columns(ordinal).dataType, nullable = true)
    }
    val boundPredicate = sparkAdapter.createInterpretedPredicate(boundConjunction)

    val selected = allPartitions.filter { partition =>
      boundPredicate.eval(partition.values)
    }

    logInfo {
      val total = allPartitions.length
      val selectedSize = selected.length
      val percentPruned = (1 - selectedSize.toDouble / total.toDouble) * 100
      // Avoid reporting a meaningless percentage when there are no partitions at all
      val prunedShare = if (total == 0) "0" else s"$percentPruned%"
      s"Selected $selectedSize partitions out of $total, " +
        s"pruned $prunedShare partitions."
    }

    selected
  }
}
/**
* List leaf files of given paths. This method will submit a Spark job to do parallel