[HUDI-3457] Refactored Spark DataSource Relations to avoid code duplication (#4877)
Refactoring Spark DataSource Relations to avoid code duplication. Following Relations were in scope: - BaseFileOnlyViewRelation - MergeOnReadSnapshotRelaation - MergeOnReadIncrementalRelation
This commit is contained in:
@@ -0,0 +1,29 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi
|
||||||
|
|
||||||
|
object HoodieConversionUtils {
|
||||||
|
|
||||||
|
def toJavaOption[T](opt: Option[T]): org.apache.hudi.common.util.Option[T] =
|
||||||
|
if (opt.isDefined) org.apache.hudi.common.util.Option.of(opt.get) else org.apache.hudi.common.util.Option.empty()
|
||||||
|
|
||||||
|
def toScalaOption[T](opt: org.apache.hudi.common.util.Option[T]): Option[T] =
|
||||||
|
if (opt.isPresent) Some(opt.get) else None
|
||||||
|
|
||||||
|
}
|
||||||
@@ -118,11 +118,6 @@ object HoodieSparkUtils extends SparkAdapterSupport {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
def createInMemoryFileIndex(sparkSession: SparkSession, globbedPaths: Seq[Path]): InMemoryFileIndex = {
|
|
||||||
val fileStatusCache = FileStatusCache.getOrCreate(sparkSession)
|
|
||||||
new InMemoryFileIndex(sparkSession, globbedPaths, Map(), Option.empty, fileStatusCache)
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @deprecated please use other overload [[createRdd]]
|
* @deprecated please use other overload [[createRdd]]
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -198,7 +198,7 @@ public abstract class BaseHoodieTableFileIndex {
|
|||||||
// that is under the pending compaction process, new log-file will bear the compaction's instant (on the
|
// that is under the pending compaction process, new log-file will bear the compaction's instant (on the
|
||||||
// timeline) in its name, as opposed to the base-file's commit instant. To make sure we're not filtering
|
// timeline) in its name, as opposed to the base-file's commit instant. To make sure we're not filtering
|
||||||
// such log-file we have to _always_ include pending compaction instants into consideration
|
// such log-file we have to _always_ include pending compaction instants into consideration
|
||||||
// TODO(HUDI-3302) re-evaluate whether we should not filter any commits in here
|
// TODO(HUDI-3302) re-evaluate whether we should filter any commits in here
|
||||||
HoodieTimeline timeline = metaClient.getCommitsAndCompactionTimeline();
|
HoodieTimeline timeline = metaClient.getCommitsAndCompactionTimeline();
|
||||||
if (shouldIncludePendingCommits) {
|
if (shouldIncludePendingCommits) {
|
||||||
return timeline;
|
return timeline;
|
||||||
|
|||||||
@@ -509,7 +509,7 @@ public class TableSchemaResolver {
|
|||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws IOException {
|
public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws IOException {
|
||||||
Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null);
|
try (Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null)) {
|
||||||
HoodieDataBlock lastBlock = null;
|
HoodieDataBlock lastBlock = null;
|
||||||
while (reader.hasNext()) {
|
while (reader.hasNext()) {
|
||||||
HoodieLogBlock block = reader.next();
|
HoodieLogBlock block = reader.next();
|
||||||
@@ -517,11 +517,8 @@ public class TableSchemaResolver {
|
|||||||
lastBlock = (HoodieDataBlock) block;
|
lastBlock = (HoodieDataBlock) block;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
reader.close();
|
return lastBlock != null ? new AvroSchemaConverter().convert(lastBlock.getSchema()) : null;
|
||||||
if (lastBlock != null) {
|
|
||||||
return new AvroSchemaConverter().convert(lastBlock.getSchema());
|
|
||||||
}
|
}
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isHasOperationField() {
|
public boolean isHasOperationField() {
|
||||||
|
|||||||
@@ -19,21 +19,11 @@
|
|||||||
package org.apache.hudi.hadoop.utils;
|
package org.apache.hudi.hadoop.utils;
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
|
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
|
||||||
import org.apache.hadoop.mapred.FileSplit;
|
import org.apache.hadoop.mapred.FileSplit;
|
||||||
import org.apache.hadoop.mapred.JobConf;
|
import org.apache.hadoop.mapred.JobConf;
|
||||||
import org.apache.hudi.common.fs.FSUtils;
|
|
||||||
import org.apache.hudi.common.model.FileSlice;
|
|
||||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
|
||||||
import org.apache.hudi.common.model.HoodieLogFile;
|
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
|
||||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.common.util.collection.Pair;
|
|
||||||
import org.apache.hudi.exception.HoodieException;
|
|
||||||
import org.apache.hudi.hadoop.realtime.HoodieRealtimeBootstrapBaseFileSplit;
|
import org.apache.hudi.hadoop.realtime.HoodieRealtimeBootstrapBaseFileSplit;
|
||||||
import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit;
|
import org.apache.hudi.hadoop.realtime.HoodieRealtimeFileSplit;
|
||||||
import org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo;
|
import org.apache.hudi.hadoop.realtime.HoodieVirtualKeyInfo;
|
||||||
@@ -41,14 +31,6 @@ import org.apache.hudi.hadoop.realtime.RealtimeSplit;
|
|||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import static org.apache.hudi.TypeUtils.unsafeCast;
|
import static org.apache.hudi.TypeUtils.unsafeCast;
|
||||||
|
|
||||||
public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
|
public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
|
||||||
@@ -67,41 +49,6 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return parquet file with a list of log files in the same file group.
|
|
||||||
public static List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> groupLogsByBaseFile(Configuration conf, List<Path> partitionPaths) {
|
|
||||||
Set<Path> partitionSet = new HashSet<>(partitionPaths);
|
|
||||||
// TODO(vc): Should we handle also non-hoodie splits here?
|
|
||||||
Map<Path, HoodieTableMetaClient> partitionsToMetaClient = getTableMetaClientByPartitionPath(conf, partitionSet);
|
|
||||||
|
|
||||||
// Get all the base file and it's log files pairs in required partition paths.
|
|
||||||
List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> baseAndLogsList = new ArrayList<>();
|
|
||||||
partitionSet.forEach(partitionPath -> {
|
|
||||||
// for each partition path obtain the data & log file groupings, then map back to inputsplits
|
|
||||||
HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath);
|
|
||||||
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline());
|
|
||||||
String relPartitionPath = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), partitionPath);
|
|
||||||
|
|
||||||
try {
|
|
||||||
// Both commit and delta-commits are included - pick the latest completed one
|
|
||||||
Option<HoodieInstant> latestCompletedInstant =
|
|
||||||
metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants().lastInstant();
|
|
||||||
|
|
||||||
Stream<FileSlice> latestFileSlices = latestCompletedInstant
|
|
||||||
.map(instant -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, instant.getTimestamp()))
|
|
||||||
.orElse(Stream.empty());
|
|
||||||
|
|
||||||
latestFileSlices.forEach(fileSlice -> {
|
|
||||||
List<HoodieLogFile> logFilePaths = fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
|
|
||||||
baseAndLogsList.add(Pair.of(fileSlice.getBaseFile(), logFilePaths));
|
|
||||||
});
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new HoodieException("Error obtaining data file/log file grouping: " + partitionPath, e);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return baseAndLogsList;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add a field to the existing fields projected.
|
* Add a field to the existing fields projected.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -0,0 +1,94 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration
|
||||||
|
import org.apache.hadoop.fs.Path
|
||||||
|
import org.apache.hudi.HoodieBaseRelation.createBaseFileReader
|
||||||
|
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||||
|
import org.apache.spark.sql.{HoodieCatalystExpressionUtils, SQLContext}
|
||||||
|
import org.apache.spark.sql.catalyst.InternalRow
|
||||||
|
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||||
|
import org.apache.spark.sql.execution.datasources._
|
||||||
|
import org.apache.spark.sql.sources.{BaseRelation, Filter}
|
||||||
|
import org.apache.spark.sql.types.StructType
|
||||||
|
|
||||||
|
/**
|
||||||
|
* [[BaseRelation]] implementation only reading Base files of Hudi tables, essentially supporting following querying
|
||||||
|
* modes:
|
||||||
|
* <ul>
|
||||||
|
* <li>For COW tables: Snapshot</li>
|
||||||
|
* <li>For MOR tables: Read-optimized</li>
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* NOTE: The reason this Relation is used in liue of Spark's default [[HadoopFsRelation]] is primarily due to the
|
||||||
|
* fact that it injects real partition's path as the value of the partition field, which Hudi ultimately persists
|
||||||
|
* as part of the record payload. In some cases, however, partition path might not necessarily be equal to the
|
||||||
|
* verbatim value of the partition path field (when custom [[KeyGenerator]] is used) therefore leading to incorrect
|
||||||
|
* partition field values being written
|
||||||
|
*/
|
||||||
|
class BaseFileOnlyRelation(sqlContext: SQLContext,
|
||||||
|
metaClient: HoodieTableMetaClient,
|
||||||
|
optParams: Map[String, String],
|
||||||
|
userSchema: Option[StructType],
|
||||||
|
globPaths: Seq[Path])
|
||||||
|
extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) with SparkAdapterSupport {
|
||||||
|
|
||||||
|
override type FileSplit = HoodieBaseFileSplit
|
||||||
|
|
||||||
|
protected override def composeRDD(fileSplits: Seq[HoodieBaseFileSplit],
|
||||||
|
partitionSchema: StructType,
|
||||||
|
tableSchema: HoodieTableSchema,
|
||||||
|
requiredSchema: HoodieTableSchema,
|
||||||
|
filters: Array[Filter]): HoodieUnsafeRDD = {
|
||||||
|
val baseFileReader = createBaseFileReader(
|
||||||
|
spark = sparkSession,
|
||||||
|
partitionSchema = partitionSchema,
|
||||||
|
tableSchema = tableSchema,
|
||||||
|
requiredSchema = requiredSchema,
|
||||||
|
filters = filters,
|
||||||
|
options = optParams,
|
||||||
|
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
||||||
|
// to configure Parquet reader appropriately
|
||||||
|
hadoopConf = new Configuration(conf)
|
||||||
|
)
|
||||||
|
|
||||||
|
new HoodieFileScanRDD(sparkSession, baseFileReader, fileSplits)
|
||||||
|
}
|
||||||
|
|
||||||
|
protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[HoodieBaseFileSplit] = {
|
||||||
|
val partitions = listLatestBaseFiles(globPaths, partitionFilters, dataFilters)
|
||||||
|
val fileSplits = partitions.values.toSeq.flatMap { files =>
|
||||||
|
files.flatMap { file =>
|
||||||
|
// TODO move to adapter
|
||||||
|
// TODO fix, currently assuming parquet as underlying format
|
||||||
|
HoodieDataSourceHelper.splitFiles(
|
||||||
|
sparkSession = sparkSession,
|
||||||
|
file = file,
|
||||||
|
// TODO clarify why this is required
|
||||||
|
partitionValues = InternalRow.empty
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
val maxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes
|
||||||
|
|
||||||
|
sparkAdapter.getFilePartitions(sparkSession, fileSplits, maxSplitBytes).map(HoodieBaseFileSplit.apply)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,141 +0,0 @@
|
|||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.hudi
|
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration
|
|
||||||
import org.apache.hadoop.fs.Path
|
|
||||||
import org.apache.hudi.HoodieBaseRelation.createBaseFileReader
|
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
|
||||||
import org.apache.hudi.hadoop.HoodieROTablePathFilter
|
|
||||||
import org.apache.spark.rdd.RDD
|
|
||||||
import org.apache.spark.sql.{HoodieCatalystExpressionUtils, SQLContext}
|
|
||||||
import org.apache.spark.sql.catalyst.InternalRow
|
|
||||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
|
||||||
import org.apache.spark.sql.execution.datasources._
|
|
||||||
import org.apache.spark.sql.sources.{BaseRelation, Filter}
|
|
||||||
import org.apache.spark.sql.types.StructType
|
|
||||||
|
|
||||||
/**
|
|
||||||
* [[BaseRelation]] implementation only reading Base files of Hudi tables, essentially supporting following querying
|
|
||||||
* modes:
|
|
||||||
* <ul>
|
|
||||||
* <li>For COW tables: Snapshot</li>
|
|
||||||
* <li>For MOR tables: Read-optimized</li>
|
|
||||||
* </ul>
|
|
||||||
*
|
|
||||||
* NOTE: The reason this Relation is used in liue of Spark's default [[HadoopFsRelation]] is primarily due to the
|
|
||||||
* fact that it injects real partition's path as the value of the partition field, which Hudi ultimately persists
|
|
||||||
* as part of the record payload. In some cases, however, partition path might not necessarily be equal to the
|
|
||||||
* verbatim value of the partition path field (when custom [[KeyGenerator]] is used) therefore leading to incorrect
|
|
||||||
* partition field values being written
|
|
||||||
*/
|
|
||||||
class BaseFileOnlyViewRelation(sqlContext: SQLContext,
|
|
||||||
metaClient: HoodieTableMetaClient,
|
|
||||||
optParams: Map[String, String],
|
|
||||||
userSchema: Option[StructType],
|
|
||||||
globPaths: Seq[Path])
|
|
||||||
extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) with SparkAdapterSupport {
|
|
||||||
|
|
||||||
private val fileIndex = HoodieFileIndex(sparkSession, metaClient, userSchema, optParams,
|
|
||||||
FileStatusCache.getOrCreate(sqlContext.sparkSession))
|
|
||||||
|
|
||||||
override def doBuildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[InternalRow] = {
|
|
||||||
// NOTE: In case list of requested columns doesn't contain the Primary Key one, we
|
|
||||||
// have to add it explicitly so that
|
|
||||||
// - Merging could be performed correctly
|
|
||||||
// - In case 0 columns are to be fetched (for ex, when doing {@code count()} on Spark's [[Dataset]],
|
|
||||||
// Spark still fetches all the rows to execute the query correctly
|
|
||||||
//
|
|
||||||
// It's okay to return columns that have not been requested by the caller, as those nevertheless will be
|
|
||||||
// filtered out upstream
|
|
||||||
val fetchedColumns: Array[String] = appendMandatoryColumns(requiredColumns)
|
|
||||||
|
|
||||||
val (requiredAvroSchema, requiredStructSchema) =
|
|
||||||
HoodieSparkUtils.getRequiredSchema(tableAvroSchema, fetchedColumns)
|
|
||||||
|
|
||||||
val filterExpressions = convertToExpressions(filters)
|
|
||||||
val (partitionFilters, dataFilters) = HoodieCatalystExpressionUtils.splitPartitionAndDataPredicates(
|
|
||||||
sparkSession, filterExpressions, partitionColumns)
|
|
||||||
|
|
||||||
val filePartitions = getPartitions(partitionFilters, dataFilters)
|
|
||||||
|
|
||||||
val partitionSchema = StructType(Nil)
|
|
||||||
val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchema.toString)
|
|
||||||
val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString)
|
|
||||||
|
|
||||||
val baseFileReader = createBaseFileReader(
|
|
||||||
spark = sparkSession,
|
|
||||||
partitionSchema = partitionSchema,
|
|
||||||
tableSchema = tableSchema,
|
|
||||||
requiredSchema = requiredSchema,
|
|
||||||
filters = filters,
|
|
||||||
options = optParams,
|
|
||||||
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
|
||||||
// to configure Parquet reader appropriately
|
|
||||||
hadoopConf = new Configuration(conf)
|
|
||||||
)
|
|
||||||
|
|
||||||
new HoodieFileScanRDD(sparkSession, baseFileReader, filePartitions)
|
|
||||||
}
|
|
||||||
|
|
||||||
private def getPartitions(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[FilePartition] = {
|
|
||||||
val partitionDirectories = if (globPaths.isEmpty) {
|
|
||||||
val hoodieFileIndex = HoodieFileIndex(sparkSession, metaClient, userSchema, optParams,
|
|
||||||
FileStatusCache.getOrCreate(sqlContext.sparkSession))
|
|
||||||
hoodieFileIndex.listFiles(partitionFilters, dataFilters)
|
|
||||||
} else {
|
|
||||||
sqlContext.sparkContext.hadoopConfiguration.setClass(
|
|
||||||
"mapreduce.input.pathFilter.class",
|
|
||||||
classOf[HoodieROTablePathFilter],
|
|
||||||
classOf[org.apache.hadoop.fs.PathFilter])
|
|
||||||
|
|
||||||
val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sparkSession, globPaths)
|
|
||||||
inMemoryFileIndex.listFiles(partitionFilters, dataFilters)
|
|
||||||
}
|
|
||||||
|
|
||||||
val partitions = partitionDirectories.flatMap { partition =>
|
|
||||||
partition.files.flatMap { file =>
|
|
||||||
// TODO move to adapter
|
|
||||||
// TODO fix, currently assuming parquet as underlying format
|
|
||||||
HoodieDataSourceHelper.splitFiles(
|
|
||||||
sparkSession = sparkSession,
|
|
||||||
file = file,
|
|
||||||
// TODO clarify why this is required
|
|
||||||
partitionValues = InternalRow.empty
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
val maxSplitBytes = sparkSession.sessionState.conf.filesMaxPartitionBytes
|
|
||||||
|
|
||||||
sparkAdapter.getFilePartitions(sparkSession, partitions, maxSplitBytes)
|
|
||||||
}
|
|
||||||
|
|
||||||
private def convertToExpressions(filters: Array[Filter]): Array[Expression] = {
|
|
||||||
val catalystExpressions = HoodieSparkUtils.convertToCatalystExpressions(filters, tableStructSchema)
|
|
||||||
|
|
||||||
val failedExprs = catalystExpressions.zipWithIndex.filter { case (opt, _) => opt.isEmpty }
|
|
||||||
if (failedExprs.nonEmpty) {
|
|
||||||
val failedFilters = failedExprs.map(p => filters(p._2))
|
|
||||||
logWarning(s"Failed to convert Filters into Catalyst expressions (${failedFilters.map(_.toString)})")
|
|
||||||
}
|
|
||||||
|
|
||||||
catalystExpressions.filter(_.isDefined).map(_.get).toArray
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -107,7 +107,7 @@ class DefaultSource extends RelationProvider
|
|||||||
case (COPY_ON_WRITE, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) |
|
case (COPY_ON_WRITE, QUERY_TYPE_SNAPSHOT_OPT_VAL, false) |
|
||||||
(COPY_ON_WRITE, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) |
|
(COPY_ON_WRITE, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) |
|
||||||
(MERGE_ON_READ, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) =>
|
(MERGE_ON_READ, QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, false) =>
|
||||||
new BaseFileOnlyViewRelation(sqlContext, metaClient, parameters, userSchema, globPaths)
|
new BaseFileOnlyRelation(sqlContext, metaClient, parameters, userSchema, globPaths)
|
||||||
|
|
||||||
case (COPY_ON_WRITE, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) =>
|
case (COPY_ON_WRITE, QUERY_TYPE_INCREMENTAL_OPT_VAL, _) =>
|
||||||
new IncrementalRelation(sqlContext, parameters, userSchema, metaClient)
|
new IncrementalRelation(sqlContext, parameters, userSchema, metaClient)
|
||||||
|
|||||||
@@ -20,22 +20,28 @@ package org.apache.hudi
|
|||||||
import org.apache.avro.Schema
|
import org.apache.avro.Schema
|
||||||
import org.apache.avro.generic.GenericRecord
|
import org.apache.avro.generic.GenericRecord
|
||||||
import org.apache.hadoop.conf.Configuration
|
import org.apache.hadoop.conf.Configuration
|
||||||
import org.apache.hadoop.fs.Path
|
import org.apache.hadoop.fs.{FileStatus, Path, PathFilter}
|
||||||
import org.apache.hadoop.hbase.io.hfile.CacheConfig
|
import org.apache.hadoop.hbase.io.hfile.CacheConfig
|
||||||
import org.apache.hadoop.mapred.JobConf
|
import org.apache.hadoop.mapred.JobConf
|
||||||
import org.apache.hudi.HoodieBaseRelation.isMetadataTable
|
import org.apache.hudi.HoodieBaseRelation.{getPartitionPath, isMetadataTable}
|
||||||
|
import org.apache.hudi.HoodieConversionUtils.toScalaOption
|
||||||
import org.apache.hudi.common.config.SerializableConfiguration
|
import org.apache.hudi.common.config.SerializableConfiguration
|
||||||
import org.apache.hudi.common.fs.FSUtils
|
import org.apache.hudi.common.fs.FSUtils
|
||||||
import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord}
|
import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord}
|
||||||
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline}
|
||||||
|
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
||||||
|
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver}
|
||||||
import org.apache.hudi.common.util.StringUtils
|
import org.apache.hudi.common.util.StringUtils
|
||||||
|
import org.apache.hudi.hadoop.HoodieROTablePathFilter
|
||||||
import org.apache.hudi.io.storage.HoodieHFileReader
|
import org.apache.hudi.io.storage.HoodieHFileReader
|
||||||
import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadata}
|
import org.apache.hudi.metadata.{HoodieMetadataPayload, HoodieTableMetadata}
|
||||||
|
import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex
|
||||||
import org.apache.spark.internal.Logging
|
import org.apache.spark.internal.Logging
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.sql.avro.SchemaConverters
|
import org.apache.spark.sql.avro.SchemaConverters
|
||||||
import org.apache.spark.sql.catalyst.InternalRow
|
import org.apache.spark.sql.catalyst.InternalRow
|
||||||
import org.apache.spark.sql.execution.datasources.PartitionedFile
|
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression}
|
||||||
|
import org.apache.spark.sql.execution.datasources.{FileStatusCache, PartitionDirectory, PartitionedFile}
|
||||||
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils
|
import org.apache.spark.sql.hudi.HoodieSqlCommonUtils
|
||||||
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
|
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
|
||||||
import org.apache.spark.sql.types.StructType
|
import org.apache.spark.sql.types.StructType
|
||||||
@@ -44,6 +50,8 @@ import org.apache.spark.sql.{Row, SQLContext, SparkSession}
|
|||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.util.Try
|
import scala.util.Try
|
||||||
|
|
||||||
|
trait HoodieFileSplit {}
|
||||||
|
|
||||||
case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String)
|
case class HoodieTableSchema(structTypeSchema: StructType, avroSchemaStr: String)
|
||||||
|
|
||||||
case class HoodieTableState(recordKeyField: String,
|
case class HoodieTableState(recordKeyField: String,
|
||||||
@@ -53,36 +61,33 @@ case class HoodieTableState(recordKeyField: String,
|
|||||||
* Hoodie BaseRelation which extends [[PrunedFilteredScan]].
|
* Hoodie BaseRelation which extends [[PrunedFilteredScan]].
|
||||||
*/
|
*/
|
||||||
abstract class HoodieBaseRelation(val sqlContext: SQLContext,
|
abstract class HoodieBaseRelation(val sqlContext: SQLContext,
|
||||||
metaClient: HoodieTableMetaClient,
|
val metaClient: HoodieTableMetaClient,
|
||||||
optParams: Map[String, String],
|
val optParams: Map[String, String],
|
||||||
userSchema: Option[StructType])
|
userSchema: Option[StructType])
|
||||||
extends BaseRelation with PrunedFilteredScan with Logging {
|
extends BaseRelation with PrunedFilteredScan with Logging {
|
||||||
|
|
||||||
|
type FileSplit <: HoodieFileSplit
|
||||||
|
|
||||||
|
imbueConfigs(sqlContext)
|
||||||
|
|
||||||
protected val sparkSession: SparkSession = sqlContext.sparkSession
|
protected val sparkSession: SparkSession = sqlContext.sparkSession
|
||||||
|
|
||||||
protected lazy val conf: Configuration = new Configuration(sqlContext.sparkContext.hadoopConfiguration)
|
protected lazy val conf: Configuration = new Configuration(sqlContext.sparkContext.hadoopConfiguration)
|
||||||
protected lazy val jobConf = new JobConf(conf)
|
protected lazy val jobConf = new JobConf(conf)
|
||||||
|
|
||||||
|
protected lazy val tableConfig: HoodieTableConfig = metaClient.getTableConfig
|
||||||
|
|
||||||
|
protected lazy val basePath: String = metaClient.getBasePath
|
||||||
|
|
||||||
// If meta fields are enabled, always prefer key from the meta field as opposed to user-specified one
|
// If meta fields are enabled, always prefer key from the meta field as opposed to user-specified one
|
||||||
// NOTE: This is historical behavior which is preserved as is
|
// NOTE: This is historical behavior which is preserved as is
|
||||||
protected lazy val recordKeyField: String =
|
protected lazy val recordKeyField: String =
|
||||||
if (metaClient.getTableConfig.populateMetaFields()) HoodieRecord.RECORD_KEY_METADATA_FIELD
|
if (tableConfig.populateMetaFields()) HoodieRecord.RECORD_KEY_METADATA_FIELD
|
||||||
else metaClient.getTableConfig.getRecordKeyFieldProp
|
else tableConfig.getRecordKeyFieldProp
|
||||||
|
|
||||||
protected lazy val preCombineFieldOpt: Option[String] = getPrecombineFieldProperty
|
protected lazy val preCombineFieldOpt: Option[String] = getPrecombineFieldProperty
|
||||||
|
|
||||||
/**
|
protected lazy val specifiedQueryTimestamp: Option[String] =
|
||||||
* @VisibleInTests
|
|
||||||
*/
|
|
||||||
lazy val mandatoryColumns: Seq[String] = {
|
|
||||||
if (isMetadataTable(metaClient)) {
|
|
||||||
Seq(HoodieMetadataPayload.KEY_FIELD_NAME, HoodieMetadataPayload.SCHEMA_FIELD_NAME_TYPE)
|
|
||||||
} else {
|
|
||||||
Seq(recordKeyField) ++ preCombineFieldOpt.map(Seq(_)).getOrElse(Seq())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected lazy val specifiedQueryInstant: Option[String] =
|
|
||||||
optParams.get(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key)
|
optParams.get(DataSourceReadOptions.TIME_TRAVEL_AS_OF_INSTANT.key)
|
||||||
.map(HoodieSqlCommonUtils.formatQueryInstant)
|
.map(HoodieSqlCommonUtils.formatQueryInstant)
|
||||||
|
|
||||||
@@ -100,16 +105,40 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
|
|||||||
|
|
||||||
protected val tableStructSchema: StructType = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
|
protected val tableStructSchema: StructType = AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
|
||||||
|
|
||||||
protected val partitionColumns: Array[String] = metaClient.getTableConfig.getPartitionFields.orElse(Array.empty)
|
protected val partitionColumns: Array[String] = tableConfig.getPartitionFields.orElse(Array.empty)
|
||||||
|
|
||||||
protected def getPrecombineFieldProperty: Option[String] =
|
/**
|
||||||
Option(metaClient.getTableConfig.getPreCombineField)
|
* NOTE: PLEASE READ THIS CAREFULLY
|
||||||
.orElse(optParams.get(DataSourceWriteOptions.PRECOMBINE_FIELD.key)) match {
|
*
|
||||||
// NOTE: This is required to compensate for cases when empty string is used to stub
|
* Even though [[HoodieFileIndex]] initializes eagerly listing all of the files w/in the given Hudi table,
|
||||||
// property value to avoid it being set with the default value
|
* this variable itself is _lazy_ (and have to stay that way) which guarantees that it's not initialized, until
|
||||||
// TODO(HUDI-3456) cleanup
|
* it's actually accessed
|
||||||
case Some(f) if !StringUtils.isNullOrEmpty(f) => Some(f)
|
*/
|
||||||
case _ => None
|
protected lazy val fileIndex: HoodieFileIndex =
|
||||||
|
HoodieFileIndex(sparkSession, metaClient, Some(tableStructSchema), optParams,
|
||||||
|
FileStatusCache.getOrCreate(sparkSession))
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @VisibleInTests
|
||||||
|
*/
|
||||||
|
lazy val mandatoryColumns: Seq[String] = {
|
||||||
|
if (isMetadataTable(metaClient)) {
|
||||||
|
Seq(HoodieMetadataPayload.KEY_FIELD_NAME, HoodieMetadataPayload.SCHEMA_FIELD_NAME_TYPE)
|
||||||
|
} else {
|
||||||
|
// TODO this is MOR table requirement, not necessary for COW
|
||||||
|
Seq(recordKeyField) ++ preCombineFieldOpt.map(Seq(_)).getOrElse(Seq())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected def timeline: HoodieTimeline =
|
||||||
|
// NOTE: We're including compaction here since it's not considering a "commit" operation
|
||||||
|
metaClient.getCommitsAndCompactionTimeline.filterCompletedInstants
|
||||||
|
|
||||||
|
protected def latestInstant: Option[HoodieInstant] =
|
||||||
|
toScalaOption(timeline.lastInstant())
|
||||||
|
|
||||||
|
protected def queryTimestamp: Option[String] = {
|
||||||
|
specifiedQueryTimestamp.orElse(toScalaOption(timeline.lastInstant()).map(i => i.getTimestamp))
|
||||||
}
|
}
|
||||||
|
|
||||||
override def schema: StructType = tableStructSchema
|
override def schema: StructType = tableStructSchema
|
||||||
@@ -130,22 +159,129 @@ abstract class HoodieBaseRelation(val sqlContext: SQLContext,
|
|||||||
* NOTE: DO NOT OVERRIDE THIS METHOD
|
* NOTE: DO NOT OVERRIDE THIS METHOD
|
||||||
*/
|
*/
|
||||||
override final def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
|
override final def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
|
||||||
|
// NOTE: In case list of requested columns doesn't contain the Primary Key one, we
|
||||||
|
// have to add it explicitly so that
|
||||||
|
// - Merging could be performed correctly
|
||||||
|
// - In case 0 columns are to be fetched (for ex, when doing {@code count()} on Spark's [[Dataset]],
|
||||||
|
// Spark still fetches all the rows to execute the query correctly
|
||||||
|
//
|
||||||
|
// It's okay to return columns that have not been requested by the caller, as those nevertheless will be
|
||||||
|
// filtered out upstream
|
||||||
|
val fetchedColumns: Array[String] = appendMandatoryColumns(requiredColumns)
|
||||||
|
|
||||||
|
val (requiredAvroSchema, requiredStructSchema) =
|
||||||
|
HoodieSparkUtils.getRequiredSchema(tableAvroSchema, fetchedColumns)
|
||||||
|
|
||||||
|
val filterExpressions = convertToExpressions(filters)
|
||||||
|
val (partitionFilters, dataFilters) = filterExpressions.partition(isPartitionPredicate)
|
||||||
|
|
||||||
|
val fileSplits = collectFileSplits(partitionFilters, dataFilters)
|
||||||
|
|
||||||
|
val partitionSchema = StructType(Nil)
|
||||||
|
val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchema.toString)
|
||||||
|
val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString)
|
||||||
|
|
||||||
// Here we rely on a type erasure, to workaround inherited API restriction and pass [[RDD[InternalRow]]] back as [[RDD[Row]]]
|
// Here we rely on a type erasure, to workaround inherited API restriction and pass [[RDD[InternalRow]]] back as [[RDD[Row]]]
|
||||||
// Please check [[needConversion]] scala-doc for more details
|
// Please check [[needConversion]] scala-doc for more details
|
||||||
doBuildScan(requiredColumns, filters).asInstanceOf[RDD[Row]]
|
if (fileSplits.nonEmpty)
|
||||||
|
composeRDD(fileSplits, partitionSchema, tableSchema, requiredSchema, filters).asInstanceOf[RDD[Row]]
|
||||||
|
else
|
||||||
|
sparkSession.sparkContext.emptyRDD
|
||||||
}
|
}
|
||||||
|
|
||||||
protected def doBuildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[InternalRow]
|
/**
|
||||||
|
* Composes RDD provided file splits to read from, table and partition schemas, data filters to be applied
|
||||||
|
*
|
||||||
|
* @param fileSplits file splits to be handled by the RDD
|
||||||
|
* @param partitionSchema target table's partition schema
|
||||||
|
* @param tableSchema target table's schema
|
||||||
|
* @param requiredSchema projected schema required by the reader
|
||||||
|
* @param filters data filters to be applied
|
||||||
|
* @return instance of RDD (implementing [[HoodieUnsafeRDD]])
|
||||||
|
*/
|
||||||
|
protected def composeRDD(fileSplits: Seq[FileSplit],
|
||||||
|
partitionSchema: StructType,
|
||||||
|
tableSchema: HoodieTableSchema,
|
||||||
|
requiredSchema: HoodieTableSchema,
|
||||||
|
filters: Array[Filter]): HoodieUnsafeRDD
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Provided with partition and date filters collects target file splits to read records from, while
|
||||||
|
* performing pruning if necessary
|
||||||
|
*
|
||||||
|
* @param partitionFilters partition filters to be applied
|
||||||
|
* @param dataFilters data filters to be applied
|
||||||
|
* @return list of [[FileSplit]] to fetch records from
|
||||||
|
*/
|
||||||
|
protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[FileSplit]
|
||||||
|
|
||||||
|
protected def listLatestBaseFiles(globbedPaths: Seq[Path], partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Map[Path, Seq[FileStatus]] = {
|
||||||
|
val partitionDirs = if (globbedPaths.isEmpty) {
|
||||||
|
fileIndex.listFiles(partitionFilters, dataFilters)
|
||||||
|
} else {
|
||||||
|
val inMemoryFileIndex = HoodieInMemoryFileIndex.create(sparkSession, globbedPaths)
|
||||||
|
inMemoryFileIndex.listFiles(partitionFilters, dataFilters)
|
||||||
|
}
|
||||||
|
|
||||||
|
val fsView = new HoodieTableFileSystemView(metaClient, timeline, partitionDirs.flatMap(_.files).toArray)
|
||||||
|
val latestBaseFiles = fsView.getLatestBaseFiles.iterator().asScala.toList.map(_.getFileStatus)
|
||||||
|
|
||||||
|
latestBaseFiles.groupBy(getPartitionPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
protected def convertToExpressions(filters: Array[Filter]): Array[Expression] = {
|
||||||
|
val catalystExpressions = HoodieSparkUtils.convertToCatalystExpressions(filters, tableStructSchema)
|
||||||
|
|
||||||
|
val failedExprs = catalystExpressions.zipWithIndex.filter { case (opt, _) => opt.isEmpty }
|
||||||
|
if (failedExprs.nonEmpty) {
|
||||||
|
val failedFilters = failedExprs.map(p => filters(p._2))
|
||||||
|
logWarning(s"Failed to convert Filters into Catalyst expressions (${failedFilters.map(_.toString)})")
|
||||||
|
}
|
||||||
|
|
||||||
|
catalystExpressions.filter(_.isDefined).map(_.get).toArray
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks whether given expression only references partition columns
|
||||||
|
* (and involves no sub-query)
|
||||||
|
*/
|
||||||
|
protected def isPartitionPredicate(condition: Expression): Boolean = {
|
||||||
|
// Validates that the provided names both resolve to the same entity
|
||||||
|
val resolvedNameEquals = sparkSession.sessionState.analyzer.resolver
|
||||||
|
|
||||||
|
condition.references.forall { r => partitionColumns.exists(resolvedNameEquals(r.name, _)) } &&
|
||||||
|
!SubqueryExpression.hasSubquery(condition)
|
||||||
|
}
|
||||||
|
|
||||||
protected final def appendMandatoryColumns(requestedColumns: Array[String]): Array[String] = {
|
protected final def appendMandatoryColumns(requestedColumns: Array[String]): Array[String] = {
|
||||||
val missing = mandatoryColumns.filter(col => !requestedColumns.contains(col))
|
val missing = mandatoryColumns.filter(col => !requestedColumns.contains(col))
|
||||||
requestedColumns ++ missing
|
requestedColumns ++ missing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private def getPrecombineFieldProperty: Option[String] =
|
||||||
|
Option(tableConfig.getPreCombineField)
|
||||||
|
.orElse(optParams.get(DataSourceWriteOptions.PRECOMBINE_FIELD.key)) match {
|
||||||
|
// NOTE: This is required to compensate for cases when empty string is used to stub
|
||||||
|
// property value to avoid it being set with the default value
|
||||||
|
// TODO(HUDI-3456) cleanup
|
||||||
|
case Some(f) if !StringUtils.isNullOrEmpty(f) => Some(f)
|
||||||
|
case _ => None
|
||||||
|
}
|
||||||
|
|
||||||
|
private def imbueConfigs(sqlContext: SQLContext): Unit = {
|
||||||
|
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.filterPushdown", "true")
|
||||||
|
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.recordLevelFilter.enabled", "true")
|
||||||
|
// TODO(HUDI-3639) vectorized reader has to be disabled to make sure MORIncrementalRelation is working properly
|
||||||
|
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
object HoodieBaseRelation {
|
object HoodieBaseRelation {
|
||||||
|
|
||||||
def isMetadataTable(metaClient: HoodieTableMetaClient) =
|
def getPartitionPath(fileStatus: FileStatus): Path =
|
||||||
|
fileStatus.getPath.getParent
|
||||||
|
|
||||||
|
def isMetadataTable(metaClient: HoodieTableMetaClient): Boolean =
|
||||||
HoodieTableMetadata.isMetadataTable(metaClient.getBasePath)
|
HoodieTableMetadata.isMetadataTable(metaClient.getBasePath)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ import org.apache.hudi.common.model.HoodieBaseFile
|
|||||||
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
|
||||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
||||||
import org.apache.hudi.exception.HoodieException
|
import org.apache.hudi.exception.HoodieException
|
||||||
|
import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex
|
||||||
import org.apache.spark.internal.Logging
|
import org.apache.spark.internal.Logging
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.rdd.RDD
|
||||||
import org.apache.spark.sql.catalyst.InternalRow
|
import org.apache.spark.sql.catalyst.InternalRow
|
||||||
@@ -157,7 +158,7 @@ class HoodieBootstrapRelation(@transient val _sqlContext: SQLContext,
|
|||||||
logInfo("Building file index..")
|
logInfo("Building file index..")
|
||||||
val fileStatuses = if (globPaths.nonEmpty) {
|
val fileStatuses = if (globPaths.nonEmpty) {
|
||||||
// Load files from the global paths if it has defined to be compatible with the original mode
|
// Load files from the global paths if it has defined to be compatible with the original mode
|
||||||
val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(_sqlContext.sparkSession, globPaths)
|
val inMemoryFileIndex = HoodieInMemoryFileIndex.create(_sqlContext.sparkSession, globPaths)
|
||||||
inMemoryFileIndex.allFiles()
|
inMemoryFileIndex.allFiles()
|
||||||
} else { // Load files by the HoodieFileIndex.
|
} else { // Load files by the HoodieFileIndex.
|
||||||
HoodieFileIndex(sqlContext.sparkSession, metaClient, Some(schema), optParams,
|
HoodieFileIndex(sqlContext.sparkSession, metaClient, Some(schema), optParams,
|
||||||
|
|||||||
@@ -65,20 +65,6 @@ object HoodieDataSourceHelper extends PredicateHelper {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Extract the required schema from [[InternalRow]]
|
|
||||||
*/
|
|
||||||
def extractRequiredSchema(
|
|
||||||
iter: Iterator[InternalRow],
|
|
||||||
requiredSchema: StructType,
|
|
||||||
requiredFieldPos: Seq[Int]): Iterator[InternalRow] = {
|
|
||||||
val unsafeProjection = UnsafeProjection.create(requiredSchema)
|
|
||||||
val rows = iter.map { row =>
|
|
||||||
unsafeProjection(createInternalRowWithSchema(row, requiredSchema, requiredFieldPos))
|
|
||||||
}
|
|
||||||
rows
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert [[InternalRow]] to [[SpecificInternalRow]].
|
* Convert [[InternalRow]] to [[SpecificInternalRow]].
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -348,7 +348,7 @@ object HoodieFileIndex extends Logging {
|
|||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
case NonFatal(e) =>
|
case NonFatal(e) =>
|
||||||
logWarning("Fail to convert filters for TimestampBaseAvroKeyGenerator.")
|
logWarning("Fail to convert filters for TimestampBaseAvroKeyGenerator", e)
|
||||||
partitionFilters
|
partitionFilters
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -24,12 +24,14 @@ import org.apache.spark.sql.execution.QueryExecutionException
|
|||||||
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile, SchemaColumnConvertNotSupportedException}
|
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile, SchemaColumnConvertNotSupportedException}
|
||||||
import org.apache.spark.{Partition, TaskContext}
|
import org.apache.spark.{Partition, TaskContext}
|
||||||
|
|
||||||
|
case class HoodieBaseFileSplit(filePartition: FilePartition) extends HoodieFileSplit
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TODO eval if we actually need it
|
* TODO eval if we actually need it
|
||||||
*/
|
*/
|
||||||
class HoodieFileScanRDD(@transient private val sparkSession: SparkSession,
|
class HoodieFileScanRDD(@transient private val sparkSession: SparkSession,
|
||||||
readFunction: PartitionedFile => Iterator[InternalRow],
|
readFunction: PartitionedFile => Iterator[InternalRow],
|
||||||
@transient fileSplits: Seq[FilePartition])
|
@transient fileSplits: Seq[HoodieBaseFileSplit])
|
||||||
extends HoodieUnsafeRDD(sparkSession.sparkContext) {
|
extends HoodieUnsafeRDD(sparkSession.sparkContext) {
|
||||||
|
|
||||||
override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = {
|
override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = {
|
||||||
@@ -77,5 +79,5 @@ class HoodieFileScanRDD(@transient private val sparkSession: SparkSession,
|
|||||||
iterator.asInstanceOf[Iterator[InternalRow]]
|
iterator.asInstanceOf[Iterator[InternalRow]]
|
||||||
}
|
}
|
||||||
|
|
||||||
override protected def getPartitions: Array[Partition] = fileSplits.toArray
|
override protected def getPartitions: Array[Partition] = fileSplits.map(_.filePartition).toArray
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ class HoodieMergeOnReadRDD(@transient sc: SparkContext,
|
|||||||
tableState: HoodieTableState,
|
tableState: HoodieTableState,
|
||||||
tableSchema: HoodieTableSchema,
|
tableSchema: HoodieTableSchema,
|
||||||
requiredSchema: HoodieTableSchema,
|
requiredSchema: HoodieTableSchema,
|
||||||
@transient fileSplits: List[HoodieMergeOnReadFileSplit])
|
@transient fileSplits: Seq[HoodieMergeOnReadFileSplit])
|
||||||
extends HoodieUnsafeRDD(sc) {
|
extends HoodieUnsafeRDD(sc) {
|
||||||
|
|
||||||
private val confBroadcast = sc.broadcast(new SerializableWritable(config))
|
private val confBroadcast = sc.broadcast(new SerializableWritable(config))
|
||||||
|
|||||||
@@ -20,96 +20,45 @@ package org.apache.hudi
|
|||||||
import org.apache.hadoop.conf.Configuration
|
import org.apache.hadoop.conf.Configuration
|
||||||
import org.apache.hadoop.fs.{GlobPattern, Path}
|
import org.apache.hadoop.fs.{GlobPattern, Path}
|
||||||
import org.apache.hudi.HoodieBaseRelation.createBaseFileReader
|
import org.apache.hudi.HoodieBaseRelation.createBaseFileReader
|
||||||
import org.apache.hudi.common.model.HoodieRecord
|
import org.apache.hudi.HoodieConversionUtils.toScalaOption
|
||||||
|
import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath
|
||||||
|
import org.apache.hudi.common.model.{FileSlice, HoodieRecord}
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||||
|
import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline}
|
||||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
||||||
|
import org.apache.hudi.common.util.StringUtils
|
||||||
import org.apache.hudi.exception.HoodieException
|
import org.apache.hudi.exception.HoodieException
|
||||||
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.{getCommitMetadata, getWritePartitionPaths, listAffectedFilesForCommits}
|
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.{getCommitMetadata, getWritePartitionPaths, listAffectedFilesForCommits}
|
||||||
import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes
|
import org.apache.spark.sql.SQLContext
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||||
import org.apache.spark.sql.{Row, SQLContext}
|
|
||||||
import org.apache.spark.sql.catalyst.InternalRow
|
|
||||||
import org.apache.spark.sql.execution.datasources.PartitionedFile
|
|
||||||
import org.apache.spark.sql.sources._
|
import org.apache.spark.sql.sources._
|
||||||
import org.apache.spark.sql.types.StructType
|
import org.apache.spark.sql.types.StructType
|
||||||
|
|
||||||
import scala.collection.JavaConversions._
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.collection.immutable
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Experimental.
|
* @Experimental
|
||||||
* Relation, that implements the Hoodie incremental view for Merge On Read table.
|
|
||||||
*
|
|
||||||
*/
|
*/
|
||||||
class MergeOnReadIncrementalRelation(sqlContext: SQLContext,
|
class MergeOnReadIncrementalRelation(sqlContext: SQLContext,
|
||||||
val optParams: Map[String, String],
|
optParams: Map[String, String],
|
||||||
val userSchema: Option[StructType],
|
userSchema: Option[StructType],
|
||||||
val metaClient: HoodieTableMetaClient)
|
metaClient: HoodieTableMetaClient)
|
||||||
extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) {
|
extends MergeOnReadSnapshotRelation(sqlContext, optParams, userSchema, Seq(), metaClient) with HoodieIncrementalRelationTrait {
|
||||||
|
|
||||||
private val commitTimeline = metaClient.getCommitsAndCompactionTimeline.filterCompletedInstants()
|
override type FileSplit = HoodieMergeOnReadFileSplit
|
||||||
if (commitTimeline.empty()) {
|
|
||||||
throw new HoodieException("No instants to incrementally pull")
|
override protected def timeline: HoodieTimeline = {
|
||||||
}
|
val startTimestamp = optParams(DataSourceReadOptions.BEGIN_INSTANTTIME.key)
|
||||||
if (!optParams.contains(DataSourceReadOptions.BEGIN_INSTANTTIME.key)) {
|
val endTimestamp = optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME.key, super.timeline.lastInstant().get.getTimestamp)
|
||||||
throw new HoodieException(s"Specify the begin instant time to pull from using " +
|
super.timeline.findInstantsInRange(startTimestamp, endTimestamp)
|
||||||
s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME.key}")
|
|
||||||
}
|
|
||||||
if (!metaClient.getTableConfig.populateMetaFields()) {
|
|
||||||
throw new HoodieException("Incremental queries are not supported when meta fields are disabled")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private val lastInstant = commitTimeline.lastInstant().get()
|
protected override def composeRDD(fileSplits: Seq[HoodieMergeOnReadFileSplit],
|
||||||
private val mergeType = optParams.getOrElse(
|
partitionSchema: StructType,
|
||||||
DataSourceReadOptions.REALTIME_MERGE.key,
|
tableSchema: HoodieTableSchema,
|
||||||
DataSourceReadOptions.REALTIME_MERGE.defaultValue)
|
requiredSchema: HoodieTableSchema,
|
||||||
|
filters: Array[Filter]): HoodieMergeOnReadRDD = {
|
||||||
private val commitsTimelineToReturn = commitTimeline.findInstantsInRange(
|
|
||||||
optParams(DataSourceReadOptions.BEGIN_INSTANTTIME.key),
|
|
||||||
optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME.key, lastInstant.getTimestamp))
|
|
||||||
logDebug(s"${commitsTimelineToReturn.getInstants.iterator().toList.map(f => f.toString).mkString(",")}")
|
|
||||||
private val commitsToReturn = commitsTimelineToReturn.getInstants.iterator().toList
|
|
||||||
|
|
||||||
private val maxCompactionMemoryInBytes = getMaxCompactionMemoryInBytes(jobConf)
|
|
||||||
|
|
||||||
private val fileIndex = if (commitsToReturn.isEmpty) List() else buildFileIndex()
|
|
||||||
|
|
||||||
// Record filters making sure that only records w/in the requested bounds are being fetched as part of the
|
|
||||||
// scan collected by this relation
|
|
||||||
private lazy val incrementalSpanRecordsFilters: Seq[Filter] = {
|
|
||||||
val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD)
|
|
||||||
val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp)
|
|
||||||
val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)
|
|
||||||
Seq(isNotNullFilter, largerThanFilter, lessThanFilter)
|
|
||||||
}
|
|
||||||
|
|
||||||
override lazy val mandatoryColumns: Seq[String] = {
|
|
||||||
// NOTE: This columns are required for Incremental flow to be able to handle the rows properly, even in
|
|
||||||
// cases when no columns are requested to be fetched (for ex, when using {@code count()} API)
|
|
||||||
Seq(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD) ++
|
|
||||||
preCombineFieldOpt.map(Seq(_)).getOrElse(Seq())
|
|
||||||
}
|
|
||||||
|
|
||||||
override def doBuildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[InternalRow] = {
|
|
||||||
if (fileIndex.isEmpty) {
|
|
||||||
sqlContext.sparkContext.emptyRDD[InternalRow]
|
|
||||||
} else {
|
|
||||||
logDebug(s"buildScan requiredColumns = ${requiredColumns.mkString(",")}")
|
|
||||||
logDebug(s"buildScan filters = ${filters.mkString(",")}")
|
|
||||||
|
|
||||||
// config to ensure the push down filter for parquet will be applied.
|
|
||||||
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.filterPushdown", "true")
|
|
||||||
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.recordLevelFilter.enabled", "true")
|
|
||||||
sqlContext.sparkSession.sessionState.conf.setConfString("spark.sql.parquet.enableVectorizedReader", "false")
|
|
||||||
|
|
||||||
val fetchedColumns: Array[String] = appendMandatoryColumns(requiredColumns)
|
|
||||||
|
|
||||||
val (requiredAvroSchema, requiredStructSchema) =
|
|
||||||
HoodieSparkUtils.getRequiredSchema(tableAvroSchema, fetchedColumns)
|
|
||||||
|
|
||||||
val partitionSchema = StructType(Nil)
|
|
||||||
val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchema.toString)
|
|
||||||
val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString)
|
|
||||||
|
|
||||||
val fullSchemaParquetReader = createBaseFileReader(
|
val fullSchemaParquetReader = createBaseFileReader(
|
||||||
spark = sqlContext.sparkSession,
|
spark = sqlContext.sparkSession,
|
||||||
partitionSchema = partitionSchema,
|
partitionSchema = partitionSchema,
|
||||||
@@ -122,18 +71,19 @@ class MergeOnReadIncrementalRelation(sqlContext: SQLContext,
|
|||||||
//
|
//
|
||||||
// The only filtering applicable here is the filtering to make sure we're only fetching records that
|
// The only filtering applicable here is the filtering to make sure we're only fetching records that
|
||||||
// fall into incremental span of the timeline being queried
|
// fall into incremental span of the timeline being queried
|
||||||
filters = incrementalSpanRecordsFilters,
|
filters = incrementalSpanRecordFilters,
|
||||||
options = optParams,
|
options = optParams,
|
||||||
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
||||||
// to configure Parquet reader appropriately
|
// to configure Parquet reader appropriately
|
||||||
hadoopConf = new Configuration(conf)
|
hadoopConf = new Configuration(conf)
|
||||||
)
|
)
|
||||||
|
|
||||||
val requiredSchemaParquetReader = createBaseFileReader(
|
val requiredSchemaParquetReader = createBaseFileReader(
|
||||||
spark = sqlContext.sparkSession,
|
spark = sqlContext.sparkSession,
|
||||||
partitionSchema = partitionSchema,
|
partitionSchema = partitionSchema,
|
||||||
tableSchema = tableSchema,
|
tableSchema = tableSchema,
|
||||||
requiredSchema = requiredSchema,
|
requiredSchema = requiredSchema,
|
||||||
filters = filters ++ incrementalSpanRecordsFilters,
|
filters = filters ++ incrementalSpanRecordFilters,
|
||||||
options = optParams,
|
options = optParams,
|
||||||
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
// NOTE: We have to fork the Hadoop Config here as Spark will be modifying it
|
||||||
// to configure Parquet reader appropriately
|
// to configure Parquet reader appropriately
|
||||||
@@ -142,79 +92,89 @@ class MergeOnReadIncrementalRelation(sqlContext: SQLContext,
|
|||||||
|
|
||||||
val hoodieTableState = HoodieTableState(HoodieRecord.RECORD_KEY_METADATA_FIELD, preCombineFieldOpt)
|
val hoodieTableState = HoodieTableState(HoodieRecord.RECORD_KEY_METADATA_FIELD, preCombineFieldOpt)
|
||||||
|
|
||||||
// TODO implement incremental span record filtering w/in RDD to make sure returned iterator is appropriately
|
// TODO(HUDI-3639) implement incremental span record filtering w/in RDD to make sure returned iterator is appropriately
|
||||||
// filtered, since file-reader might not be capable to perform filtering
|
// filtered, since file-reader might not be capable to perform filtering
|
||||||
new HoodieMergeOnReadRDD(
|
new HoodieMergeOnReadRDD(sqlContext.sparkContext, jobConf, fullSchemaParquetReader,
|
||||||
sqlContext.sparkContext,
|
requiredSchemaParquetReader, hoodieTableState, tableSchema, requiredSchema, fileSplits)
|
||||||
jobConf,
|
}
|
||||||
fullSchemaParquetReader,
|
|
||||||
requiredSchemaParquetReader,
|
override protected def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = {
|
||||||
hoodieTableState,
|
if (includedCommits.isEmpty) {
|
||||||
tableSchema,
|
List()
|
||||||
requiredSchema,
|
} else {
|
||||||
fileIndex
|
val latestCommit = includedCommits.last.getTimestamp
|
||||||
)
|
val commitsMetadata = includedCommits.map(getCommitMetadata(_, timeline)).asJava
|
||||||
|
|
||||||
|
val modifiedFiles = listAffectedFilesForCommits(conf, new Path(metaClient.getBasePath), commitsMetadata)
|
||||||
|
val fsView = new HoodieTableFileSystemView(metaClient, timeline, modifiedFiles)
|
||||||
|
|
||||||
|
val modifiedPartitions = getWritePartitionPaths(commitsMetadata)
|
||||||
|
|
||||||
|
val fileSlices = modifiedPartitions.asScala.flatMap { relativePartitionPath =>
|
||||||
|
fsView.getLatestMergedFileSlicesBeforeOrOn(relativePartitionPath, latestCommit).iterator().asScala
|
||||||
|
}.toSeq
|
||||||
|
|
||||||
|
buildSplits(filterFileSlices(fileSlices, globPattern))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
def buildFileIndex(): List[HoodieMergeOnReadFileSplit] = {
|
private def filterFileSlices(fileSlices: Seq[FileSlice], pathGlobPattern: String): Seq[FileSlice] = {
|
||||||
val metadataList = commitsToReturn.map(instant => getCommitMetadata(instant, commitsTimelineToReturn))
|
val filteredFileSlices = if (!StringUtils.isNullOrEmpty(pathGlobPattern)) {
|
||||||
val affectedFileStatus = listAffectedFilesForCommits(conf, new Path(metaClient.getBasePath), metadataList)
|
|
||||||
val fsView = new HoodieTableFileSystemView(metaClient, commitsTimelineToReturn, affectedFileStatus)
|
|
||||||
|
|
||||||
// Iterate partitions to create splits
|
|
||||||
val fileGroups = getWritePartitionPaths(metadataList).flatMap(partitionPath =>
|
|
||||||
fsView.getAllFileGroups(partitionPath).iterator()
|
|
||||||
).toList
|
|
||||||
val latestCommit = fsView.getLastInstant.get.getTimestamp
|
|
||||||
if (log.isDebugEnabled) {
|
|
||||||
fileGroups.foreach(f => logDebug(s"current file group id: " +
|
|
||||||
s"${f.getFileGroupId} and file slices ${f.getLatestFileSlice.get.toString}"))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Filter files based on user defined glob pattern
|
|
||||||
val pathGlobPattern = optParams.getOrElse(
|
|
||||||
DataSourceReadOptions.INCR_PATH_GLOB.key,
|
|
||||||
DataSourceReadOptions.INCR_PATH_GLOB.defaultValue)
|
|
||||||
val filteredFileGroup = if (!pathGlobPattern.equals(DataSourceReadOptions.INCR_PATH_GLOB.defaultValue)) {
|
|
||||||
val globMatcher = new GlobPattern("*" + pathGlobPattern)
|
val globMatcher = new GlobPattern("*" + pathGlobPattern)
|
||||||
fileGroups.filter(fg => {
|
fileSlices.filter(fileSlice => {
|
||||||
val latestFileSlice = fg.getLatestFileSlice.get
|
val path = toScalaOption(fileSlice.getBaseFile).map(_.getPath)
|
||||||
if (latestFileSlice.getBaseFile.isPresent) {
|
.orElse(toScalaOption(fileSlice.getLatestLogFile).map(_.getPath.toString))
|
||||||
globMatcher.matches(latestFileSlice.getBaseFile.get.getPath)
|
.get
|
||||||
} else {
|
globMatcher.matches(path)
|
||||||
globMatcher.matches(latestFileSlice.getLatestLogFile.get.getPath.toString)
|
|
||||||
}
|
|
||||||
})
|
})
|
||||||
} else {
|
} else {
|
||||||
fileGroups
|
fileSlices
|
||||||
}
|
}
|
||||||
|
filteredFileSlices
|
||||||
// Build HoodieMergeOnReadFileSplit.
|
|
||||||
filteredFileGroup.map(f => {
|
|
||||||
// Ensure get the base file when there is a pending compaction, which means the base file
|
|
||||||
// won't be in the latest file slice.
|
|
||||||
val baseFiles = f.getAllFileSlices.iterator().filter(slice => slice.getBaseFile.isPresent).toList
|
|
||||||
val partitionedFile = if (baseFiles.nonEmpty) {
|
|
||||||
val baseFile = baseFiles.head.getBaseFile
|
|
||||||
val filePath = MergeOnReadSnapshotRelation.getFilePath(baseFile.get.getFileStatus.getPath)
|
|
||||||
Option(PartitionedFile(InternalRow.empty, filePath, 0, baseFile.get.getFileLen))
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
Option.empty
|
|
||||||
}
|
|
||||||
|
|
||||||
val logPath = if (f.getLatestFileSlice.isPresent) {
|
|
||||||
// If log path doesn't exist, we still include an empty path to avoid using
|
|
||||||
// the default parquet reader to ensure the push down filter will be applied.
|
|
||||||
Option(f.getLatestFileSlice.get().getLogFiles.iterator().toList)
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
Option.empty
|
|
||||||
}
|
|
||||||
|
|
||||||
HoodieMergeOnReadFileSplit(partitionedFile, logPath,
|
|
||||||
latestCommit, metaClient.getBasePath, maxCompactionMemoryInBytes, mergeType)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
trait HoodieIncrementalRelationTrait extends HoodieBaseRelation {
|
||||||
|
|
||||||
|
// Validate this Incremental implementation is properly configured
|
||||||
|
validate()
|
||||||
|
|
||||||
|
protected lazy val includedCommits: immutable.Seq[HoodieInstant] = timeline.getInstants.iterator().asScala.toList
|
||||||
|
|
||||||
|
// Record filters making sure that only records w/in the requested bounds are being fetched as part of the
|
||||||
|
// scan collected by this relation
|
||||||
|
protected lazy val incrementalSpanRecordFilters: Seq[Filter] = {
|
||||||
|
val isNotNullFilter = IsNotNull(HoodieRecord.COMMIT_TIME_METADATA_FIELD)
|
||||||
|
val largerThanFilter = GreaterThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, includedCommits.head.getTimestamp)
|
||||||
|
val lessThanFilter = LessThanOrEqual(HoodieRecord.COMMIT_TIME_METADATA_FIELD, includedCommits.last.getTimestamp)
|
||||||
|
|
||||||
|
Seq(isNotNullFilter, largerThanFilter, lessThanFilter)
|
||||||
|
}
|
||||||
|
|
||||||
|
override lazy val mandatoryColumns: Seq[String] = {
|
||||||
|
// NOTE: This columns are required for Incremental flow to be able to handle the rows properly, even in
|
||||||
|
// cases when no columns are requested to be fetched (for ex, when using {@code count()} API)
|
||||||
|
Seq(HoodieRecord.RECORD_KEY_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD) ++
|
||||||
|
preCombineFieldOpt.map(Seq(_)).getOrElse(Seq())
|
||||||
|
}
|
||||||
|
|
||||||
|
protected def validate(): Unit = {
|
||||||
|
if (super.timeline.empty()) {
|
||||||
|
throw new HoodieException("No instants to incrementally pull")
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!this.optParams.contains(DataSourceReadOptions.BEGIN_INSTANTTIME.key)) {
|
||||||
|
throw new HoodieException(s"Specify the begin instant time to pull from using " +
|
||||||
|
s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME.key}")
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!this.tableConfig.populateMetaFields()) {
|
||||||
|
throw new HoodieException("Incremental queries are not supported when meta fields are disabled")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected def globPattern: String =
|
||||||
|
optParams.getOrElse(DataSourceReadOptions.INCR_PATH_GLOB.key, DataSourceReadOptions.INCR_PATH_GLOB.defaultValue)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -21,16 +21,18 @@ package org.apache.hudi
|
|||||||
import org.apache.hadoop.conf.Configuration
|
import org.apache.hadoop.conf.Configuration
|
||||||
import org.apache.hadoop.fs.Path
|
import org.apache.hadoop.fs.Path
|
||||||
import org.apache.hudi.HoodieBaseRelation.createBaseFileReader
|
import org.apache.hudi.HoodieBaseRelation.createBaseFileReader
|
||||||
import org.apache.hudi.common.model.{HoodieLogFile, HoodieRecord}
|
import org.apache.hudi.HoodieConversionUtils.toScalaOption
|
||||||
|
import org.apache.hudi.MergeOnReadSnapshotRelation.getFilePath
|
||||||
|
import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath
|
||||||
|
import org.apache.hudi.common.model.{FileSlice, HoodieLogFile}
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient
|
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
import org.apache.hudi.common.table.view.HoodieTableFileSystemView
|
||||||
import org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils
|
|
||||||
import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes
|
import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes
|
||||||
import org.apache.spark.rdd.RDD
|
import org.apache.spark.execution.datasources.HoodieInMemoryFileIndex
|
||||||
import org.apache.spark.sql.SQLContext
|
import org.apache.spark.sql.SQLContext
|
||||||
import org.apache.spark.sql.catalyst.InternalRow
|
import org.apache.spark.sql.catalyst.InternalRow
|
||||||
import org.apache.spark.sql.catalyst.expressions.Expression
|
import org.apache.spark.sql.catalyst.expressions.Expression
|
||||||
import org.apache.spark.sql.execution.datasources.{FileStatusCache, PartitionedFile}
|
import org.apache.spark.sql.execution.datasources.PartitionedFile
|
||||||
import org.apache.spark.sql.sources.Filter
|
import org.apache.spark.sql.sources.Filter
|
||||||
import org.apache.spark.sql.types.StructType
|
import org.apache.spark.sql.types.StructType
|
||||||
|
|
||||||
@@ -41,43 +43,28 @@ case class HoodieMergeOnReadFileSplit(dataFile: Option[PartitionedFile],
|
|||||||
latestCommit: String,
|
latestCommit: String,
|
||||||
tablePath: String,
|
tablePath: String,
|
||||||
maxCompactionMemoryInBytes: Long,
|
maxCompactionMemoryInBytes: Long,
|
||||||
mergeType: String)
|
mergeType: String) extends HoodieFileSplit
|
||||||
|
|
||||||
class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
||||||
optParams: Map[String, String],
|
optParams: Map[String, String],
|
||||||
val userSchema: Option[StructType],
|
userSchema: Option[StructType],
|
||||||
val globPaths: Seq[Path],
|
globPaths: Seq[Path],
|
||||||
val metaClient: HoodieTableMetaClient)
|
metaClient: HoodieTableMetaClient)
|
||||||
extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) {
|
extends HoodieBaseRelation(sqlContext, metaClient, optParams, userSchema) {
|
||||||
|
|
||||||
|
override type FileSplit = HoodieMergeOnReadFileSplit
|
||||||
|
|
||||||
private val mergeType = optParams.getOrElse(
|
private val mergeType = optParams.getOrElse(
|
||||||
DataSourceReadOptions.REALTIME_MERGE.key,
|
DataSourceReadOptions.REALTIME_MERGE.key,
|
||||||
DataSourceReadOptions.REALTIME_MERGE.defaultValue)
|
DataSourceReadOptions.REALTIME_MERGE.defaultValue)
|
||||||
|
|
||||||
private val maxCompactionMemoryInBytes = getMaxCompactionMemoryInBytes(jobConf)
|
private val maxCompactionMemoryInBytes = getMaxCompactionMemoryInBytes(jobConf)
|
||||||
|
|
||||||
override def doBuildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[InternalRow] = {
|
protected override def composeRDD(fileIndex: Seq[HoodieMergeOnReadFileSplit],
|
||||||
log.debug(s" buildScan requiredColumns = ${requiredColumns.mkString(",")}")
|
partitionSchema: StructType,
|
||||||
log.debug(s" buildScan filters = ${filters.mkString(",")}")
|
tableSchema: HoodieTableSchema,
|
||||||
|
requiredSchema: HoodieTableSchema,
|
||||||
// NOTE: In case list of requested columns doesn't contain the Primary Key one, we
|
filters: Array[Filter]): HoodieMergeOnReadRDD = {
|
||||||
// have to add it explicitly so that
|
|
||||||
// - Merging could be performed correctly
|
|
||||||
// - In case 0 columns are to be fetched (for ex, when doing {@code count()} on Spark's [[Dataset]],
|
|
||||||
// Spark still fetches all the rows to execute the query correctly
|
|
||||||
//
|
|
||||||
// It's okay to return columns that have not been requested by the caller, as those nevertheless will be
|
|
||||||
// filtered out upstream
|
|
||||||
val fetchedColumns: Array[String] = appendMandatoryColumns(requiredColumns)
|
|
||||||
|
|
||||||
val (requiredAvroSchema, requiredStructSchema) =
|
|
||||||
HoodieSparkUtils.getRequiredSchema(tableAvroSchema, fetchedColumns)
|
|
||||||
val fileIndex = buildFileIndex(filters)
|
|
||||||
|
|
||||||
val partitionSchema = StructType(Nil)
|
|
||||||
val tableSchema = HoodieTableSchema(tableStructSchema, tableAvroSchema.toString)
|
|
||||||
val requiredSchema = HoodieTableSchema(requiredStructSchema, requiredAvroSchema.toString)
|
|
||||||
|
|
||||||
val fullSchemaParquetReader = createBaseFileReader(
|
val fullSchemaParquetReader = createBaseFileReader(
|
||||||
spark = sqlContext.sparkSession,
|
spark = sqlContext.sparkSession,
|
||||||
partitionSchema = partitionSchema,
|
partitionSchema = partitionSchema,
|
||||||
@@ -93,6 +80,7 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
|||||||
// to configure Parquet reader appropriately
|
// to configure Parquet reader appropriately
|
||||||
hadoopConf = new Configuration(conf)
|
hadoopConf = new Configuration(conf)
|
||||||
)
|
)
|
||||||
|
|
||||||
val requiredSchemaParquetReader = createBaseFileReader(
|
val requiredSchemaParquetReader = createBaseFileReader(
|
||||||
spark = sqlContext.sparkSession,
|
spark = sqlContext.sparkSession,
|
||||||
partitionSchema = partitionSchema,
|
partitionSchema = partitionSchema,
|
||||||
@@ -111,90 +99,53 @@ class MergeOnReadSnapshotRelation(sqlContext: SQLContext,
|
|||||||
requiredSchemaParquetReader, tableState, tableSchema, requiredSchema, fileIndex)
|
requiredSchemaParquetReader, tableState, tableSchema, requiredSchema, fileIndex)
|
||||||
}
|
}
|
||||||
|
|
||||||
def buildFileIndex(filters: Array[Filter]): List[HoodieMergeOnReadFileSplit] = {
|
protected override def collectFileSplits(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): List[HoodieMergeOnReadFileSplit] = {
|
||||||
if (globPaths.nonEmpty) {
|
val convertedPartitionFilters =
|
||||||
// Load files from the global paths if it has defined to be compatible with the original mode
|
HoodieFileIndex.convertFilterForTimestampKeyGenerator(metaClient, partitionFilters)
|
||||||
val inMemoryFileIndex = HoodieSparkUtils.createInMemoryFileIndex(sqlContext.sparkSession, globPaths)
|
|
||||||
val fsView = new HoodieTableFileSystemView(metaClient,
|
|
||||||
// file-slice after pending compaction-requested instant-time is also considered valid
|
|
||||||
metaClient.getCommitsAndCompactionTimeline.filterCompletedAndCompactionInstants,
|
|
||||||
inMemoryFileIndex.allFiles().toArray)
|
|
||||||
val partitionPaths = fsView.getLatestBaseFiles.iterator().asScala.toList.map(_.getFileStatus.getPath.getParent)
|
|
||||||
|
|
||||||
|
if (globPaths.isEmpty) {
|
||||||
if (partitionPaths.isEmpty) { // If this an empty table, return an empty split list.
|
val fileSlices = fileIndex.listFileSlices(convertedPartitionFilters)
|
||||||
|
buildSplits(fileSlices.values.flatten.toSeq)
|
||||||
|
} else {
|
||||||
|
// TODO refactor to avoid iterating over listed files multiple times
|
||||||
|
val partitions = listLatestBaseFiles(globPaths, convertedPartitionFilters, dataFilters)
|
||||||
|
val partitionPaths = partitions.keys.toSeq
|
||||||
|
if (partitionPaths.isEmpty || latestInstant.isEmpty) {
|
||||||
|
// If this an empty table OR it has no completed commits yet, return
|
||||||
List.empty[HoodieMergeOnReadFileSplit]
|
List.empty[HoodieMergeOnReadFileSplit]
|
||||||
} else {
|
} else {
|
||||||
val lastInstant = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants.lastInstant()
|
val fileSlices = listFileSlices(partitionPaths)
|
||||||
if (!lastInstant.isPresent) { // Return empty list if the table has no commit
|
buildSplits(fileSlices)
|
||||||
List.empty
|
|
||||||
} else {
|
|
||||||
val queryInstant = specifiedQueryInstant.getOrElse(lastInstant.get().getTimestamp)
|
|
||||||
val baseAndLogsList = HoodieRealtimeInputFormatUtils.groupLogsByBaseFile(conf, partitionPaths.asJava).asScala
|
|
||||||
val fileSplits = baseAndLogsList.map(kv => {
|
|
||||||
val baseFile = kv.getLeft
|
|
||||||
val logPaths = if (kv.getRight.isEmpty) Option.empty else Option(kv.getRight.asScala.toList)
|
|
||||||
|
|
||||||
val baseDataPath = if (baseFile.isPresent) {
|
|
||||||
Some(PartitionedFile(
|
|
||||||
InternalRow.empty,
|
|
||||||
MergeOnReadSnapshotRelation.getFilePath(baseFile.get.getFileStatus.getPath),
|
|
||||||
0, baseFile.get.getFileLen)
|
|
||||||
)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
}
|
||||||
HoodieMergeOnReadFileSplit(baseDataPath, logPaths, queryInstant,
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected def buildSplits(fileSlices: Seq[FileSlice]): List[HoodieMergeOnReadFileSplit] = {
|
||||||
|
fileSlices.map { fileSlice =>
|
||||||
|
val baseFile = toScalaOption(fileSlice.getBaseFile)
|
||||||
|
val logFiles = Option(fileSlice.getLogFiles.sorted(HoodieLogFile.getLogFileComparator).iterator().asScala.toList)
|
||||||
|
|
||||||
|
val partitionedBaseFile = baseFile.map { file =>
|
||||||
|
val filePath = getFilePath(file.getFileStatus.getPath)
|
||||||
|
PartitionedFile(InternalRow.empty, filePath, 0, file.getFileLen)
|
||||||
|
}
|
||||||
|
|
||||||
|
HoodieMergeOnReadFileSplit(partitionedBaseFile, logFiles, queryTimestamp.get,
|
||||||
metaClient.getBasePath, maxCompactionMemoryInBytes, mergeType)
|
metaClient.getBasePath, maxCompactionMemoryInBytes, mergeType)
|
||||||
}).toList
|
}.toList
|
||||||
fileSplits
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Load files by the HoodieFileIndex.
|
|
||||||
val hoodieFileIndex = HoodieFileIndex(sqlContext.sparkSession, metaClient,
|
|
||||||
Some(tableStructSchema), optParams, FileStatusCache.getOrCreate(sqlContext.sparkSession))
|
|
||||||
|
|
||||||
// Get partition filter and convert to catalyst expression
|
|
||||||
val partitionColumns = hoodieFileIndex.partitionSchema.fieldNames.toSet
|
|
||||||
val partitionFilters = filters.filter(f => f.references.forall(p => partitionColumns.contains(p)))
|
|
||||||
val partitionFilterExpression =
|
|
||||||
HoodieSparkUtils.convertToCatalystExpression(partitionFilters, tableStructSchema)
|
|
||||||
val convertedPartitionFilterExpression =
|
|
||||||
HoodieFileIndex.convertFilterForTimestampKeyGenerator(metaClient, partitionFilterExpression.toSeq)
|
|
||||||
|
|
||||||
// If convert success to catalyst expression, use the partition prune
|
|
||||||
val fileSlices = if (convertedPartitionFilterExpression.nonEmpty) {
|
|
||||||
hoodieFileIndex.listFileSlices(convertedPartitionFilterExpression)
|
|
||||||
} else {
|
|
||||||
hoodieFileIndex.listFileSlices(Seq.empty[Expression])
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fileSlices.isEmpty) {
|
private def listFileSlices(partitionPaths: Seq[Path]): Seq[FileSlice] = {
|
||||||
// If this an empty table, return an empty split list.
|
// NOTE: It's critical for us to re-use [[InMemoryFileIndex]] to make sure we're leveraging
|
||||||
List.empty[HoodieMergeOnReadFileSplit]
|
// [[FileStatusCache]] and avoid listing the whole table again
|
||||||
} else {
|
val inMemoryFileIndex = HoodieInMemoryFileIndex.create(sparkSession, partitionPaths)
|
||||||
val fileSplits = fileSlices.values.flatten.map(fileSlice => {
|
val fsView = new HoodieTableFileSystemView(metaClient, timeline, inMemoryFileIndex.allFiles.toArray)
|
||||||
val latestInstant = metaClient.getActiveTimeline.getCommitsTimeline
|
|
||||||
.filterCompletedInstants.lastInstant().get().getTimestamp
|
|
||||||
val queryInstant = specifiedQueryInstant.getOrElse(latestInstant)
|
|
||||||
|
|
||||||
val partitionedFile = if (fileSlice.getBaseFile.isPresent) {
|
val queryTimestamp = this.queryTimestamp.get
|
||||||
val baseFile = fileSlice.getBaseFile.get()
|
|
||||||
val baseFilePath = MergeOnReadSnapshotRelation.getFilePath(baseFile.getFileStatus.getPath)
|
|
||||||
Option(PartitionedFile(InternalRow.empty, baseFilePath, 0, baseFile.getFileLen))
|
|
||||||
} else {
|
|
||||||
Option.empty
|
|
||||||
}
|
|
||||||
|
|
||||||
val logPaths = fileSlice.getLogFiles.sorted(HoodieLogFile.getLogFileComparator).iterator().asScala.toList
|
partitionPaths.flatMap { partitionPath =>
|
||||||
val logPathsOptional = if (logPaths.isEmpty) Option.empty else Option(logPaths)
|
val relativePath = getRelativePartitionPath(new Path(basePath), partitionPath)
|
||||||
|
fsView.getLatestMergedFileSlicesBeforeOrOn(relativePath, queryTimestamp).iterator().asScala.toSeq
|
||||||
HoodieMergeOnReadFileSplit(partitionedFile, logPathsOptional, queryInstant, metaClient.getBasePath,
|
|
||||||
maxCompactionMemoryInBytes, mergeType)
|
|
||||||
}).toList
|
|
||||||
fileSplits
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -38,7 +38,6 @@ import org.apache.spark.sql.types.{StringType, StructField, StructType}
|
|||||||
import org.apache.spark.unsafe.types.UTF8String
|
import org.apache.spark.unsafe.types.UTF8String
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.language.implicitConversions
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Implementation of the [[BaseHoodieTableFileIndex]] for Spark
|
* Implementation of the [[BaseHoodieTableFileIndex]] for Spark
|
||||||
@@ -135,12 +134,14 @@ class SparkHoodieTableFileIndex(spark: SparkSession,
|
|||||||
* Fetch list of latest base files w/ corresponding log files, after performing
|
* Fetch list of latest base files w/ corresponding log files, after performing
|
||||||
* partition pruning
|
* partition pruning
|
||||||
*
|
*
|
||||||
|
* TODO unify w/ HoodieFileIndex#listFiles
|
||||||
|
*
|
||||||
* @param partitionFilters partition column filters
|
* @param partitionFilters partition column filters
|
||||||
* @return mapping from string partition paths to its base/log files
|
* @return mapping from string partition paths to its base/log files
|
||||||
*/
|
*/
|
||||||
def listFileSlices(partitionFilters: Seq[Expression]): Map[String, Seq[FileSlice]] = {
|
def listFileSlices(partitionFilters: Seq[Expression]): Map[String, Seq[FileSlice]] = {
|
||||||
// Prune the partition path by the partition filters
|
// Prune the partition path by the partition filters
|
||||||
val prunedPartitions = prunePartition(cachedAllInputFileSlices.asScala.keys.toSeq, partitionFilters)
|
val prunedPartitions = prunePartition(cachedAllInputFileSlices.keySet().asScala.toSeq, partitionFilters)
|
||||||
prunedPartitions.map(partition => {
|
prunedPartitions.map(partition => {
|
||||||
(partition.path, cachedAllInputFileSlices.get(partition).asScala)
|
(partition.path, cachedAllInputFileSlices.get(partition).asScala)
|
||||||
}).toMap
|
}).toMap
|
||||||
|
|||||||
@@ -0,0 +1,370 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration
|
||||||
|
import org.apache.hadoop.fs.viewfs.ViewFileSystem
|
||||||
|
import org.apache.hadoop.fs._
|
||||||
|
import org.apache.hadoop.hdfs.DistributedFileSystem
|
||||||
|
import org.apache.spark.internal.Logging
|
||||||
|
import org.apache.spark.metrics.source.HiveCatalogMetrics
|
||||||
|
import org.apache.spark.util.SerializableConfiguration
|
||||||
|
|
||||||
|
import java.io.FileNotFoundException
|
||||||
|
import scala.collection.mutable
|
||||||
|
|
||||||
|
/**
|
||||||
|
* NOTE: This method class is replica of HadoopFSUtils from Spark 3.2.1, with the following adjustments
|
||||||
|
*
|
||||||
|
* - Filtering out of the listed files is adjusted to include files starting w/ "." (to include Hoodie Delta Log
|
||||||
|
* files)
|
||||||
|
*/
|
||||||
|
object HoodieHadoopFSUtils extends Logging {
|
||||||
|
/**
|
||||||
|
* Lists a collection of paths recursively. Picks the listing strategy adaptively depending
|
||||||
|
* on the number of paths to list.
|
||||||
|
*
|
||||||
|
* This may only be called on the driver.
|
||||||
|
*
|
||||||
|
* @param sc Spark context used to run parallel listing.
|
||||||
|
* @param paths Input paths to list
|
||||||
|
* @param hadoopConf Hadoop configuration
|
||||||
|
* @param filter Path filter used to exclude leaf files from result
|
||||||
|
* @param ignoreMissingFiles Ignore missing files that happen during recursive listing
|
||||||
|
* (e.g., due to race conditions)
|
||||||
|
* @param ignoreLocality Whether to fetch data locality info when listing leaf files. If false,
|
||||||
|
* this will return `FileStatus` without `BlockLocation` info.
|
||||||
|
* @param parallelismThreshold The threshold to enable parallelism. If the number of input paths
|
||||||
|
* is smaller than this value, this will fallback to use
|
||||||
|
* sequential listing.
|
||||||
|
* @param parallelismMax The maximum parallelism for listing. If the number of input paths is
|
||||||
|
* larger than this value, parallelism will be throttled to this value
|
||||||
|
* to avoid generating too many tasks.
|
||||||
|
* @return for each input path, the set of discovered files for the path
|
||||||
|
*/
|
||||||
|
def parallelListLeafFiles(sc: SparkContext,
|
||||||
|
paths: Seq[Path],
|
||||||
|
hadoopConf: Configuration,
|
||||||
|
filter: PathFilter,
|
||||||
|
ignoreMissingFiles: Boolean,
|
||||||
|
ignoreLocality: Boolean,
|
||||||
|
parallelismThreshold: Int,
|
||||||
|
parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = {
|
||||||
|
parallelListLeafFilesInternal(sc, paths, hadoopConf, filter, isRootLevel = true,
|
||||||
|
ignoreMissingFiles, ignoreLocality, parallelismThreshold, parallelismMax)
|
||||||
|
}
|
||||||
|
|
||||||
|
// scalastyle:off parameter.number
|
||||||
|
private def parallelListLeafFilesInternal(sc: SparkContext,
|
||||||
|
paths: Seq[Path],
|
||||||
|
hadoopConf: Configuration,
|
||||||
|
filter: PathFilter,
|
||||||
|
isRootLevel: Boolean,
|
||||||
|
ignoreMissingFiles: Boolean,
|
||||||
|
ignoreLocality: Boolean,
|
||||||
|
parallelismThreshold: Int,
|
||||||
|
parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = {
|
||||||
|
|
||||||
|
// Short-circuits parallel listing when serial listing is likely to be faster.
|
||||||
|
if (paths.size <= parallelismThreshold) {
|
||||||
|
// scalastyle:off return
|
||||||
|
return paths.map { path =>
|
||||||
|
val leafFiles = listLeafFiles(
|
||||||
|
path,
|
||||||
|
hadoopConf,
|
||||||
|
filter,
|
||||||
|
Some(sc),
|
||||||
|
ignoreMissingFiles = ignoreMissingFiles,
|
||||||
|
ignoreLocality = ignoreLocality,
|
||||||
|
isRootPath = isRootLevel,
|
||||||
|
parallelismThreshold = parallelismThreshold,
|
||||||
|
parallelismMax = parallelismMax)
|
||||||
|
(path, leafFiles)
|
||||||
|
}
|
||||||
|
// scalastyle:on return
|
||||||
|
}
|
||||||
|
|
||||||
|
logInfo(s"Listing leaf files and directories in parallel under ${paths.length} paths." +
|
||||||
|
s" The first several paths are: ${paths.take(10).mkString(", ")}.")
|
||||||
|
HiveCatalogMetrics.incrementParallelListingJobCount(1)
|
||||||
|
|
||||||
|
val serializableConfiguration = new SerializableConfiguration(hadoopConf)
|
||||||
|
val serializedPaths = paths.map(_.toString)
|
||||||
|
|
||||||
|
// Set the number of parallelism to prevent following file listing from generating many tasks
|
||||||
|
// in case of large #defaultParallelism.
|
||||||
|
val numParallelism = Math.min(paths.size, parallelismMax)
|
||||||
|
|
||||||
|
val previousJobDescription = sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION)
|
||||||
|
val statusMap = try {
|
||||||
|
val description = paths.size match {
|
||||||
|
case 0 =>
|
||||||
|
"Listing leaf files and directories 0 paths"
|
||||||
|
case 1 =>
|
||||||
|
s"Listing leaf files and directories for 1 path:<br/>${paths(0)}"
|
||||||
|
case s =>
|
||||||
|
s"Listing leaf files and directories for $s paths:<br/>${paths(0)}, ..."
|
||||||
|
}
|
||||||
|
sc.setJobDescription(description)
|
||||||
|
sc
|
||||||
|
.parallelize(serializedPaths, numParallelism)
|
||||||
|
.mapPartitions { pathStrings =>
|
||||||
|
val hadoopConf = serializableConfiguration.value
|
||||||
|
pathStrings.map(new Path(_)).toSeq.map { path =>
|
||||||
|
val leafFiles = listLeafFiles(
|
||||||
|
path = path,
|
||||||
|
hadoopConf = hadoopConf,
|
||||||
|
filter = filter,
|
||||||
|
contextOpt = None, // Can't execute parallel scans on workers
|
||||||
|
ignoreMissingFiles = ignoreMissingFiles,
|
||||||
|
ignoreLocality = ignoreLocality,
|
||||||
|
isRootPath = isRootLevel,
|
||||||
|
parallelismThreshold = Int.MaxValue,
|
||||||
|
parallelismMax = 0)
|
||||||
|
(path, leafFiles)
|
||||||
|
}.iterator
|
||||||
|
}.map { case (path, statuses) =>
|
||||||
|
val serializableStatuses = statuses.map { status =>
|
||||||
|
// Turn FileStatus into SerializableFileStatus so we can send it back to the driver
|
||||||
|
val blockLocations = status match {
|
||||||
|
case f: LocatedFileStatus =>
|
||||||
|
f.getBlockLocations.map { loc =>
|
||||||
|
SerializableBlockLocation(
|
||||||
|
loc.getNames,
|
||||||
|
loc.getHosts,
|
||||||
|
loc.getOffset,
|
||||||
|
loc.getLength)
|
||||||
|
}
|
||||||
|
|
||||||
|
case _ =>
|
||||||
|
Array.empty[SerializableBlockLocation]
|
||||||
|
}
|
||||||
|
|
||||||
|
SerializableFileStatus(
|
||||||
|
status.getPath.toString,
|
||||||
|
status.getLen,
|
||||||
|
status.isDirectory,
|
||||||
|
status.getReplication,
|
||||||
|
status.getBlockSize,
|
||||||
|
status.getModificationTime,
|
||||||
|
status.getAccessTime,
|
||||||
|
blockLocations)
|
||||||
|
}
|
||||||
|
(path.toString, serializableStatuses)
|
||||||
|
}.collect()
|
||||||
|
} finally {
|
||||||
|
sc.setJobDescription(previousJobDescription)
|
||||||
|
}
|
||||||
|
|
||||||
|
// turn SerializableFileStatus back to Status
|
||||||
|
statusMap.map { case (path, serializableStatuses) =>
|
||||||
|
val statuses = serializableStatuses.map { f =>
|
||||||
|
val blockLocations = f.blockLocations.map { loc =>
|
||||||
|
new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
|
||||||
|
}
|
||||||
|
new LocatedFileStatus(
|
||||||
|
new FileStatus(
|
||||||
|
f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime,
|
||||||
|
new Path(f.path)),
|
||||||
|
blockLocations)
|
||||||
|
}
|
||||||
|
(new Path(path), statuses)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// scalastyle:on parameter.number
|
||||||
|
|
||||||
|
// scalastyle:off parameter.number
|
||||||
|
/**
|
||||||
|
* Lists a single filesystem path recursively. If a `SparkContext` object is specified, this
|
||||||
|
* function may launch Spark jobs to parallelize listing based on `parallelismThreshold`.
|
||||||
|
*
|
||||||
|
* If sessionOpt is None, this may be called on executors.
|
||||||
|
*
|
||||||
|
* @return all children of path that match the specified filter.
|
||||||
|
*/
|
||||||
|
private def listLeafFiles(path: Path,
|
||||||
|
hadoopConf: Configuration,
|
||||||
|
filter: PathFilter,
|
||||||
|
contextOpt: Option[SparkContext],
|
||||||
|
ignoreMissingFiles: Boolean,
|
||||||
|
ignoreLocality: Boolean,
|
||||||
|
isRootPath: Boolean,
|
||||||
|
parallelismThreshold: Int,
|
||||||
|
parallelismMax: Int): Seq[FileStatus] = {
|
||||||
|
|
||||||
|
logTrace(s"Listing $path")
|
||||||
|
val fs = path.getFileSystem(hadoopConf)
|
||||||
|
|
||||||
|
// Note that statuses only include FileStatus for the files and dirs directly under path,
|
||||||
|
// and does not include anything else recursively.
|
||||||
|
val statuses: Array[FileStatus] = try {
|
||||||
|
fs match {
|
||||||
|
// DistributedFileSystem overrides listLocatedStatus to make 1 single call to namenode
|
||||||
|
// to retrieve the file status with the file block location. The reason to still fallback
|
||||||
|
// to listStatus is because the default implementation would potentially throw a
|
||||||
|
// FileNotFoundException which is better handled by doing the lookups manually below.
|
||||||
|
case (_: DistributedFileSystem | _: ViewFileSystem) if !ignoreLocality =>
|
||||||
|
val remoteIter = fs.listLocatedStatus(path)
|
||||||
|
new Iterator[LocatedFileStatus]() {
|
||||||
|
def next(): LocatedFileStatus = remoteIter.next
|
||||||
|
|
||||||
|
def hasNext(): Boolean = remoteIter.hasNext
|
||||||
|
}.toArray
|
||||||
|
case _ => fs.listStatus(path)
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// If we are listing a root path for SQL (e.g. a top level directory of a table), we need to
|
||||||
|
// ignore FileNotFoundExceptions during this root level of the listing because
|
||||||
|
//
|
||||||
|
// (a) certain code paths might construct an InMemoryFileIndex with root paths that
|
||||||
|
// might not exist (i.e. not all callers are guaranteed to have checked
|
||||||
|
// path existence prior to constructing InMemoryFileIndex) and,
|
||||||
|
// (b) we need to ignore deleted root paths during REFRESH TABLE, otherwise we break
|
||||||
|
// existing behavior and break the ability drop SessionCatalog tables when tables'
|
||||||
|
// root directories have been deleted (which breaks a number of Spark's own tests).
|
||||||
|
//
|
||||||
|
// If we are NOT listing a root path then a FileNotFoundException here means that the
|
||||||
|
// directory was present in a previous level of file listing but is absent in this
|
||||||
|
// listing, likely indicating a race condition (e.g. concurrent table overwrite or S3
|
||||||
|
// list inconsistency).
|
||||||
|
//
|
||||||
|
// The trade-off in supporting existing behaviors / use-cases is that we won't be
|
||||||
|
// able to detect race conditions involving root paths being deleted during
|
||||||
|
// InMemoryFileIndex construction. However, it's still a net improvement to detect and
|
||||||
|
// fail-fast on the non-root cases. For more info see the SPARK-27676 review discussion.
|
||||||
|
case _: FileNotFoundException if isRootPath || ignoreMissingFiles =>
|
||||||
|
logWarning(s"The directory $path was not found. Was it deleted very recently?")
|
||||||
|
Array.empty[FileStatus]
|
||||||
|
}
|
||||||
|
|
||||||
|
val filteredStatuses =
|
||||||
|
statuses.filterNot(status => shouldFilterOutPathName(status.getPath.getName))
|
||||||
|
|
||||||
|
val allLeafStatuses = {
|
||||||
|
val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory)
|
||||||
|
val nestedFiles: Seq[FileStatus] = contextOpt match {
|
||||||
|
case Some(context) if dirs.size > parallelismThreshold =>
|
||||||
|
parallelListLeafFilesInternal(
|
||||||
|
context,
|
||||||
|
dirs.map(_.getPath),
|
||||||
|
hadoopConf = hadoopConf,
|
||||||
|
filter = filter,
|
||||||
|
isRootLevel = false,
|
||||||
|
ignoreMissingFiles = ignoreMissingFiles,
|
||||||
|
ignoreLocality = ignoreLocality,
|
||||||
|
parallelismThreshold = parallelismThreshold,
|
||||||
|
parallelismMax = parallelismMax
|
||||||
|
).flatMap(_._2)
|
||||||
|
case _ =>
|
||||||
|
dirs.flatMap { dir =>
|
||||||
|
listLeafFiles(
|
||||||
|
path = dir.getPath,
|
||||||
|
hadoopConf = hadoopConf,
|
||||||
|
filter = filter,
|
||||||
|
contextOpt = contextOpt,
|
||||||
|
ignoreMissingFiles = ignoreMissingFiles,
|
||||||
|
ignoreLocality = ignoreLocality,
|
||||||
|
isRootPath = false,
|
||||||
|
parallelismThreshold = parallelismThreshold,
|
||||||
|
parallelismMax = parallelismMax)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
val allFiles = topLevelFiles ++ nestedFiles
|
||||||
|
if (filter != null) allFiles.filter(f => filter.accept(f.getPath)) else allFiles
|
||||||
|
}
|
||||||
|
|
||||||
|
val missingFiles = mutable.ArrayBuffer.empty[String]
|
||||||
|
val resolvedLeafStatuses = allLeafStatuses.flatMap {
|
||||||
|
case f: LocatedFileStatus =>
|
||||||
|
Some(f)
|
||||||
|
|
||||||
|
// NOTE:
|
||||||
|
//
|
||||||
|
// - Although S3/S3A/S3N file system can be quite slow for remote file metadata
|
||||||
|
// operations, calling `getFileBlockLocations` does no harm here since these file system
|
||||||
|
// implementations don't actually issue RPC for this method.
|
||||||
|
//
|
||||||
|
// - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
|
||||||
|
// be a big deal since we always use to `parallelListLeafFiles` when the number of
|
||||||
|
// paths exceeds threshold.
|
||||||
|
case f if !ignoreLocality =>
|
||||||
|
// The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
|
||||||
|
// which is very slow on some file system (RawLocalFileSystem, which is launch a
|
||||||
|
// subprocess and parse the stdout).
|
||||||
|
try {
|
||||||
|
val locations = fs.getFileBlockLocations(f, 0, f.getLen).map { loc =>
|
||||||
|
// Store BlockLocation objects to consume less memory
|
||||||
|
if (loc.getClass == classOf[BlockLocation]) {
|
||||||
|
loc
|
||||||
|
} else {
|
||||||
|
new BlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
|
||||||
|
f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
|
||||||
|
if (f.isSymlink) {
|
||||||
|
lfs.setSymlink(f.getSymlink)
|
||||||
|
}
|
||||||
|
Some(lfs)
|
||||||
|
} catch {
|
||||||
|
case _: FileNotFoundException if ignoreMissingFiles =>
|
||||||
|
missingFiles += f.getPath.toString
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
case f => Some(f)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (missingFiles.nonEmpty) {
|
||||||
|
logWarning(
|
||||||
|
s"the following files were missing during file scan:\n ${missingFiles.mkString("\n ")}")
|
||||||
|
}
|
||||||
|
|
||||||
|
resolvedLeafStatuses
|
||||||
|
}
|
||||||
|
// scalastyle:on parameter.number
|
||||||
|
|
||||||
|
/** A serializable variant of HDFS's BlockLocation. This is required by Hadoop 2.7. */
|
||||||
|
private case class SerializableBlockLocation(names: Array[String],
|
||||||
|
hosts: Array[String],
|
||||||
|
offset: Long,
|
||||||
|
length: Long)
|
||||||
|
|
||||||
|
/** A serializable variant of HDFS's FileStatus. This is required by Hadoop 2.7. */
|
||||||
|
private case class SerializableFileStatus(path: String,
|
||||||
|
length: Long,
|
||||||
|
isDir: Boolean,
|
||||||
|
blockReplication: Short,
|
||||||
|
blockSize: Long,
|
||||||
|
modificationTime: Long,
|
||||||
|
accessTime: Long,
|
||||||
|
blockLocations: Array[SerializableBlockLocation])
|
||||||
|
|
||||||
|
/** Checks if we should filter out this path name. */
|
||||||
|
def shouldFilterOutPathName(pathName: String): Boolean = {
|
||||||
|
// We filter follow paths:
|
||||||
|
// 1. everything that starts with _ and ., except _common_metadata and _metadata
|
||||||
|
// because Parquet needs to find those metadata files from leaf files returned by this method.
|
||||||
|
// We should refactor this logic to not mix metadata files with data files.
|
||||||
|
// 2. everything that ends with `._COPYING_`, because this is a intermediate state of file. we
|
||||||
|
// should skip this file in case of double reading.
|
||||||
|
val exclude = (pathName.startsWith("_") && !pathName.contains("=")) || pathName.endsWith("._COPYING_")
|
||||||
|
val include = pathName.startsWith("_common_metadata") || pathName.startsWith("_metadata")
|
||||||
|
exclude && !include
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,102 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.execution.datasources
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration
|
||||||
|
import org.apache.hadoop.fs.{FileStatus, Path, PathFilter}
|
||||||
|
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
|
||||||
|
import org.apache.spark.HoodieHadoopFSUtils
|
||||||
|
import org.apache.spark.metrics.source.HiveCatalogMetrics
|
||||||
|
import org.apache.spark.sql.SparkSession
|
||||||
|
import org.apache.spark.sql.execution.datasources._
|
||||||
|
import org.apache.spark.sql.types.StructType
|
||||||
|
|
||||||
|
import scala.collection.mutable
|
||||||
|
import scala.collection.mutable.ArrayBuffer
|
||||||
|
|
||||||
|
/**
 * Hudi-specific [[InMemoryFileIndex]] that replicates Spark's leaf-file listing but routes
 * the parallel listing through [[HoodieHadoopFSUtils.parallelListLeafFiles]], whose path
 * filter accepts file names starting with "." (Spark's stock listing would drop them).
 *
 * @param sparkSession        active Spark session
 * @param rootPathsSpecified  root paths (already resolved/globbed) to index
 * @param parameters          datasource options forwarded to the parent index
 * @param userSpecifiedSchema optional user-provided partition schema
 * @param fileStatusCache     cache of previously listed leaf files (defaults to no caching)
 */
class HoodieInMemoryFileIndex(sparkSession: SparkSession,
                              rootPathsSpecified: Seq[Path],
                              parameters: Map[String, String],
                              userSpecifiedSchema: Option[StructType],
                              fileStatusCache: FileStatusCache = NoopCache)
  extends InMemoryFileIndex(sparkSession, rootPathsSpecified, parameters, userSpecifiedSchema, fileStatusCache) {

  /**
   * List leaf files of given paths. This method will submit a Spark job to do parallel
   * listing whenever there is a path having more files than the parallel partition discovery threshold.
   *
   * This is publicly visible for testing.
   *
   * NOTE: This method replicates the one it overrides, however it uses custom method to run parallel
   * listing that accepts files starting with "."
   */
  override def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
    val startTime = System.nanoTime()
    val output = mutable.LinkedHashSet[FileStatus]()
    val pathsToFetch = mutable.ArrayBuffer[Path]()
    // Serve what we can from the cache; everything else is listed below in bulk
    for (path <- paths) {
      fileStatusCache.getLeafFiles(path) match {
        case Some(files) =>
          HiveCatalogMetrics.incrementFileCacheHits(files.length)
          output ++= files
        case None =>
          pathsToFetch += path
      }
      () // for some reasons scalac 2.12 needs this; return type doesn't matter
    }
    // Honor any input-path filter configured via mapred.input.pathFilter.class
    val filter = FileInputFormat.getInputPathFilter(new JobConf(hadoopConf, this.getClass))
    val discovered = bulkListLeafFiles(sparkSession, pathsToFetch, filter, hadoopConf)

    // Record metrics and back-fill the cache with freshly discovered listings
    discovered.foreach { case (path, leafFiles) =>
      HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size)
      fileStatusCache.putLeafFiles(path, leafFiles.toArray)
      output ++= leafFiles
    }

    logInfo(s"It took ${(System.nanoTime() - startTime) / (1000 * 1000)} ms to list leaf files" +
      s" for ${paths.length} paths.")

    output
  }

  /**
   * Lists leaf files under [[paths]] in parallel via [[HoodieHadoopFSUtils]], wrapping
   * [[filter]] so that Hudi's name-based exclusions also apply on the executors.
   */
  protected def bulkListLeafFiles(sparkSession: SparkSession, paths: ArrayBuffer[Path], filter: PathFilter, hadoopConf: Configuration): Seq[(Path, Seq[FileStatus])] = {
    HoodieHadoopFSUtils.parallelListLeafFiles(
      sc = sparkSession.sparkContext,
      paths = paths,
      hadoopConf = hadoopConf,
      filter = new PathFilterWrapper(filter),
      ignoreMissingFiles = sparkSession.sessionState.conf.ignoreMissingFiles,
      // NOTE: We're disabling fetching Block Info to speed up file listing
      ignoreLocality = true,
      parallelismThreshold = sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold,
      parallelismMax = sparkSession.sessionState.conf.parallelPartitionDiscoveryParallelism)
  }
}
|
||||||
|
|
||||||
|
object HoodieInMemoryFileIndex {

  /**
   * Creates a [[HoodieInMemoryFileIndex]] over the given (already globbed) paths,
   * backed by the session-scoped [[FileStatusCache]].
   *
   * @param sparkSession active Spark session
   * @param globbedPaths resolved root paths to index
   * @return a new file index over the provided paths
   */
  def create(sparkSession: SparkSession, globbedPaths: Seq[Path]): HoodieInMemoryFileIndex =
    new HoodieInMemoryFileIndex(
      sparkSession,
      globbedPaths,
      Map.empty,
      Option.empty,
      FileStatusCache.getOrCreate(sparkSession))
}
|
||||||
|
|
||||||
|
/**
 * Serializable [[PathFilter]] decorator combining an optional user-provided filter with
 * Hudi's name-based exclusions ([[HoodieHadoopFSUtils.shouldFilterOutPathName]]).
 * Serializable so it can be shipped to executors during parallel listing.
 */
private class PathFilterWrapper(val filter: PathFilter) extends PathFilter with Serializable {
  override def accept(path: Path): Boolean =
    // A null wrapped filter means "accept everything"; Hudi's exclusions always apply
    Option(filter).forall(_.accept(path)) && !HoodieHadoopFSUtils.shouldFilterOutPathName(path.getName)
}
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.execution.datasources
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.Path
|
||||||
|
import org.apache.spark.sql.SparkSession
|
||||||
|
import org.junit.jupiter.api.Assertions.assertEquals
|
||||||
|
import org.junit.jupiter.api.Test
|
||||||
|
import org.junit.jupiter.api.io.TempDir
|
||||||
|
|
||||||
|
import java.io.File
|
||||||
|
import java.nio.file.Paths
|
||||||
|
|
||||||
|
class TestHoodieInMemoryFileIndex {

  /**
   * Verifies that [[HoodieInMemoryFileIndex]] lists exactly the leaf files located under
   * the root folders it is created over.
   *
   * Fix: `spark.stop()` is now in a `finally` block so the local SparkSession does not
   * leak when the assertion (or any earlier step) fails.
   */
  @Test
  def testCreateInMemoryIndex(@TempDir tempDir: File): Unit = {
    val spark = SparkSession.builder
      .appName("Hoodie Datasource test")
      .master("local[2]")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .getOrCreate
    try {
      // Two folders, each containing two plain files: these four files are the expected leaves
      val folders: Seq[Path] = Seq(
        new Path(Paths.get(tempDir.getAbsolutePath, "folder1").toUri),
        new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri)
      )

      val files: Seq[Path] = Seq(
        new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file1").toUri),
        new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file2").toUri),
        new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file3").toUri),
        new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file4").toUri)
      )

      folders.foreach(folder => new File(folder.toUri).mkdir())
      files.foreach(file => new File(file.toUri).createNewFile())

      val index = HoodieInMemoryFileIndex.create(spark, Seq(folders(0), folders(1)))
      val indexedFilePaths = index.allFiles().map(fs => fs.getPath)
      // Sort both sides so the comparison is order-independent
      assertEquals(files.sortWith(_.toString < _.toString), indexedFilePaths.sortWith(_.toString < _.toString))
    } finally {
      spark.stop()
    }
  }
}
|
||||||
@@ -88,35 +88,6 @@ class TestHoodieSparkUtils {
|
|||||||
.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
|
.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
def testCreateInMemoryIndex(@TempDir tempDir: File): Unit = {
|
|
||||||
val spark = SparkSession.builder
|
|
||||||
.appName("Hoodie Datasource test")
|
|
||||||
.master("local[2]")
|
|
||||||
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
|
||||||
.getOrCreate
|
|
||||||
|
|
||||||
val folders: Seq[Path] = Seq(
|
|
||||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder1").toUri),
|
|
||||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri)
|
|
||||||
)
|
|
||||||
|
|
||||||
val files: Seq[Path] = Seq(
|
|
||||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file1").toUri),
|
|
||||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file2").toUri),
|
|
||||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file3").toUri),
|
|
||||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file4").toUri)
|
|
||||||
)
|
|
||||||
|
|
||||||
folders.foreach(folder => new File(folder.toUri).mkdir())
|
|
||||||
files.foreach(file => new File(file.toUri).createNewFile())
|
|
||||||
|
|
||||||
val index = HoodieSparkUtils.createInMemoryFileIndex(spark, Seq(folders(0), folders(1)))
|
|
||||||
val indexedFilePaths = index.allFiles().map(fs => fs.getPath)
|
|
||||||
assertEquals(files.sortWith(_.toString < _.toString), indexedFilePaths.sortWith(_.toString < _.toString))
|
|
||||||
spark.stop()
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testCreateRddSchemaEvol(): Unit = {
|
def testCreateRddSchemaEvol(): Unit = {
|
||||||
val spark = SparkSession.builder
|
val spark = SparkSession.builder
|
||||||
|
|||||||
@@ -112,9 +112,9 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
|
|||||||
val fullColumnsReadStats: Array[(String, Long)] =
|
val fullColumnsReadStats: Array[(String, Long)] =
|
||||||
if (HoodieSparkUtils.isSpark3)
|
if (HoodieSparkUtils.isSpark3)
|
||||||
Array(
|
Array(
|
||||||
("rider", 14665),
|
("rider", 14166),
|
||||||
("rider,driver", 14665),
|
("rider,driver", 14166),
|
||||||
("rider,driver,tip_history", 14665))
|
("rider,driver,tip_history", 14166))
|
||||||
else if (HoodieSparkUtils.isSpark2)
|
else if (HoodieSparkUtils.isSpark2)
|
||||||
// TODO re-enable tests (these tests are very unstable currently)
|
// TODO re-enable tests (these tests are very unstable currently)
|
||||||
Array(
|
Array(
|
||||||
@@ -163,11 +163,29 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
|
|||||||
else
|
else
|
||||||
fail("Only Spark 3 and Spark 2 are currently supported")
|
fail("Only Spark 3 and Spark 2 are currently supported")
|
||||||
|
|
||||||
|
// Stats for the reads fetching _all_ columns (currently for MOR to be able to merge
|
||||||
|
// records properly full row has to be fetched; note, how amount of bytes read
|
||||||
|
// is invariant of the # of columns)
|
||||||
|
val fullColumnsReadStats: Array[(String, Long)] =
|
||||||
|
if (HoodieSparkUtils.isSpark3)
|
||||||
|
Array(
|
||||||
|
("rider", 14166),
|
||||||
|
("rider,driver", 14166),
|
||||||
|
("rider,driver,tip_history", 14166))
|
||||||
|
else if (HoodieSparkUtils.isSpark2)
|
||||||
|
// TODO re-enable tests (these tests are very unstable currently)
|
||||||
|
Array(
|
||||||
|
("rider", -1),
|
||||||
|
("rider,driver", -1),
|
||||||
|
("rider,driver,tip_history", -1))
|
||||||
|
else
|
||||||
|
fail("Only Spark 3 and Spark 2 are currently supported")
|
||||||
|
|
||||||
// Test MOR / Snapshot / Skip-merge
|
// Test MOR / Snapshot / Skip-merge
|
||||||
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL, projectedColumnsReadStats)
|
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL, projectedColumnsReadStats)
|
||||||
|
|
||||||
// Test MOR / Snapshot / Payload-combine
|
// Test MOR / Snapshot / Payload-combine
|
||||||
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL, projectedColumnsReadStats)
|
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL, fullColumnsReadStats)
|
||||||
|
|
||||||
// Test MOR / Read Optimized
|
// Test MOR / Read Optimized
|
||||||
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, "null", projectedColumnsReadStats)
|
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, "null", projectedColumnsReadStats)
|
||||||
@@ -209,9 +227,9 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
|
|||||||
val fullColumnsReadStats: Array[(String, Long)] =
|
val fullColumnsReadStats: Array[(String, Long)] =
|
||||||
if (HoodieSparkUtils.isSpark3)
|
if (HoodieSparkUtils.isSpark3)
|
||||||
Array(
|
Array(
|
||||||
("rider", 19683),
|
("rider", 19684),
|
||||||
("rider,driver", 19683),
|
("rider,driver", 19684),
|
||||||
("rider,driver,tip_history", 19683))
|
("rider,driver,tip_history", 19684))
|
||||||
else if (HoodieSparkUtils.isSpark2)
|
else if (HoodieSparkUtils.isSpark2)
|
||||||
// TODO re-enable tests (these tests are very unstable currently)
|
// TODO re-enable tests (these tests are very unstable currently)
|
||||||
Array(
|
Array(
|
||||||
|
|||||||
Reference in New Issue
Block a user