1
0

[HUDI-1109] Support Spark Structured Streaming read from Hudi table (#2485)

This commit is contained in:
pengzhiwei
2021-02-17 19:36:29 +08:00
committed by GitHub
parent 5d2491d10c
commit 37972071ff
10 changed files with 517 additions and 15 deletions

View File

@@ -22,12 +22,13 @@ import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType}
import org.apache.hudi.DataSourceWriteOptions.{BOOTSTRAP_OPERATION_OPT_VAL, OPERATION_OPT_KEY}
import org.apache.hudi.common.fs.FSUtils
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
import org.apache.hudi.exception.HoodieException
import org.apache.hudi.hadoop.HoodieROTablePathFilter
import org.apache.log4j.LogManager
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.execution.streaming.{Sink, Source}
import org.apache.spark.sql.hudi.streaming.HoodieStreamSource
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
@@ -44,6 +45,7 @@ class DefaultSource extends RelationProvider
with CreatableRelationProvider
with DataSourceRegister
with StreamSinkProvider
with StreamSourceProvider
with Serializable {
SparkSession.getActiveSession.foreach { spark =>
@@ -191,4 +193,35 @@ class DefaultSource extends RelationProvider
.resolveRelation()
}
}
/**
 * Resolve the schema of the streaming source (StreamSourceProvider contract).
 *
 * The schema is resolved from the Hudi table's Avro schema when possible;
 * if that fails (e.g. an empty table with no commits yet), we fall back to
 * the user-provided schema.
 *
 * @param sqlContext   active SQL context
 * @param schema       optional user-specified schema, used as a fallback
 * @param providerName the data source provider name (unused here)
 * @param parameters   data source options; must contain a non-null 'path'
 * @return the source short name paired with the resolved schema
 */
override def sourceSchema(sqlContext: SQLContext,
                          schema: Option[StructType],
                          providerName: String,
                          parameters: Map[String, String]): (String, StructType) = {
  // 'path' is mandatory: it locates the Hudi table whose schema we resolve.
  val path = parameters.get("path").filter(_ != null)
    .getOrElse(throw new HoodieException("'path' must be specified."))
  val metaClient = new HoodieTableMetaClient(
    sqlContext.sparkSession.sessionState.newHadoopConf(), path)
  val schemaResolver = new TableSchemaResolver(metaClient)
  val sqlSchema =
    try {
      val avroSchema = schemaResolver.getTableAvroSchema
      AvroConversionUtils.convertAvroSchemaToStructType(avroSchema)
    } catch {
      // Keep the underlying cause in the failure message instead of
      // silently discarding the exception as the original did.
      case e: Exception =>
        require(schema.isDefined, s"Fail to resolve source schema: ${e.getMessage}")
        schema.get
    }
  (shortName(), sqlSchema)
}
/**
 * Create the streaming [[Source]] that incrementally consumes this Hudi table
 * (StreamSourceProvider contract).
 *
 * @param sqlContext   active SQL context
 * @param metadataPath location where the streaming engine persists offset metadata
 * @param schema       optional user-specified schema
 * @param providerName the data source provider name (unused here)
 * @param parameters   data source options forwarded to the source
 */
override def createSource(sqlContext: SQLContext,
                          metadataPath: String,
                          schema: Option[StructType],
                          providerName: String,
                          parameters: Map[String, String]): Source =
  new HoodieStreamSource(sqlContext, metadataPath, schema, parameters)
}

View File

@@ -21,7 +21,7 @@ package org.apache.hudi
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hudi.client.utils.SparkRowDeserializer
import org.apache.hudi.client.utils.SparkRowSerDe
import org.apache.hudi.common.model.HoodieRecord
import org.apache.spark.SPARK_VERSION
import org.apache.spark.rdd.RDD
@@ -99,7 +99,7 @@ object HoodieSparkUtils {
// Use the Avro schema to derive the StructType which has the correct nullability information
val dataType = SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType]
val encoder = RowEncoder.apply(dataType).resolveAndBind()
val deserializer = HoodieSparkUtils.createDeserializer(encoder)
val deserializer = HoodieSparkUtils.createRowSerDe(encoder)
df.queryExecution.toRdd.map(row => deserializer.deserializeRow(row))
.mapPartitions { records =>
if (records.isEmpty) Iterator.empty
@@ -110,12 +110,12 @@ object HoodieSparkUtils {
}
}
def createDeserializer(encoder: ExpressionEncoder[Row]): SparkRowDeserializer = {
// TODO remove Spark2RowDeserializer if Spark 2.x support is dropped
def createRowSerDe(encoder: ExpressionEncoder[Row]): SparkRowSerDe = {
// TODO remove Spark2RowSerDe if Spark 2.x support is dropped
if (SPARK_VERSION.startsWith("2.")) {
new Spark2RowDeserializer(encoder)
new Spark2RowSerDe(encoder)
} else {
new Spark3RowDeserializer(encoder)
new Spark3RowSerDe(encoder)
}
}
}

View File

@@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.hudi.streaming
import com.fasterxml.jackson.annotation.JsonInclude.Include
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import org.apache.hudi.common.table.timeline.HoodieTimeline
import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}
/**
 * Streaming offset for the Hudi source: a single completed commit instant time.
 * Two offsets compare equal iff they reference the same commit time.
 */
case class HoodieSourceOffset(commitTime: String) extends Offset {

  /** JSON form persisted by the streaming engine's offset log. */
  override def json(): String = HoodieSourceOffset.toJson(this)

  override def equals(obj: Any): Boolean = obj match {
    case that: HoodieSourceOffset => that.commitTime == commitTime
    case _ => false
  }

  override def hashCode(): Int = commitTime.hashCode
}
object HoodieSourceOffset {

  // Shared Jackson mapper, configured once at object initialization.
  // Unknown JSON fields are tolerated for forward compatibility.
  val mapper = new ObjectMapper with ScalaObjectMapper
  mapper.setSerializationInclusion(Include.NON_ABSENT)
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
  mapper.registerModule(DefaultScalaModule)

  /** The offset used before any commit has been consumed. */
  val INIT_OFFSET = HoodieSourceOffset(HoodieTimeline.INIT_INSTANT_TS)

  /** Serialize an offset to its JSON representation. */
  def toJson(offset: HoodieSourceOffset): String = mapper.writeValueAsString(offset)

  /** Deserialize an offset from its JSON representation. */
  def fromJson(json: String): HoodieSourceOffset = mapper.readValue[HoodieSourceOffset](json)

  /** Normalize an engine-provided [[Offset]] (possibly serialized) into a typed offset. */
  def apply(offset: Offset): HoodieSourceOffset = offset match {
    case SerializedOffset(json) => fromJson(json)
    case o: HoodieSourceOffset => o
  }
}

View File

@@ -0,0 +1,197 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.hudi.streaming
import java.io.{BufferedWriter, InputStream, OutputStream, OutputStreamWriter}
import java.nio.charset.StandardCharsets
import java.util.Date
import org.apache.hadoop.fs.Path
import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils, IncrementalRelation, MergeOnReadIncrementalRelation}
import org.apache.hudi.common.model.HoodieTableType
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
import org.apache.hudi.common.util.{FileIOUtils, TablePathUtils}
import org.apache.spark.sql.hudi.streaming.HoodieStreamSource.VERSION
import org.apache.spark.sql.hudi.streaming.HoodieSourceOffset.INIT_OFFSET
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.avro.SchemaConverters
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.execution.streaming.{HDFSMetadataLog, Offset, Source}
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext}
/**
 * The Structured Streaming source for Hudi, allowing a streaming job to consume
 * data from a Hudi table incrementally.
 * @param sqlContext   the active SQL context
 * @param metadataPath path where the streaming engine persists offset metadata
 * @param schemaOption optional user-specified schema; resolved from the table if absent
 * @param parameters   data source options (must include 'path')
 */
class HoodieStreamSource(
    sqlContext: SQLContext,
    metadataPath: String,
    schemaOption: Option[StructType],
    parameters: Map[String, String])
  extends Source with Logging with Serializable {

  @transient private val hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()

  // Resolve the Hudi table root path from the mandatory 'path' option.
  // BUGFIX: the original passed the error message as the getOrElse *default value*,
  // so a missing 'path' silently became new Path("Missing 'path' option") and failed
  // later with an obscure filesystem error. Fail fast with a clear message instead.
  private lazy val tablePath: Path = {
    val pathStr = parameters.getOrElse("path",
      throw new IllegalArgumentException("Missing 'path' option for the Hudi streaming source."))
    val path = new Path(pathStr)
    val fs = path.getFileSystem(hadoopConf)
    // NOTE(review): getTablePath(...).get() throws if 'path' is not inside a Hudi
    // table — confirm whether a friendlier error is wanted here.
    TablePathUtils.getTablePath(fs, path).get()
  }

  private lazy val metaClient = new HoodieTableMetaClient(hadoopConf, tablePath.toString)
  private lazy val tableType = metaClient.getTableType

  // Last offset handed to the engine by getOffset; in-memory only, rebuilt on restart.
  @transient private var lastOffset: HoodieSourceOffset = _

  // Starting offset for this query, persisted in the streaming metadata log so that
  // a restarted query reproduces batch 0 from the same starting point.
  @transient private lazy val initialOffsets = {
    val metadataLog =
      new HDFSMetadataLog[HoodieSourceOffset](sqlContext.sparkSession, metadataPath) {
        override def serialize(metadata: HoodieSourceOffset, out: OutputStream): Unit = {
          val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))
          writer.write("v" + VERSION + "\n") // version header on the first line
          writer.write(metadata.json)
          writer.flush()
        }

        /**
         * Deserialize the init offset from the metadata file.
         * The format in the metadata file is like this:
         * ----------------------------------------------
         * v1         -- The version info in the first line
         * offsetJson -- The json string of HoodieSourceOffset in the rest of the file
         * -----------------------------------------------
         * @param in the metadata file input stream
         * @return the persisted initial offset
         */
        override def deserialize(in: InputStream): HoodieSourceOffset = {
          val content = FileIOUtils.readAsUTFString(in)
          // Get version from the first line
          val firstLineEnd = content.indexOf("\n")
          if (firstLineEnd > 0) {
            val version = getVersion(content.substring(0, firstLineEnd))
            if (version > VERSION) {
              throw new IllegalStateException(s"UnSupportVersion: max support version is: $VERSION" +
                s" current version is: $version")
            }
            // Get offset from the rest of the file
            HoodieSourceOffset.fromJson(content.substring(firstLineEnd + 1))
          } else {
            throw new IllegalStateException(s"Bad metadata format, failed to find the version line.")
          }
        }
      }
    // Reuse the persisted batch-0 offset, or persist INIT_OFFSET on first run.
    metadataLog.get(0).getOrElse {
      metadataLog.add(0, INIT_OFFSET)
      INIT_OFFSET
    }
  }

  /** Parse a "v&lt;N&gt;" version header line; fail loudly on any other format. */
  private def getVersion(versionLine: String): Int = {
    if (versionLine.startsWith("v")) {
      versionLine.substring(1).toInt
    } else {
      throw new IllegalStateException(s"Illegal version line: $versionLine " +
        s"in the streaming metadata path")
    }
  }

  /** User-provided schema if any, otherwise resolved from the table's Avro schema. */
  override def schema: StructType = {
    schemaOption.getOrElse {
      val schemaUtil = new TableSchemaResolver(metaClient)
      SchemaConverters.toSqlType(schemaUtil.getTableAvroSchema)
        .dataType.asInstanceOf[StructType]
    }
  }

  /**
   * Get the latest offset from the hoodie table.
   * @return the latest completed commit instant, or the initial offset if none exist
   */
  override def getOffset: Option[Offset] = {
    metaClient.reloadActiveTimeline()
    val activeInstants = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants
    if (!activeInstants.empty()) {
      // Commit timestamps are fixed-width strings, so lexicographic string
      // comparison orders them chronologically.
      val currentLatestCommitTime = activeInstants.lastInstant().get().getTimestamp
      if (lastOffset == null || currentLatestCommitTime > lastOffset.commitTime) {
        lastOffset = HoodieSourceOffset(currentLatestCommitTime)
      }
    } else { // if there are no active commits, use the init offset
      lastOffset = initialOffsets
    }
    Some(lastOffset)
  }

  /**
   * Build the DataFrame holding the commits in the half-open range (start, end].
   */
  override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
    // Force initialOffsets so the batch-0 offset is persisted before any read.
    initialOffsets

    val startOffset = start.map(HoodieSourceOffset(_))
      .getOrElse(initialOffsets)
    val endOffset = HoodieSourceOffset(end)

    if (startOffset == endOffset) {
      // Nothing new: return an empty streaming DataFrame with the source schema.
      sqlContext.internalCreateDataFrame(
        sqlContext.sparkContext.emptyRDD[InternalRow].setName("empty"), schema, isStreaming = true)
    } else {
      // Consume the data between (startCommitTime, endCommitTime]
      val incParams = parameters ++ Map(
        DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY -> startCommitTime(startOffset),
        DataSourceReadOptions.END_INSTANTTIME_OPT_KEY -> endOffset.commitTime
      )
      val rdd = tableType match {
        case HoodieTableType.COPY_ON_WRITE =>
          // The COW incremental scan yields external Rows; convert back to InternalRow.
          val serDe = HoodieSparkUtils.createRowSerDe(RowEncoder(schema))
          new IncrementalRelation(sqlContext, incParams, schema, metaClient)
            .buildScan()
            .map(serDe.serializeRow)
        case HoodieTableType.MERGE_ON_READ =>
          val requiredColumns = schema.fields.map(_.name)
          new MergeOnReadIncrementalRelation(sqlContext, incParams, schema, metaClient)
            .buildScan(requiredColumns, Array.empty[Filter])
            .asInstanceOf[RDD[InternalRow]]
        case _ => throw new IllegalArgumentException(s"UnSupport tableType: $tableType")
      }
      sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)
    }
  }

  /**
   * Convert a start offset into the inclusive begin-instant of the incremental read.
   * We consume (start, end], so the start commit itself is excluded: advance the
   * commit time by one second (commit-time granularity).
   */
  private def startCommitTime(startOffset: HoodieSourceOffset): String = {
    startOffset match {
      case INIT_OFFSET => startOffset.commitTime
      case HoodieSourceOffset(commitTime) =>
        // NOTE(review): COMMIT_FORMATTER is a shared SimpleDateFormat and is not
        // thread-safe — confirm this path is single-threaded.
        val time = HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime
        // As we consume the data between (start, end], start is not included,
        // so we +1s to the start commit time here.
        HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date(time + 1000))
      case _ => throw new IllegalStateException("UnKnow offset type.")
    }
  }

  override def stop(): Unit = {
    // Nothing to release: no long-lived connections or threads are held here.
  }
}
object HoodieStreamSource {
// Version of the offset-metadata file format written/read by HoodieStreamSource;
// deserialization rejects files whose header declares a newer version.
val VERSION = 1
}