[HUDI-1109] Support Spark Structured Streaming read from Hudi table (#2485)
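With this change DefaultSource also registers as a Structured Streaming Source, so a Hudi table path can be read with spark.readStream. A minimal usage sketch, modeled on the TestStreamingSource test added in this commit (the table path and checkpoint location below are placeholders):

    val df = spark.readStream
      .format("org.apache.hudi")
      .load("/tmp/hoodie_table")

    df.writeStream
      .format("console")
      .option("checkpointLocation", "/tmp/hoodie_stream_checkpoint")
      .start()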
@@ -23,6 +23,8 @@ import org.apache.spark.sql.catalyst.InternalRow;
 
 import java.io.Serializable;
 
-public interface SparkRowDeserializer extends Serializable {
+public interface SparkRowSerDe extends Serializable {
   Row deserializeRow(InternalRow internalRow);
+
+  InternalRow serializeRow(Row row);
 }
@@ -266,6 +266,25 @@
       <artifactId>spark-sql_${scala.binary.version}</artifactId>
     </dependency>
 
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <classifier>tests</classifier>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <classifier>tests</classifier>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-catalyst_${scala.binary.version}</artifactId>
+      <classifier>tests</classifier>
+      <scope>test</scope>
+    </dependency>
+
     <!-- Spark (Packages) -->
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -22,12 +22,13 @@ import org.apache.hudi.DataSourceReadOptions._
 import org.apache.hudi.common.model.{HoodieRecord, HoodieTableType}
 import org.apache.hudi.DataSourceWriteOptions.{BOOTSTRAP_OPERATION_OPT_VAL, OPERATION_OPT_KEY}
 import org.apache.hudi.common.fs.FSUtils
-import org.apache.hudi.common.table.HoodieTableMetaClient
+import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
 import org.apache.hudi.exception.HoodieException
 import org.apache.hudi.hadoop.HoodieROTablePathFilter
 import org.apache.log4j.LogManager
 import org.apache.spark.sql.execution.datasources.DataSource
-import org.apache.spark.sql.execution.streaming.Sink
+import org.apache.spark.sql.execution.streaming.{Sink, Source}
+import org.apache.spark.sql.hudi.streaming.HoodieStreamSource
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.streaming.OutputMode
 import org.apache.spark.sql.types.StructType
@@ -44,6 +45,7 @@ class DefaultSource extends RelationProvider
   with CreatableRelationProvider
   with DataSourceRegister
   with StreamSinkProvider
+  with StreamSourceProvider
   with Serializable {
 
   SparkSession.getActiveSession.foreach { spark =>
@@ -191,4 +193,35 @@ class DefaultSource extends RelationProvider
       .resolveRelation()
     }
   }
+
+  override def sourceSchema(sqlContext: SQLContext,
+                            schema: Option[StructType],
+                            providerName: String,
+                            parameters: Map[String, String]): (String, StructType) = {
+    val path = parameters.get("path")
+    if (path.isEmpty || path.get == null) {
+      throw new HoodieException(s"'path' must be specified.")
+    }
+    val metaClient = new HoodieTableMetaClient(
+      sqlContext.sparkSession.sessionState.newHadoopConf(), path.get)
+    val schemaResolver = new TableSchemaResolver(metaClient)
+    val sqlSchema =
+      try {
+        val avroSchema = schemaResolver.getTableAvroSchema
+        AvroConversionUtils.convertAvroSchemaToStructType(avroSchema)
+      } catch {
+        case _: Exception =>
+          require(schema.isDefined, "Fail to resolve source schema")
+          schema.get
+      }
+    (shortName(), sqlSchema)
+  }
+
+  override def createSource(sqlContext: SQLContext,
+                            metadataPath: String,
+                            schema: Option[StructType],
+                            providerName: String,
+                            parameters: Map[String, String]): Source = {
+    new HoodieStreamSource(sqlContext, metadataPath, schema, parameters)
+  }
 }
@@ -21,7 +21,7 @@ package org.apache.hudi
 import org.apache.avro.Schema
 import org.apache.avro.generic.GenericRecord
 import org.apache.hadoop.fs.{FileSystem, Path}
-import org.apache.hudi.client.utils.SparkRowDeserializer
+import org.apache.hudi.client.utils.SparkRowSerDe
 import org.apache.hudi.common.model.HoodieRecord
 import org.apache.spark.SPARK_VERSION
 import org.apache.spark.rdd.RDD
@@ -99,7 +99,7 @@ object HoodieSparkUtils {
     // Use the Avro schema to derive the StructType which has the correct nullability information
     val dataType = SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType]
     val encoder = RowEncoder.apply(dataType).resolveAndBind()
-    val deserializer = HoodieSparkUtils.createDeserializer(encoder)
+    val deserializer = HoodieSparkUtils.createRowSerDe(encoder)
     df.queryExecution.toRdd.map(row => deserializer.deserializeRow(row))
       .mapPartitions { records =>
         if (records.isEmpty) Iterator.empty
@@ -110,12 +110,12 @@ object HoodieSparkUtils {
     }
   }
 
-  def createDeserializer(encoder: ExpressionEncoder[Row]): SparkRowDeserializer = {
-    // TODO remove Spark2RowDeserializer if Spark 2.x support is dropped
+  def createRowSerDe(encoder: ExpressionEncoder[Row]): SparkRowSerDe = {
+    // TODO remove Spark2RowSerDe if Spark 2.x support is dropped
     if (SPARK_VERSION.startsWith("2.")) {
-      new Spark2RowDeserializer(encoder)
+      new Spark2RowSerDe(encoder)
     } else {
-      new Spark3RowDeserializer(encoder)
+      new Spark3RowSerDe(encoder)
     }
   }
 }
@@ -0,0 +1,69 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.hudi.streaming

import com.fasterxml.jackson.annotation.JsonInclude.Include
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import org.apache.hudi.common.table.timeline.HoodieTimeline
import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset}

case class HoodieSourceOffset(commitTime: String) extends Offset {

  override def json(): String = {
    HoodieSourceOffset.toJson(this)
  }

  override def equals(obj: Any): Boolean = {
    obj match {
      case HoodieSourceOffset(otherCommitTime) =>
        otherCommitTime == commitTime
      case _ => false
    }
  }

  override def hashCode(): Int = {
    commitTime.hashCode
  }
}


object HoodieSourceOffset {
  val mapper = new ObjectMapper with ScalaObjectMapper
  mapper.setSerializationInclusion(Include.NON_ABSENT)
  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
  mapper.registerModule(DefaultScalaModule)

  def toJson(offset: HoodieSourceOffset): String = {
    mapper.writeValueAsString(offset)
  }

  def fromJson(json: String): HoodieSourceOffset = {
    mapper.readValue[HoodieSourceOffset](json)
  }

  def apply(offset: Offset): HoodieSourceOffset = {
    offset match {
      case SerializedOffset(json) => fromJson(json)
      case o: HoodieSourceOffset => o
    }
  }

  val INIT_OFFSET = HoodieSourceOffset(HoodieTimeline.INIT_INSTANT_TS)
}
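HoodieSourceOffset wraps the latest completed commit time and round-trips it through the Jackson mapper above. A small sketch of that round trip (the commit time value is a placeholder):

    val offset = HoodieSourceOffset("20210101000000")
    offset.json()                              // {"commitTime":"20210101000000"}
    HoodieSourceOffset.fromJson(offset.json()) // == offset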
@@ -0,0 +1,197 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.hudi.streaming

import java.io.{BufferedWriter, InputStream, OutputStream, OutputStreamWriter}
import java.nio.charset.StandardCharsets
import java.util.Date

import org.apache.hadoop.fs.Path
import org.apache.hudi.{DataSourceReadOptions, HoodieSparkUtils, IncrementalRelation, MergeOnReadIncrementalRelation}
import org.apache.hudi.common.model.HoodieTableType
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline
import org.apache.hudi.common.table.{HoodieTableMetaClient, TableSchemaResolver}
import org.apache.hudi.common.util.{FileIOUtils, TablePathUtils}
import org.apache.spark.sql.hudi.streaming.HoodieStreamSource.VERSION
import org.apache.spark.sql.hudi.streaming.HoodieSourceOffset.INIT_OFFSET
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.avro.SchemaConverters
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.execution.streaming.{HDFSMetadataLog, Offset, Source}
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext}

/**
 * The Struct Stream Source for Hudi to consume the data by streaming job.
 * @param sqlContext
 * @param metadataPath
 * @param schemaOption
 * @param parameters
 */
class HoodieStreamSource(
    sqlContext: SQLContext,
    metadataPath: String,
    schemaOption: Option[StructType],
    parameters: Map[String, String])
  extends Source with Logging with Serializable {

  @transient private val hadoopConf = sqlContext.sparkSession.sessionState.newHadoopConf()
  private lazy val tablePath: Path = {
    val path = new Path(parameters.getOrElse("path", "Missing 'path' option"))
    val fs = path.getFileSystem(hadoopConf)
    TablePathUtils.getTablePath(fs, path).get()
  }
  private lazy val metaClient = new HoodieTableMetaClient(hadoopConf, tablePath.toString)
  private lazy val tableType = metaClient.getTableType

  @transient private var lastOffset: HoodieSourceOffset = _
  @transient private lazy val initialOffsets = {
    val metadataLog =
      new HDFSMetadataLog[HoodieSourceOffset](sqlContext.sparkSession, metadataPath) {
        override def serialize(metadata: HoodieSourceOffset, out: OutputStream): Unit = {
          val writer = new BufferedWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))
          writer.write("v" + VERSION + "\n")
          writer.write(metadata.json)
          writer.flush()
        }

        /**
         * Deserialize the init offset from the metadata file.
         * The format in the metadata file is like this:
         * ----------------------------------------------
         * v1         -- The version info in the first line
         * offsetJson -- The json string of HoodieSourceOffset in the rest of the file
         * -----------------------------------------------
         * @param in
         * @return
         */
        override def deserialize(in: InputStream): HoodieSourceOffset = {
          val content = FileIOUtils.readAsUTFString(in)
          // Get version from the first line
          val firstLineEnd = content.indexOf("\n")
          if (firstLineEnd > 0) {
            val version = getVersion(content.substring(0, firstLineEnd))
            if (version > VERSION) {
              throw new IllegalStateException(s"UnSupportVersion: max support version is: $VERSION" +
                s" current version is: $version")
            }
            // Get offset from the rest line in the file
            HoodieSourceOffset.fromJson(content.substring(firstLineEnd + 1))
          } else {
            throw new IllegalStateException(s"Bad metadata format, failed to find the version line.")
          }
        }
      }
    metadataLog.get(0).getOrElse {
      metadataLog.add(0, INIT_OFFSET)
      INIT_OFFSET
    }
  }

  private def getVersion(versionLine: String): Int = {
    if (versionLine.startsWith("v")) {
      versionLine.substring(1).toInt
    } else {
      throw new IllegalStateException(s"Illegal version line: $versionLine " +
        s"in the streaming metadata path")
    }
  }

  override def schema: StructType = {
    schemaOption.getOrElse {
      val schemaUtil = new TableSchemaResolver(metaClient)
      SchemaConverters.toSqlType(schemaUtil.getTableAvroSchema)
        .dataType.asInstanceOf[StructType]
    }
  }

  /**
   * Get the latest offset from the hoodie table.
   * @return
   */
  override def getOffset: Option[Offset] = {
    metaClient.reloadActiveTimeline()
    val activeInstants = metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants
    if (!activeInstants.empty()) {
      val currentLatestCommitTime = activeInstants.lastInstant().get().getTimestamp
      if (lastOffset == null || currentLatestCommitTime > lastOffset.commitTime) {
        lastOffset = HoodieSourceOffset(currentLatestCommitTime)
      }
    } else { // if there are no active commits, use the init offset
      lastOffset = initialOffsets
    }
    Some(lastOffset)
  }

  override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
    initialOffsets

    val startOffset = start.map(HoodieSourceOffset(_))
      .getOrElse(initialOffsets)
    val endOffset = HoodieSourceOffset(end)

    if (startOffset == endOffset) {
      sqlContext.internalCreateDataFrame(
        sqlContext.sparkContext.emptyRDD[InternalRow].setName("empty"), schema, isStreaming = true)
    } else {
      // Consume the data between (startCommitTime, endCommitTime]
      val incParams = parameters ++ Map(
        DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY -> startCommitTime(startOffset),
        DataSourceReadOptions.END_INSTANTTIME_OPT_KEY -> endOffset.commitTime
      )

      val rdd = tableType match {
        case HoodieTableType.COPY_ON_WRITE =>
          val serDe = HoodieSparkUtils.createRowSerDe(RowEncoder(schema))
          new IncrementalRelation(sqlContext, incParams, schema, metaClient)
            .buildScan()
            .map(serDe.serializeRow)
        case HoodieTableType.MERGE_ON_READ =>
          val requiredColumns = schema.fields.map(_.name)
          new MergeOnReadIncrementalRelation(sqlContext, incParams, schema, metaClient)
            .buildScan(requiredColumns, Array.empty[Filter])
            .asInstanceOf[RDD[InternalRow]]
        case _ => throw new IllegalArgumentException(s"UnSupport tableType: $tableType")
      }
      sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = true)
    }
  }

  private def startCommitTime(startOffset: HoodieSourceOffset): String = {
    startOffset match {
      case INIT_OFFSET => startOffset.commitTime
      case HoodieSourceOffset(commitTime) =>
        val time = HoodieActiveTimeline.COMMIT_FORMATTER.parse(commitTime).getTime
        // As we consume the data between (start, end], start is not included,
        // so we +1s to the start commit time here.
        HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date(time + 1000))
      case _ => throw new IllegalStateException("UnKnow offset type.")
    }
  }

  override def stop(): Unit = {

  }
}

object HoodieStreamSource {
  val VERSION = 1
}
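Per the serialize/deserialize overrides above, an offset file that HDFSMetadataLog writes under metadataPath carries the version on the first line and the offset JSON on the rest, so its content looks roughly like this (the commit time is a placeholder):

    v1
    {"commitTime":"20210101000000"}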
@@ -0,0 +1,154 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.functional

import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.DataSourceWriteOptions.{PRECOMBINE_FIELD_OPT_KEY, RECORDKEY_FIELD_OPT_KEY}
import org.apache.hudi.common.model.HoodieTableType.{COPY_ON_WRITE, MERGE_ON_READ}
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.config.HoodieWriteConfig.{DELETE_PARALLELISM, INSERT_PARALLELISM, TABLE_NAME, UPSERT_PARALLELISM}
import org.apache.spark.sql.streaming.StreamTest
import org.apache.spark.sql.{Row, SaveMode}

class TestStreamingSource extends StreamTest {

  import testImplicits._
  private val commonOptions = Map(
    RECORDKEY_FIELD_OPT_KEY -> "id",
    PRECOMBINE_FIELD_OPT_KEY -> "ts",
    INSERT_PARALLELISM -> "4",
    UPSERT_PARALLELISM -> "4",
    DELETE_PARALLELISM -> "4"
  )
  private val columns = Seq("id", "name", "price", "ts")

  override protected def sparkConf = {
    super.sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  }

  test("test cow stream source") {
    withTempDir { inputDir =>
      val tablePath = s"${inputDir.getCanonicalPath}/test_cow_stream"
      HoodieTableMetaClient.initTableType(spark.sessionState.newHadoopConf(), tablePath,
        COPY_ON_WRITE, getTableName(tablePath), DataSourceWriteOptions.DEFAULT_PAYLOAD_OPT_VAL)

      addData(tablePath, Seq(("1", "a1", "10", "000")))
      val df = spark.readStream
        .format("org.apache.hudi")
        .load(tablePath)
        .select("id", "name", "price", "ts")

      testStream(df)(
        AssertOnQuery { q => q.processAllAvailable(); true },
        CheckAnswerRows(Seq(Row("1", "a1", "10", "000")), lastOnly = true, isSorted = false),
        StopStream,

        addDataToQuery(tablePath, Seq(("1", "a1", "12", "000"))),
        StartStream(),
        AssertOnQuery { q => q.processAllAvailable(); true },
        CheckAnswerRows(Seq(Row("1", "a1", "12", "000")), lastOnly = true, isSorted = false),

        addDataToQuery(tablePath,
          Seq(("2", "a2", "12", "000"),
              ("3", "a3", "12", "000"),
              ("4", "a4", "12", "000"))),
        AssertOnQuery { q => q.processAllAvailable(); true },
        CheckAnswerRows(
          Seq(Row("2", "a2", "12", "000"),
              Row("3", "a3", "12", "000"),
              Row("4", "a4", "12", "000")),
          lastOnly = true, isSorted = false),
        StopStream,

        addDataToQuery(tablePath, Seq(("5", "a5", "12", "000"))),
        addDataToQuery(tablePath, Seq(("6", "a6", "12", "000"))),
        addDataToQuery(tablePath, Seq(("5", "a5", "15", "000"))),
        StartStream(),
        AssertOnQuery { q => q.processAllAvailable(); true },
        CheckAnswerRows(
          Seq(Row("6", "a6", "12", "000"),
              Row("5", "a5", "15", "000")),
          lastOnly = true, isSorted = false)
      )
    }
  }

  test("test mor stream source") {
    withTempDir { inputDir =>
      val tablePath = s"${inputDir.getCanonicalPath}/test_mor_stream"
      HoodieTableMetaClient.initTableType(spark.sessionState.newHadoopConf(), tablePath,
        MERGE_ON_READ, getTableName(tablePath), DataSourceWriteOptions.DEFAULT_PAYLOAD_OPT_VAL)

      addData(tablePath, Seq(("1", "a1", "10", "000")))
      val df = spark.readStream
        .format("org.apache.hudi")
        .load(tablePath)
        .select("id", "name", "price", "ts")

      testStream(df)(
        AssertOnQuery { q => q.processAllAvailable(); true },
        CheckAnswerRows(Seq(Row("1", "a1", "10", "000")), lastOnly = true, isSorted = false),
        StopStream,

        addDataToQuery(tablePath,
          Seq(("2", "a2", "12", "000"),
              ("3", "a3", "12", "000"),
              ("2", "a2", "10", "001"))),
        StartStream(),
        AssertOnQuery { q => q.processAllAvailable(); true },
        CheckAnswerRows(
          Seq(Row("3", "a3", "12", "000"),
              Row("2", "a2", "10", "001")),
          lastOnly = true, isSorted = false),
        StopStream,

        addDataToQuery(tablePath, Seq(("5", "a5", "12", "000"))),
        addDataToQuery(tablePath, Seq(("6", "a6", "12", "000"))),
        StartStream(),
        AssertOnQuery { q => q.processAllAvailable(); true },
        CheckAnswerRows(
          Seq(Row("5", "a5", "12", "000"),
              Row("6", "a6", "12", "000")),
          lastOnly = true, isSorted = false)
      )
    }
  }

  private def addData(inputPath: String, rows: Seq[(String, String, String, String)]): Unit = {
    rows.toDF(columns: _*)
      .write
      .format("org.apache.hudi")
      .options(commonOptions)
      .option(TABLE_NAME, getTableName(inputPath))
      .mode(SaveMode.Append)
      .save(inputPath)
  }

  private def addDataToQuery(inputPath: String,
                             rows: Seq[(String, String, String, String)]): AssertOnQuery = {
    AssertOnQuery { _ =>
      addData(inputPath, rows)
      true
    }
  }

  private def getTableName(inputPath: String): String = {
    val start = inputPath.lastIndexOf('/')
    inputPath.substring(start + 1)
  }
}
@@ -17,14 +17,17 @@
 
 package org.apache.hudi
 
-import org.apache.hudi.client.utils.SparkRowDeserializer
-
+import org.apache.hudi.client.utils.SparkRowSerDe
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 
-class Spark2RowDeserializer(val encoder: ExpressionEncoder[Row]) extends SparkRowDeserializer {
+class Spark2RowSerDe(val encoder: ExpressionEncoder[Row]) extends SparkRowSerDe {
   def deserializeRow(internalRow: InternalRow): Row = {
     encoder.fromRow(internalRow)
   }
+
+  override def serializeRow(row: Row): InternalRow = {
+    encoder.toRow(row)
+  }
 }
@@ -17,17 +17,21 @@
 
 package org.apache.hudi
 
-import org.apache.hudi.client.utils.SparkRowDeserializer
-
+import org.apache.hudi.client.utils.SparkRowSerDe
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 
-class Spark3RowDeserializer(val encoder: ExpressionEncoder[Row]) extends SparkRowDeserializer {
+class Spark3RowSerDe(val encoder: ExpressionEncoder[Row]) extends SparkRowSerDe {
 
   private val deserializer: ExpressionEncoder.Deserializer[Row] = encoder.createDeserializer()
+  private val serializer: ExpressionEncoder.Serializer[Row] = encoder.createSerializer()
 
   def deserializeRow(internalRow: InternalRow): Row = {
     deserializer.apply(internalRow)
   }
+
+  override def serializeRow(row: Row): InternalRow = {
+    serializer.apply(row)
+  }
 }
pom.xml
@@ -527,6 +527,27 @@
       <version>${spark.version}</version>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <classifier>tests</classifier>
+      <version>${spark.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-core_${scala.binary.version}</artifactId>
+      <classifier>tests</classifier>
+      <version>${spark.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-catalyst_${scala.binary.version}</artifactId>
+      <classifier>tests</classifier>
+      <version>${spark.version}</version>
+      <scope>test</scope>
+    </dependency>
 
     <!-- Spark (Packages) -->
     <dependency>