1
0

[HUDI-764] [HUDI-765] ORC reader writer Implementation (#2999)

Co-authored-by: Qingyun (Teresa) Kang <kteresa@uber.com>
This commit is contained in:
Jintao Guan
2021-06-15 15:21:43 -07:00
committed by GitHub
parent cb642ceb75
commit b8fe5b91d5
29 changed files with 2268 additions and 91 deletions

View File

@@ -19,7 +19,7 @@ package org.apache.hudi
import org.apache.hadoop.fs.Path
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.common.model.HoodieRecord
import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord}
import org.apache.hudi.DataSourceWriteOptions.{BOOTSTRAP_OPERATION_OPT_VAL, OPERATION_OPT_KEY}
import org.apache.hudi.common.fs.FSUtils
import org.apache.hudi.common.model.HoodieTableType.{COPY_ON_WRITE, MERGE_ON_READ}
@@ -28,6 +28,7 @@ import org.apache.hudi.exception.HoodieException
import org.apache.hudi.hadoop.HoodieROTablePathFilter
import org.apache.log4j.LogManager
import org.apache.spark.sql.execution.datasources.{DataSource, FileStatusCache, HadoopFsRelation}
import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.execution.streaming.{Sink, Source}
import org.apache.spark.sql.hudi.streaming.HoodieStreamSource
@@ -186,6 +187,10 @@ class DefaultSource extends RelationProvider
extraReadPaths: Seq[String],
metaClient: HoodieTableMetaClient): BaseRelation = {
log.info("Loading Base File Only View with options :" + optParams)
// Resolve the table's configured base file format into the pair used further down:
// the Spark file format instance handed over as `fileFormat`, and the data source
// short name handed to `DataSource` as `className`.
val (tableFileFormat, formatClassName) = metaClient.getTableConfig.getBaseFileFormat match {
  case HoodieFileFormat.PARQUET => (new ParquetFileFormat, "parquet")
  case HoodieFileFormat.ORC => (new OrcFileFormat, "orc")
  // Only parquet and orc are mapped here. Without this fallback any other value
  // would surface as an opaque scala.MatchError, so fail with an explicit message.
  case unsupported =>
    throw new HoodieException(
      s"Base file format $unsupported is not supported when loading the Base File Only View")
}
if (useHoodieFileIndex) {
@@ -198,7 +203,7 @@ class DefaultSource extends RelationProvider
fileIndex.partitionSchema,
fileIndex.dataSchema,
bucketSpec = None,
fileFormat = new ParquetFileFormat,
fileFormat = tableFileFormat,
optParams)(sqlContext.sparkSession)
} else {
// this is just effectively RO view only, where `path` can contain a mix of
@@ -208,12 +213,12 @@ class DefaultSource extends RelationProvider
classOf[HoodieROTablePathFilter],
classOf[org.apache.hadoop.fs.PathFilter])
// simply return as a regular parquet relation
// simply return as a regular relation
DataSource.apply(
sparkSession = sqlContext.sparkSession,
paths = extraReadPaths,
userSpecifiedSchema = Option(schema),
className = "parquet",
className = formatClassName,
options = optParams)
.resolveRelation()
}