[HUDI-4081][HUDI-4472] Addressing Spark SQL vs Spark DS performance gap (#6213)
This commit is contained in:
@@ -22,7 +22,7 @@ import org.apache.avro.generic.GenericRecord
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.hudi.DataSourceWriteOptions._
|
||||
import org.apache.hudi.HoodieConversionUtils.toProperties
|
||||
import org.apache.hudi.HoodieConversionUtils.{toProperties, toScalaOption}
|
||||
import org.apache.hudi.HoodieWriterUtils._
|
||||
import org.apache.hudi.avro.HoodieAvroUtils
|
||||
import org.apache.hudi.client.{HoodieWriteResult, SparkRDDWriteClient}
|
||||
@@ -31,7 +31,7 @@ import org.apache.hudi.common.fs.FSUtils
|
||||
import org.apache.hudi.common.model._
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline
|
||||
import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient, TableSchemaResolver}
|
||||
import org.apache.hudi.common.util.{CommitUtils, StringUtils}
|
||||
import org.apache.hudi.common.util.{CommitUtils, Functions, StringUtils}
|
||||
import org.apache.hudi.config.HoodieBootstrapConfig.{BASE_PATH, INDEX_CLASS_NAME}
|
||||
import org.apache.hudi.config.{HoodieInternalConfig, HoodieWriteConfig}
|
||||
import org.apache.hudi.exception.HoodieException
|
||||
@@ -72,8 +72,7 @@ object HoodieSparkSqlWriter {
|
||||
hoodieTableConfigOpt: Option[HoodieTableConfig] = Option.empty,
|
||||
hoodieWriteClient: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]] = Option.empty,
|
||||
asyncCompactionTriggerFn: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]] => Unit] = Option.empty,
|
||||
asyncClusteringTriggerFn: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]] => Unit] = Option.empty
|
||||
)
|
||||
asyncClusteringTriggerFn: Option[SparkRDDWriteClient[HoodieRecordPayload[Nothing]] => Unit] = Option.empty)
|
||||
: (Boolean, common.util.Option[String], common.util.Option[String], common.util.Option[String],
|
||||
SparkRDDWriteClient[HoodieRecordPayload[Nothing]], HoodieTableConfig) = {
|
||||
|
||||
@@ -241,39 +240,49 @@ object HoodieSparkSqlWriter {
|
||||
sparkContext.getConf.registerKryoClasses(
|
||||
Array(classOf[org.apache.avro.generic.GenericData],
|
||||
classOf[org.apache.avro.Schema]))
|
||||
var schema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace)
|
||||
val lastestSchema = getLatestTableSchema(fs, basePath, sparkContext, schema)
|
||||
var internalSchemaOpt = getLatestTableInternalSchema(fs, basePath, sparkContext)
|
||||
if (reconcileSchema && parameters.getOrDefault(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key(), "false").toBoolean
|
||||
&& internalSchemaOpt.isEmpty) {
|
||||
// force apply full schema evolution.
|
||||
internalSchemaOpt = Some(AvroInternalSchemaConverter.convert(schema))
|
||||
}
|
||||
if (reconcileSchema) {
|
||||
schema = lastestSchema
|
||||
}
|
||||
if (internalSchemaOpt.isDefined) {
|
||||
// Apply schema evolution.
|
||||
val mergedSparkSchema = if (!reconcileSchema) {
|
||||
AvroConversionUtils.convertAvroSchemaToStructType(AvroSchemaEvolutionUtils.canonicalizeColumnNullability(schema, lastestSchema))
|
||||
} else {
|
||||
// Auto merge write schema and read schema.
|
||||
val mergedInternalSchema = AvroSchemaEvolutionUtils.reconcileSchema(schema, internalSchemaOpt.get)
|
||||
AvroConversionUtils.convertAvroSchemaToStructType(AvroInternalSchemaConverter.convert(mergedInternalSchema, lastestSchema.getName))
|
||||
}
|
||||
schema = AvroConversionUtils.convertStructTypeToAvroSchema(mergedSparkSchema, structName, nameSpace)
|
||||
}
|
||||
|
||||
if (reconcileSchema && internalSchemaOpt.isEmpty) {
|
||||
schema = lastestSchema
|
||||
}
|
||||
validateSchemaForHoodieIsDeleted(schema)
|
||||
sparkContext.getConf.registerAvroSchemas(schema)
|
||||
log.info(s"Registered avro schema : ${schema.toString(true)}")
|
||||
// TODO(HUDI-4472) revisit and simplify schema handling
|
||||
val sourceSchema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, nameSpace)
|
||||
val latestTableSchema = getLatestTableSchema(fs, basePath, sparkContext).getOrElse(sourceSchema)
|
||||
|
||||
val schemaEvolutionEnabled = parameters.getOrDefault(DataSourceReadOptions.SCHEMA_EVOLUTION_ENABLED.key(), "false").toBoolean
|
||||
var internalSchemaOpt = getLatestTableInternalSchema(fs, basePath, sparkContext)
|
||||
|
||||
val writerSchema: Schema =
|
||||
if (reconcileSchema) {
|
||||
// In case we need to reconcile the schema and schema evolution is enabled,
|
||||
// we will force-apply schema evolution to the writer's schema
|
||||
if (schemaEvolutionEnabled && internalSchemaOpt.isEmpty) {
|
||||
internalSchemaOpt = Some(AvroInternalSchemaConverter.convert(sourceSchema))
|
||||
}
|
||||
|
||||
if (internalSchemaOpt.isDefined) {
|
||||
// Apply schema evolution, by auto-merging write schema and read schema
|
||||
val mergedInternalSchema = AvroSchemaEvolutionUtils.reconcileSchema(sourceSchema, internalSchemaOpt.get)
|
||||
AvroInternalSchemaConverter.convert(mergedInternalSchema, latestTableSchema.getName)
|
||||
} else if (TableSchemaResolver.isSchemaCompatible(sourceSchema, latestTableSchema)) {
|
||||
// In case schema reconciliation is enabled and source and latest table schemas
|
||||
// are compatible (as defined by [[TableSchemaResolver#isSchemaCompatible]], then we will
|
||||
// pick latest table's schema as the writer's schema
|
||||
latestTableSchema
|
||||
} else {
|
||||
// Otherwise fallback to original source's schema
|
||||
sourceSchema
|
||||
}
|
||||
} else {
|
||||
// In case reconciliation is disabled, we still have to do nullability attributes
|
||||
// (minor) reconciliation, making sure schema of the incoming batch is in-line with
|
||||
// the data already committed in the table
|
||||
AvroSchemaEvolutionUtils.canonicalizeColumnNullability(sourceSchema, latestTableSchema)
|
||||
}
|
||||
|
||||
validateSchemaForHoodieIsDeleted(writerSchema)
|
||||
sparkContext.getConf.registerAvroSchemas(writerSchema)
|
||||
log.info(s"Registered avro schema : ${writerSchema.toString(true)}")
|
||||
|
||||
// Convert to RDD[HoodieRecord]
|
||||
val genericRecords: RDD[GenericRecord] = HoodieSparkUtils.createRdd(df, structName, nameSpace, reconcileSchema,
|
||||
org.apache.hudi.common.util.Option.of(schema))
|
||||
org.apache.hudi.common.util.Option.of(writerSchema))
|
||||
val shouldCombine = parameters(INSERT_DROP_DUPS.key()).toBoolean ||
|
||||
operation.equals(WriteOperationType.UPSERT) ||
|
||||
parameters.getOrElse(HoodieWriteConfig.COMBINE_BEFORE_INSERT.key(),
|
||||
@@ -295,10 +304,10 @@ object HoodieSparkSqlWriter {
|
||||
hoodieRecord
|
||||
}).toJavaRDD()
|
||||
|
||||
val writeSchema = if (dropPartitionColumns) generateSchemaWithoutPartitionColumns(partitionColumns, schema) else schema
|
||||
val writerDataSchema = if (dropPartitionColumns) generateSchemaWithoutPartitionColumns(partitionColumns, writerSchema) else writerSchema
|
||||
// Create a HoodieWriteClient & issue the write.
|
||||
|
||||
val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, writeSchema.toString, path,
|
||||
val client = hoodieWriteClient.getOrElse(DataSourceUtils.createHoodieClient(jsc, writerDataSchema.toString, path,
|
||||
tblName, mapAsJavaMap(addSchemaEvolutionParameters(parameters, internalSchemaOpt) - HoodieWriteConfig.AUTO_COMMIT_ENABLE.key)
|
||||
)).asInstanceOf[SparkRDDWriteClient[HoodieRecordPayload[Nothing]]]
|
||||
|
||||
@@ -388,14 +397,18 @@ object HoodieSparkSqlWriter {
|
||||
* @param schema incoming record's schema.
|
||||
* @return Pair of(boolean, table schema), where first entry will be true only if schema conversion is required.
|
||||
*/
|
||||
def getLatestTableSchema(fs: FileSystem, basePath: Path, sparkContext: SparkContext, schema: Schema): Schema = {
|
||||
var latestSchema: Schema = schema
|
||||
def getLatestTableSchema(fs: FileSystem, basePath: Path, sparkContext: SparkContext): Option[Schema] = {
|
||||
if (FSUtils.isTableExists(basePath.toString, fs)) {
|
||||
val tableMetaClient = HoodieTableMetaClient.builder.setConf(sparkContext.hadoopConfiguration).setBasePath(basePath.toString).build()
|
||||
val tableMetaClient = HoodieTableMetaClient.builder
|
||||
.setConf(sparkContext.hadoopConfiguration)
|
||||
.setBasePath(basePath.toString)
|
||||
.build()
|
||||
val tableSchemaResolver = new TableSchemaResolver(tableMetaClient)
|
||||
latestSchema = tableSchemaResolver.getLatestSchema(schema, false, null)
|
||||
|
||||
toScalaOption(tableSchemaResolver.getTableAvroSchemaFromLatestCommit(false))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
latestSchema
|
||||
}
|
||||
|
||||
def registerKryoClassesAndGetGenericRecords(tblName: String, sparkContext: SparkContext, df: Dataset[Row],
|
||||
|
||||
@@ -317,8 +317,8 @@ object HoodieSqlCommonUtils extends SparkAdapterSupport {
|
||||
def castIfNeeded(child: Expression, dataType: DataType, conf: SQLConf): Expression = {
|
||||
child match {
|
||||
case Literal(nul, NullType) => Literal(nul, dataType)
|
||||
case _ => if (child.dataType != dataType)
|
||||
Cast(child, dataType, Option(conf.sessionLocalTimeZone)) else child
|
||||
case expr if child.dataType != dataType => Cast(expr, dataType, Option(conf.sessionLocalTimeZone))
|
||||
case _ => child
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user