[HUDI-2051] Enable Hive Sync When Spark Enables the Hive Metastore for Spark SQL (#3126)

Author: pengzhiwei
Date: 2021-07-02 16:08:36 +08:00 (committed by GitHub)
Parent: 6eca06d074
Commit: 6403547431
7 changed files with 14 additions and 17 deletions
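
In short: instead of hard-coding meta sync on (insert/merge) or off (update), every Spark SQL command now keys it off Spark's catalog implementation. A minimal sketch of the pattern, with illustrative names (the real check, added to HoodieSqlUtils below, reads StaticSQLConf.CATALOG_IMPLEMENTATION through sessionState, which is accessible from Hudi's org.apache.spark.sql packages):

import org.apache.spark.sql.SparkSession

object HiveSyncGateSketch {
  // True when the session catalog is backed by the Hive metastore;
  // spark.sql.catalogImplementation defaults to "in-memory".
  def isEnableHive(spark: SparkSession): Boolean =
    spark.conf.get("spark.sql.catalogImplementation", "in-memory") == "hive"

  // Illustrative stand-in for how each command now builds its write options:
  // meta sync follows the catalog instead of a hard-coded flag.
  def syncOptions(spark: SparkSession): Map[String, String] = {
    val enableHive = isEnableHive(spark)
    Map(
      "hoodie.datasource.meta.sync.enable" -> enableHive.toString, // META_SYNC_ENABLED_OPT_KEY
      "hoodie.datasource.hive_sync.use_jdbc" -> "false"            // HIVE_USE_JDBC_OPT_KEY
    )
  }
}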

File: HoodieSqlUtils.scala

@@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
import org.apache.spark.sql.catalyst.expressions.{And, Attribute, Cast, Expression, Literal}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.execution.datasources.LogicalRelation
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
import org.apache.spark.sql.types.{DataType, NullType, StringType, StructField, StructType}
import scala.collection.immutable.Map
@@ -171,10 +171,6 @@ object HoodieSqlUtils extends SparkAdapterSupport {
/**
* Append the SparkSession config and table options to the baseConfig.
* We add the "spark" prefix to hoodie's config key.
-* @param spark
-* @param options
-* @param baseConfig
-* @return
*/
def withSparkConf(spark: SparkSession, options: Map[String, String])
(baseConfig: Map[String, String]): Map[String, String] = {
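
An aside on withSparkConf (not part of the diff): the scaladoc above says the session config is consulted under a "spark."-prefixed copy of each hoodie key. A hedged reconstruction of that layering, my reading rather than Hudi's exact code:

import org.apache.spark.sql.SparkSession

// Sketch: for each hoodie key in baseConfig, consult the session conf under
// "spark." + key; session values override the base, table options override both.
def withSparkConfSketch(spark: SparkSession, options: Map[String, String])
                       (baseConfig: Map[String, String]): Map[String, String] = {
  val fromSession = baseConfig.keys.flatMap { k =>
    spark.conf.getOption("spark." + k).map(k -> _)
  }.toMap
  baseConfig ++ fromSession ++ options
}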
@@ -184,4 +180,7 @@ object HoodieSqlUtils extends SparkAdapterSupport {
}
def isSpark3: Boolean = SPARK_VERSION.startsWith("3.")
+def isEnableHive(sparkSession: SparkSession): Boolean =
+  "hive" == sparkSession.sessionState.conf.getConf(StaticSQLConf.CATALOG_IMPLEMENTATION)
}
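
For orientation (not part of the diff): isEnableHive returns true exactly for sessions created with Hive support, because enableHiveSupport() sets spark.sql.catalogImplementation to "hive". A minimal sketch, assuming Spark's Hive classes are on the classpath:

import org.apache.spark.sql.SparkSession

// enableHiveSupport() sets spark.sql.catalogImplementation to "hive";
// it throws if the Hive classes are missing from the classpath.
val spark = SparkSession.builder()
  .master("local[1]")
  .appName("hive-backed session")
  .enableHiveSupport()
  .getOrCreate()
// isEnableHive(spark) from the hunk above now returns true.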

File: CreateHoodieTableCommand.scala

@@ -155,7 +155,7 @@ case class CreateHoodieTableCommand(table: CatalogTable, ignoreIfExists: Boolean
validateTable(newTable)
// Create table in the catalog
-val enableHive = "hive" == sessionState.conf.getConf(StaticSQLConf.CATALOG_IMPLEMENTATION)
+val enableHive = isEnableHive(sparkSession)
if (enableHive) {
createHiveDataSourceTable(newTable, sparkSession)
} else {

File: DeleteHoodieTableCommand.scala

@@ -66,10 +66,9 @@ case class DeleteHoodieTableCommand(deleteTable: DeleteFromTable) extends RunnableCommand
assert(primaryColumns.nonEmpty,
s"There is no primary key defined in table $tableId, cannot execute the delete operation")
withSparkConf(sparkSession, targetTable.storage.properties) {
Map(
-"path" -> path.toString,
+"path" -> path,
KEYGENERATOR_CLASS_OPT_KEY.key -> classOf[SqlKeyGenerator].getCanonicalName,
TABLE_NAME.key -> tableId.table,
OPERATION_OPT_KEY.key -> DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL,

File: InsertIntoHoodieTableCommand.scala

@@ -18,7 +18,6 @@
package org.apache.spark.sql.hudi.command
import java.util.Properties
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericRecord, IndexedRecord}
import org.apache.hudi.common.model.{DefaultHoodieRecordPayload, HoodieRecord}
@@ -232,7 +231,7 @@ object InsertIntoHoodieTableCommand {
} else {
classOf[DefaultHoodieRecordPayload].getCanonicalName
}
+val enableHive = isEnableHive(sparkSession)
withSparkConf(sparkSession, options) {
Map(
"path" -> path,
@@ -244,7 +243,7 @@ object InsertIntoHoodieTableCommand {
RECORDKEY_FIELD_OPT_KEY.key -> primaryColumns.mkString(","),
PARTITIONPATH_FIELD_OPT_KEY.key -> partitionFields,
PAYLOAD_CLASS_OPT_KEY.key -> payloadClassName,
-META_SYNC_ENABLED_OPT_KEY.key -> "true",
+META_SYNC_ENABLED_OPT_KEY.key -> enableHive.toString,
HIVE_USE_JDBC_OPT_KEY.key -> "false",
HIVE_DATABASE_OPT_KEY.key -> table.identifier.database.getOrElse("default"),
HIVE_TABLE_OPT_KEY.key -> table.identifier.table,

File: MergeIntoHoodieTableCommand.scala

@@ -18,7 +18,6 @@
package org.apache.spark.sql.hudi.command
import java.util.Base64
import org.apache.avro.Schema
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig
@@ -426,7 +425,8 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends RunnableCommand
throw new IllegalArgumentException(s"Merge Key[${targetKey2SourceExpression.keySet.mkString(",")}] is not" +
s" Equal to the defined primary key[${definedPk.mkString(",")}] in table $targetTableName")
}
+// Enable hive sync by default if Spark has enabled the hive metastore.
+val enableHive = isEnableHive(sparkSession)
HoodieWriterUtils.parametersWithWriteDefaults(
withSparkConf(sparkSession, options) {
Map(
@@ -437,7 +437,7 @@ case class MergeIntoHoodieTableCommand(mergeInto: MergeIntoTable) extends RunnableCommand
TABLE_NAME.key -> targetTableName,
PARTITIONPATH_FIELD_OPT_KEY.key -> targetTable.partitionColumnNames.mkString(","),
PAYLOAD_CLASS_OPT_KEY.key -> classOf[ExpressionPayload].getCanonicalName,
-META_SYNC_ENABLED_OPT_KEY.key -> "true",
+META_SYNC_ENABLED_OPT_KEY.key -> enableHive.toString,
HIVE_USE_JDBC_OPT_KEY.key -> "false",
HIVE_DATABASE_OPT_KEY.key -> targetTableDb,
HIVE_TABLE_OPT_KEY.key -> targetTableName,

File: UpdateHoodieTableCommand.scala

@@ -93,16 +93,17 @@ case class UpdateHoodieTableCommand(updateTable: UpdateTable) extends RunnableCommand
assert(primaryColumns.nonEmpty,
s"There is no primary key defined in table $tableId, cannot execute the update operation")
+val enableHive = isEnableHive(sparkSession)
withSparkConf(sparkSession, targetTable.storage.properties) {
Map(
-"path" -> path.toString,
+"path" -> path,
RECORDKEY_FIELD_OPT_KEY.key -> primaryColumns.mkString(","),
KEYGENERATOR_CLASS_OPT_KEY.key -> classOf[SqlKeyGenerator].getCanonicalName,
PRECOMBINE_FIELD_OPT_KEY.key -> primaryColumns.head, //set the default preCombine field.
TABLE_NAME.key -> tableId.table,
OPERATION_OPT_KEY.key -> DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL,
PARTITIONPATH_FIELD_OPT_KEY.key -> targetTable.partitionColumnNames.mkString(","),
-META_SYNC_ENABLED_OPT_KEY.key -> "false", // TODO make the meta sync enable by default.
+META_SYNC_ENABLED_OPT_KEY.key -> enableHive.toString,
HIVE_USE_JDBC_OPT_KEY.key -> "false",
HIVE_DATABASE_OPT_KEY.key -> tableId.database.getOrElse("default"),
HIVE_TABLE_OPT_KEY.key -> tableId.table,

File: TestHoodieSqlBase.scala

@@ -37,7 +37,6 @@ class TestHoodieSqlBase extends FunSuite with BeforeAndAfterAll {
.appName("hoodie sql test")
.withExtensions(new HoodieSparkSessionExtension)
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
-.config("hoodie.datasource.meta.sync.enable", "false")
.config("hoodie.insert.shuffle.parallelism", "4")
.config("hoodie.upsert.shuffle.parallelism", "4")
.config("hoodie.delete.shuffle.parallelism", "4")
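
The removed meta.sync.enable line above is now redundant: the suite builds its session without enableHiveSupport(), so the catalog implementation stays "in-memory" and the commands derive META_SYNC_ENABLED as "false" on their own. An illustrative check, assuming the builder above is bound to spark:

// Not part of the suite: the default catalog is in-memory, so
// isEnableHive(spark) is false and meta sync stays disabled without
// the removed config line.
assert(spark.conf.get("spark.sql.catalogImplementation", "in-memory") == "in-memory")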