[HUDI-2161] Adding support to disable meta columns with bulk insert operation (#3247)
This commit is contained in:
committed by
GitHub
parent
2099bf41db
commit
d5026e9a24
@@ -18,12 +18,6 @@
|
||||
|
||||
package org.apache.hudi;
|
||||
|
||||
import static org.apache.spark.sql.functions.callUDF;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
@@ -41,8 +35,17 @@ import org.apache.spark.sql.api.java.UDF1;
|
||||
import org.apache.spark.sql.functions;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import scala.collection.JavaConverters;
|
||||
|
||||
import static org.apache.spark.sql.functions.callUDF;
|
||||
|
||||
/**
|
||||
* Helper class to assist in preparing {@link Dataset<Row>}s for bulk insert with datasource implementation.
|
||||
*/
|
||||
@@ -112,4 +115,40 @@ public class HoodieDatasetBulkInsertHelper {
|
||||
|
||||
return bulkInsertPartitionerRows.repartitionRecords(colOrderedDataset, config.getBulkInsertShuffleParallelism());
|
||||
}
|
||||
|
||||
/**
|
||||
* Add empty meta fields and reorder such that meta fields are at the beginning.
|
||||
*
|
||||
* @param rows
|
||||
* @return
|
||||
*/
|
||||
public static Dataset<Row> prepareHoodieDatasetForBulkInsertWithoutMetaFields(Dataset<Row> rows) {
|
||||
// add empty meta cols.
|
||||
Dataset<Row> rowsWithMetaCols = rows
|
||||
.withColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD,
|
||||
functions.lit("").cast(DataTypes.StringType))
|
||||
.withColumn(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD,
|
||||
functions.lit("").cast(DataTypes.StringType))
|
||||
.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD,
|
||||
functions.lit("").cast(DataTypes.StringType))
|
||||
.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD,
|
||||
functions.lit("").cast(DataTypes.StringType))
|
||||
.withColumn(HoodieRecord.FILENAME_METADATA_FIELD,
|
||||
functions.lit("").cast(DataTypes.StringType));
|
||||
|
||||
List<Column> originalFields =
|
||||
Arrays.stream(rowsWithMetaCols.schema().fields()).filter(field -> !field.name().contains("_hoodie_")).map(f -> new Column(f.name())).collect(Collectors.toList());
|
||||
|
||||
List<Column> metaFields =
|
||||
Arrays.stream(rowsWithMetaCols.schema().fields()).filter(field -> field.name().contains("_hoodie_")).map(f -> new Column(f.name())).collect(Collectors.toList());
|
||||
|
||||
// reorder such that all meta columns are at the beginning followed by original columns
|
||||
List<Column> allCols = new ArrayList<>();
|
||||
allCols.addAll(metaFields);
|
||||
allCols.addAll(originalFields);
|
||||
|
||||
return rowsWithMetaCols.select(
|
||||
JavaConverters.collectionAsScalaIterableConverter(allCols).asScala().toSeq());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -32,7 +32,7 @@ import org.apache.hudi.common.util.{CommitUtils, ReflectionUtils}
|
||||
import org.apache.hudi.config.HoodieBootstrapConfig.{BOOTSTRAP_BASE_PATH_PROP, BOOTSTRAP_INDEX_CLASS_PROP}
|
||||
import org.apache.hudi.config.{HoodieInternalConfig, HoodieWriteConfig}
|
||||
import org.apache.hudi.exception.HoodieException
|
||||
import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerWithRowsFactory
|
||||
import org.apache.hudi.execution.bulkinsert.{BulkInsertInternalPartitionerWithRowsFactory, NonSortPartitionerWithRows}
|
||||
import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool}
|
||||
import org.apache.hudi.index.SparkHoodieIndex
|
||||
import org.apache.hudi.internal.DataSourceInternalWriterHelper
|
||||
@@ -128,6 +128,7 @@ object HoodieSparkSqlWriter {
|
||||
.setPayloadClassName(hoodieConfig.getString(PAYLOAD_CLASS_OPT_KEY))
|
||||
.setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
|
||||
.setPartitionColumns(partitionColumns)
|
||||
.setPopulateMetaFields(parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean)
|
||||
.initTable(sparkContext.hadoopConfiguration, path.get)
|
||||
tableConfig = tableMetaClient.getTableConfig
|
||||
}
|
||||
@@ -139,7 +140,8 @@ object HoodieSparkSqlWriter {
|
||||
if (hoodieConfig.getBoolean(ENABLE_ROW_WRITER_OPT_KEY) &&
|
||||
operation == WriteOperationType.BULK_INSERT) {
|
||||
val (success, commitTime: common.util.Option[String]) = bulkInsertAsRow(sqlContext, parameters, df, tblName,
|
||||
basePath, path, instantTime)
|
||||
basePath, path, instantTime, parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(),
|
||||
HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean)
|
||||
return (success, commitTime, common.util.Option.empty(), common.util.Option.empty(), hoodieWriteClient.orNull, tableConfig)
|
||||
}
|
||||
// scalastyle:on
|
||||
@@ -330,7 +332,8 @@ object HoodieSparkSqlWriter {
|
||||
tblName: String,
|
||||
basePath: Path,
|
||||
path: Option[String],
|
||||
instantTime: String): (Boolean, common.util.Option[String]) = {
|
||||
instantTime: String,
|
||||
populateMetaFields: Boolean): (Boolean, common.util.Option[String]) = {
|
||||
val sparkContext = sqlContext.sparkContext
|
||||
// register classes & schemas
|
||||
val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tblName)
|
||||
@@ -345,22 +348,36 @@ object HoodieSparkSqlWriter {
|
||||
}
|
||||
val params = parameters.updated(HoodieWriteConfig.AVRO_SCHEMA.key, schema.toString)
|
||||
val writeConfig = DataSourceUtils.createHoodieConfig(schema.toString, path.get, tblName, mapAsJavaMap(params))
|
||||
val userDefinedBulkInsertPartitionerOpt = DataSourceUtils.createUserDefinedBulkInsertPartitionerWithRows(writeConfig)
|
||||
val bulkInsertPartitionerRows : BulkInsertPartitioner[Dataset[Row]] = if (userDefinedBulkInsertPartitionerOpt.isPresent) {
|
||||
userDefinedBulkInsertPartitionerOpt.get
|
||||
}
|
||||
else {
|
||||
BulkInsertInternalPartitionerWithRowsFactory.get(writeConfig.getBulkInsertSortMode)
|
||||
val bulkInsertPartitionerRows : BulkInsertPartitioner[Dataset[Row]] = if (populateMetaFields) {
|
||||
val userDefinedBulkInsertPartitionerOpt = DataSourceUtils.createUserDefinedBulkInsertPartitionerWithRows(writeConfig)
|
||||
if (userDefinedBulkInsertPartitionerOpt.isPresent) {
|
||||
userDefinedBulkInsertPartitionerOpt.get
|
||||
}
|
||||
else {
|
||||
BulkInsertInternalPartitionerWithRowsFactory.get(writeConfig.getBulkInsertSortMode)
|
||||
}
|
||||
} else {
|
||||
// Sort modes are not yet supported when meta fields are disabled
|
||||
new NonSortPartitionerWithRows()
|
||||
}
|
||||
val arePartitionRecordsSorted = bulkInsertPartitionerRows.arePartitionRecordsSorted();
|
||||
parameters.updated(HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED, arePartitionRecordsSorted.toString)
|
||||
val isGlobalIndex = SparkHoodieIndex.isGlobalIndex(writeConfig)
|
||||
val hoodieDF = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, writeConfig, df, structName, nameSpace,
|
||||
bulkInsertPartitionerRows, isGlobalIndex)
|
||||
val isGlobalIndex = if (populateMetaFields) {
|
||||
SparkHoodieIndex.isGlobalIndex(writeConfig)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
val hoodieDF = if (populateMetaFields) {
|
||||
HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, writeConfig, df, structName, nameSpace,
|
||||
bulkInsertPartitionerRows, isGlobalIndex)
|
||||
} else {
|
||||
HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsertWithoutMetaFields(df)
|
||||
}
|
||||
if (SPARK_VERSION.startsWith("2.")) {
|
||||
hoodieDF.write.format("org.apache.hudi.internal")
|
||||
.option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime)
|
||||
.options(params)
|
||||
.mode(SaveMode.Append)
|
||||
.save()
|
||||
} else if (SPARK_VERSION.startsWith("3.")) {
|
||||
hoodieDF.write.format("org.apache.hudi.spark3.internal")
|
||||
|
||||
@@ -1,206 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi
|
||||
|
||||
import org.apache.avro.Schema
|
||||
import org.apache.avro.generic.GenericRecord
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.hudi.common.model.HoodieRecord
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
import org.apache.spark.sql.avro.SchemaConverters
|
||||
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal}
|
||||
import org.apache.spark.sql.sources.{And, EqualNullSafe, EqualTo, Filter, GreaterThan, GreaterThanOrEqual, In, IsNotNull, IsNull, LessThan, LessThanOrEqual, Not, Or, StringContains, StringEndsWith, StringStartsWith}
|
||||
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
||||
import org.apache.spark.sql.execution.datasources.{FileStatusCache, InMemoryFileIndex}
|
||||
import org.apache.spark.sql.types.{StringType, StructField, StructType}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
||||
object HoodieSparkUtils extends SparkAdapterSupport {
|
||||
|
||||
def getMetaSchema: StructType = {
|
||||
StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => {
|
||||
StructField(col, StringType, nullable = true)
|
||||
}))
|
||||
}
|
||||
|
||||
/**
|
||||
* This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]].
|
||||
* [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally.
|
||||
*/
|
||||
def isGlobPath(pattern: Path): Boolean = {
|
||||
pattern.toString.exists("{}[]*?\\".toSet.contains)
|
||||
}
|
||||
|
||||
/**
|
||||
* This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]].
|
||||
* [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally.
|
||||
*/
|
||||
def globPath(fs: FileSystem, pattern: Path): Seq[Path] = {
|
||||
Option(fs.globStatus(pattern)).map { statuses =>
|
||||
statuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq
|
||||
}.getOrElse(Seq.empty[Path])
|
||||
}
|
||||
|
||||
/**
|
||||
* This method copied from [[org.apache.spark.deploy.SparkHadoopUtil]].
|
||||
* [[org.apache.spark.deploy.SparkHadoopUtil]] becomes private since Spark 3.0.0 and hence we had to copy it locally.
|
||||
*/
|
||||
def globPathIfNecessary(fs: FileSystem, pattern: Path): Seq[Path] = {
|
||||
if (isGlobPath(pattern)) globPath(fs, pattern) else Seq(pattern)
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks to see whether input path contains a glob pattern and if yes, maps it to a list of absolute paths
|
||||
* which match the glob pattern. Otherwise, returns original path
|
||||
*
|
||||
* @param paths List of absolute or globbed paths
|
||||
* @param fs File system
|
||||
* @return list of absolute file paths
|
||||
*/
|
||||
def checkAndGlobPathIfNecessary(paths: Seq[String], fs: FileSystem): Seq[Path] = {
|
||||
paths.flatMap(path => {
|
||||
val qualified = new Path(path).makeQualified(fs.getUri, fs.getWorkingDirectory)
|
||||
val globPaths = globPathIfNecessary(fs, qualified)
|
||||
globPaths
|
||||
})
|
||||
}
|
||||
|
||||
def createInMemoryFileIndex(sparkSession: SparkSession, globbedPaths: Seq[Path]): InMemoryFileIndex = {
|
||||
val fileStatusCache = FileStatusCache.getOrCreate(sparkSession)
|
||||
new InMemoryFileIndex(sparkSession, globbedPaths, Map(), Option.empty, fileStatusCache)
|
||||
}
|
||||
|
||||
def createRdd(df: DataFrame, structName: String, recordNamespace: String): RDD[GenericRecord] = {
|
||||
val avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(df.schema, structName, recordNamespace)
|
||||
createRdd(df, avroSchema, structName, recordNamespace)
|
||||
}
|
||||
|
||||
def createRdd(df: DataFrame, avroSchema: Schema, structName: String, recordNamespace: String)
|
||||
: RDD[GenericRecord] = {
|
||||
// Use the Avro schema to derive the StructType which has the correct nullability information
|
||||
val dataType = SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType]
|
||||
val encoder = RowEncoder.apply(dataType).resolveAndBind()
|
||||
val deserializer = sparkAdapter.createSparkRowSerDe(encoder)
|
||||
df.queryExecution.toRdd.map(row => deserializer.deserializeRow(row))
|
||||
.mapPartitions { records =>
|
||||
if (records.isEmpty) Iterator.empty
|
||||
else {
|
||||
val convertor = AvroConversionHelper.createConverterToAvro(dataType, structName, recordNamespace)
|
||||
records.map { x => convertor(x).asInstanceOf[GenericRecord] }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert Filters to Catalyst Expressions and joined by And. If convert success return an
|
||||
* Non-Empty Option[Expression],or else return None.
|
||||
*/
|
||||
def convertToCatalystExpressions(filters: Array[Filter],
|
||||
tableSchema: StructType): Option[Expression] = {
|
||||
val expressions = filters.map(convertToCatalystExpression(_, tableSchema))
|
||||
if (expressions.forall(p => p.isDefined)) {
|
||||
if (expressions.isEmpty) {
|
||||
None
|
||||
} else if (expressions.length == 1) {
|
||||
expressions(0)
|
||||
} else {
|
||||
Some(expressions.map(_.get).reduce(org.apache.spark.sql.catalyst.expressions.And))
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert Filter to Catalyst Expression. If convert success return an Non-Empty
|
||||
* Option[Expression],or else return None.
|
||||
*/
|
||||
def convertToCatalystExpression(filter: Filter, tableSchema: StructType): Option[Expression] = {
|
||||
Option(
|
||||
filter match {
|
||||
case EqualTo(attribute, value) =>
|
||||
org.apache.spark.sql.catalyst.expressions.EqualTo(toAttribute(attribute, tableSchema), Literal.create(value))
|
||||
case EqualNullSafe(attribute, value) =>
|
||||
org.apache.spark.sql.catalyst.expressions.EqualNullSafe(toAttribute(attribute, tableSchema), Literal.create(value))
|
||||
case GreaterThan(attribute, value) =>
|
||||
org.apache.spark.sql.catalyst.expressions.GreaterThan(toAttribute(attribute, tableSchema), Literal.create(value))
|
||||
case GreaterThanOrEqual(attribute, value) =>
|
||||
org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual(toAttribute(attribute, tableSchema), Literal.create(value))
|
||||
case LessThan(attribute, value) =>
|
||||
org.apache.spark.sql.catalyst.expressions.LessThan(toAttribute(attribute, tableSchema), Literal.create(value))
|
||||
case LessThanOrEqual(attribute, value) =>
|
||||
org.apache.spark.sql.catalyst.expressions.LessThanOrEqual(toAttribute(attribute, tableSchema), Literal.create(value))
|
||||
case In(attribute, values) =>
|
||||
val attrExp = toAttribute(attribute, tableSchema)
|
||||
val valuesExp = values.map(v => Literal.create(v))
|
||||
org.apache.spark.sql.catalyst.expressions.In(attrExp, valuesExp)
|
||||
case IsNull(attribute) =>
|
||||
org.apache.spark.sql.catalyst.expressions.IsNull(toAttribute(attribute, tableSchema))
|
||||
case IsNotNull(attribute) =>
|
||||
org.apache.spark.sql.catalyst.expressions.IsNotNull(toAttribute(attribute, tableSchema))
|
||||
case And(left, right) =>
|
||||
val leftExp = convertToCatalystExpression(left, tableSchema)
|
||||
val rightExp = convertToCatalystExpression(right, tableSchema)
|
||||
if (leftExp.isEmpty || rightExp.isEmpty) {
|
||||
null
|
||||
} else {
|
||||
org.apache.spark.sql.catalyst.expressions.And(leftExp.get, rightExp.get)
|
||||
}
|
||||
case Or(left, right) =>
|
||||
val leftExp = convertToCatalystExpression(left, tableSchema)
|
||||
val rightExp = convertToCatalystExpression(right, tableSchema)
|
||||
if (leftExp.isEmpty || rightExp.isEmpty) {
|
||||
null
|
||||
} else {
|
||||
org.apache.spark.sql.catalyst.expressions.Or(leftExp.get, rightExp.get)
|
||||
}
|
||||
case Not(child) =>
|
||||
val childExp = convertToCatalystExpression(child, tableSchema)
|
||||
if (childExp.isEmpty) {
|
||||
null
|
||||
} else {
|
||||
org.apache.spark.sql.catalyst.expressions.Not(childExp.get)
|
||||
}
|
||||
case StringStartsWith(attribute, value) =>
|
||||
val leftExp = toAttribute(attribute, tableSchema)
|
||||
val rightExp = Literal.create(s"$value%")
|
||||
sparkAdapter.createLike(leftExp, rightExp)
|
||||
case StringEndsWith(attribute, value) =>
|
||||
val leftExp = toAttribute(attribute, tableSchema)
|
||||
val rightExp = Literal.create(s"%$value")
|
||||
sparkAdapter.createLike(leftExp, rightExp)
|
||||
case StringContains(attribute, value) =>
|
||||
val leftExp = toAttribute(attribute, tableSchema)
|
||||
val rightExp = Literal.create(s"%$value%")
|
||||
sparkAdapter.createLike(leftExp, rightExp)
|
||||
case _=> null
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
private def toAttribute(columnName: String, tableSchema: StructType): AttributeReference = {
|
||||
val field = tableSchema.find(p => p.name == columnName)
|
||||
assert(field.isDefined, s"Cannot find column: $columnName, Table Columns are: " +
|
||||
s"${tableSchema.fieldNames.mkString(",")}")
|
||||
AttributeReference(columnName, field.get.dataType, field.get.nullable)()
|
||||
}
|
||||
}
|
||||
@@ -1,37 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi
|
||||
|
||||
import org.apache.spark.sql.hudi.{HoodieSqlUtils, SparkAdapter}
|
||||
|
||||
/**
|
||||
* Use the SparkAdapterSupport trait to get the SparkAdapter when we
|
||||
* need to adapt the difference between spark2 and spark3.
|
||||
*/
|
||||
trait SparkAdapterSupport {
|
||||
|
||||
lazy val sparkAdapter: SparkAdapter = {
|
||||
val adapterClass = if (HoodieSqlUtils.isSpark3) {
|
||||
"org.apache.spark.sql.adapter.Spark3Adapter"
|
||||
} else {
|
||||
"org.apache.spark.sql.adapter.Spark2Adapter"
|
||||
}
|
||||
getClass.getClassLoader.loadClass(adapterClass)
|
||||
.newInstance().asInstanceOf[SparkAdapter]
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user