[HUDI-1153] Spark DataSource and Streaming Write must fail when operation type is misconfigured (#2014)

Sreeram Ramji authored on 2020-09-04 09:08:30 -07:00, committed by GitHub
parent 8d19ebfd0f
commit 6537af2676
5 changed files with 47 additions and 36 deletions
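
With this change, the operation string from hoodie.datasource.write.operation is parsed into the WriteOperationType enum up front, so a misconfigured value surfaces as an exception instead of silently falling back to upsert. A minimal sketch of that behavior, assuming WriteOperationType.fromValue rejects unknown strings by throwing (the object name and the misspelled value below are illustrative):

import org.apache.hudi.common.model.WriteOperationType

object OperationTypeParsingSketch extends App {
  // A valid value resolves to an enum constant; value() returns its string form.
  val op = WriteOperationType.fromValue("bulk_insert")
  println(s"parsed: $op, string form: ${op.value()}")

  // A misspelled value is expected to be rejected rather than treated as upsert.
  try {
    WriteOperationType.fromValue("bulk insert") // hypothetical typo
  } catch {
    case e: Exception => println(s"misconfigured operation rejected: ${e.getMessage}")
  }
}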

WriteOperationType.java

@@ -71,6 +71,14 @@ public enum WriteOperationType {
     }
   }
 
+  /**
+   * Getter for value.
+   * @return string form of WriteOperationType
+   */
+  public String value() {
+    return value;
+  }
+
   public static boolean isChangingRecords(WriteOperationType operationType) {
     return operationType == UPSERT || operationType == UPSERT_PREPPED || operationType == DELETE;
   }

DataSourceUtils.java

@@ -25,6 +25,7 @@ import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.model.WriteOperationType;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.common.util.StringUtils;
@@ -248,16 +249,18 @@ public class DataSourceUtils {
   }
 
   public static JavaRDD<WriteStatus> doWriteOperation(HoodieWriteClient client, JavaRDD<HoodieRecord> hoodieRecords,
-                                                      String instantTime, String operation) throws HoodieException {
-    if (operation.equals(DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL())) {
-      Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner =
-          createUserDefinedBulkInsertPartitioner(client.getConfig());
-      return client.bulkInsert(hoodieRecords, instantTime, userDefinedBulkInsertPartitioner);
-    } else if (operation.equals(DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())) {
-      return client.insert(hoodieRecords, instantTime);
-    } else {
-      // default is upsert
-      return client.upsert(hoodieRecords, instantTime);
+                                                      String instantTime, WriteOperationType operation) throws HoodieException {
+    switch (operation) {
+      case BULK_INSERT:
+        Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner =
+            createUserDefinedBulkInsertPartitioner(client.getConfig());
+        return client.bulkInsert(hoodieRecords, instantTime, userDefinedBulkInsertPartitioner);
+      case INSERT:
+        return client.insert(hoodieRecords, instantTime);
+      case UPSERT:
+        return client.upsert(hoodieRecords, instantTime);
+      default:
+        throw new HoodieException("Not a valid operation type for doWriteOperation: " + operation.toString());
     }
   }
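
For reference, a standalone sketch of the dispatch pattern introduced above: only BULK_INSERT, INSERT and UPSERT are routed to the write client, and anything else fails fast instead of defaulting to upsert. The HoodieWriteClient calls are stubbed out as strings, and the object and method names are illustrative:

import org.apache.hudi.common.model.WriteOperationType
import org.apache.hudi.exception.HoodieException

object DoWriteOperationDispatchSketch extends App {
  // Mirrors the switch in DataSourceUtils.doWriteOperation, minus the actual client calls.
  def route(operation: WriteOperationType): String = operation match {
    case WriteOperationType.BULK_INSERT => "client.bulkInsert(hoodieRecords, instantTime, partitioner)"
    case WriteOperationType.INSERT => "client.insert(hoodieRecords, instantTime)"
    case WriteOperationType.UPSERT => "client.upsert(hoodieRecords, instantTime)"
    case other => throw new HoodieException("Not a valid operation type for doWriteOperation: " + other.toString)
  }

  Seq(WriteOperationType.UPSERT, WriteOperationType.DELETE).foreach { op =>
    try println(s"${op.value()} -> ${route(op)}")
    catch { case e: HoodieException => println(s"${op.value()} -> rejected: ${e.getMessage}") }
  }
}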

DataSourceWriteOptions.scala

@@ -19,6 +19,7 @@ package org.apache.hudi
 import org.apache.hudi.common.model.HoodieTableType
 import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload
+import org.apache.hudi.common.model.WriteOperationType
 import org.apache.hudi.hive.HiveSyncTool
 import org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
 import org.apache.hudi.keygen.SimpleKeyGenerator
@@ -136,11 +137,11 @@ object DataSourceWriteOptions {
    * Default: upsert()
    */
   val OPERATION_OPT_KEY = "hoodie.datasource.write.operation"
-  val BULK_INSERT_OPERATION_OPT_VAL = "bulk_insert"
-  val INSERT_OPERATION_OPT_VAL = "insert"
-  val UPSERT_OPERATION_OPT_VAL = "upsert"
-  val DELETE_OPERATION_OPT_VAL = "delete"
-  val BOOTSTRAP_OPERATION_OPT_VAL = "bootstrap"
+  val BULK_INSERT_OPERATION_OPT_VAL = WriteOperationType.BULK_INSERT.value
+  val INSERT_OPERATION_OPT_VAL = WriteOperationType.INSERT.value
+  val UPSERT_OPERATION_OPT_VAL = WriteOperationType.UPSERT.value
+  val DELETE_OPERATION_OPT_VAL = WriteOperationType.DELETE.value
+  val BOOTSTRAP_OPERATION_OPT_VAL = WriteOperationType.BOOTSTRAP.value
   val DEFAULT_OPERATION_OPT_VAL = UPSERT_OPERATION_OPT_VAL
 
   /**
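
Because the constants above now delegate to the enum, the accepted option strings and WriteOperationType stay in sync by construction. A small sketch of the round trip, assuming fromValue accepts every string that value() produces (the object name is illustrative):

import org.apache.hudi.DataSourceWriteOptions
import org.apache.hudi.common.model.WriteOperationType

object OperationOptionRoundTripSketch extends App {
  // The option constant is just the enum's string form, so parsing it back recovers the same constant.
  val bulkInsert = WriteOperationType.fromValue(DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL)
  println(s"${DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL} -> $bulkInsert") // bulk_insert -> BULK_INSERT

  val default = WriteOperationType.fromValue(DataSourceWriteOptions.DEFAULT_OPERATION_OPT_VAL)
  println(s"${DataSourceWriteOptions.DEFAULT_OPERATION_OPT_VAL} -> $default") // upsert -> UPSERT
}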

HoodieSparkSqlWriter.scala

@@ -29,7 +29,7 @@ import org.apache.hudi.DataSourceWriteOptions._
 import org.apache.hudi.avro.HoodieAvroUtils
 import org.apache.hudi.client.{HoodieWriteClient, WriteStatus}
 import org.apache.hudi.common.config.TypedProperties
-import org.apache.hudi.common.model.{HoodieRecordPayload, HoodieTableType}
+import org.apache.hudi.common.model.{HoodieRecordPayload, HoodieTableType, WriteOperationType}
 import org.apache.hudi.common.table.{HoodieTableConfig, HoodieTableMetaClient}
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline
 import org.apache.hudi.common.util.ReflectionUtils
@@ -76,21 +76,19 @@ private[hudi] object HoodieSparkSqlWriter {
       case _ => throw new HoodieException("hoodie only support org.apache.spark.serializer.KryoSerializer as spark.serializer")
     }
     val tableType = parameters(TABLE_TYPE_OPT_KEY)
-    val operation =
+    var operation = WriteOperationType.fromValue(parameters(OPERATION_OPT_KEY))
     // It does not make sense to allow upsert() operation if INSERT_DROP_DUPS_OPT_KEY is true
     // Auto-correct the operation to "insert" if OPERATION_OPT_KEY is set to "upsert" wrongly
     // or not set (in which case it will be set as "upsert" by parametersWithWriteDefaults()) .
     if (parameters(INSERT_DROP_DUPS_OPT_KEY).toBoolean &&
-      parameters(OPERATION_OPT_KEY) == UPSERT_OPERATION_OPT_VAL) {
+      operation == WriteOperationType.UPSERT) {
       log.warn(s"$UPSERT_OPERATION_OPT_VAL is not applicable " +
         s"when $INSERT_DROP_DUPS_OPT_KEY is set to be true, " +
         s"overriding the $OPERATION_OPT_KEY to be $INSERT_OPERATION_OPT_VAL")
-      INSERT_OPERATION_OPT_VAL
-    } else {
-      parameters(OPERATION_OPT_KEY)
-    }
+      operation = WriteOperationType.INSERT
+    }
 
     val jsc = new JavaSparkContext(sparkContext)
     val basePath = new Path(path.get)
@@ -123,7 +121,7 @@ private[hudi] object HoodieSparkSqlWriter {
     // scalastyle:on
 
     val (writeStatuses, writeClient: HoodieWriteClient[HoodieRecordPayload[Nothing]]) =
-      if (!operation.equalsIgnoreCase(DELETE_OPERATION_OPT_VAL)) {
+      if (operation != WriteOperationType.DELETE) {
         // register classes & schemas
         val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tblName)
         sparkContext.getConf.registerKryoClasses(
@@ -242,7 +240,7 @@ private[hudi] object HoodieSparkSqlWriter {
log.warn(s"hoodie table at $basePath already exists. Ignoring & not performing actual writes.") log.warn(s"hoodie table at $basePath already exists. Ignoring & not performing actual writes.")
false false
} else { } else {
handleSaveModes(mode, basePath, tableConfig, tableName, BOOTSTRAP_OPERATION_OPT_VAL, fs) handleSaveModes(mode, basePath, tableConfig, tableName, WriteOperationType.BOOTSTRAP, fs)
} }
if (!tableExists) { if (!tableExists) {
@@ -290,7 +288,7 @@ private[hudi] object HoodieSparkSqlWriter {
   }
 
   private def handleSaveModes(mode: SaveMode, tablePath: Path, tableConfig: HoodieTableConfig, tableName: String,
-                              operation: String, fs: FileSystem): Unit = {
+                              operation: WriteOperationType, fs: FileSystem): Unit = {
     if (mode == SaveMode.Append && tableExists) {
       val existingTableName = tableConfig.getTableName
       if (!existingTableName.equals(tableName)) {
@@ -298,7 +296,7 @@ private[hudi] object HoodieSparkSqlWriter {
       }
     }
 
-    if (!operation.equalsIgnoreCase(DELETE_OPERATION_OPT_VAL)) {
+    if (operation != WriteOperationType.DELETE) {
       if (mode == SaveMode.ErrorIfExists && tableExists) {
         throw new HoodieException(s"hoodie table at $tablePath already exists.")
       } else if (mode == SaveMode.Overwrite && tableExists) {
@@ -309,7 +307,7 @@ private[hudi] object HoodieSparkSqlWriter {
     } else {
       // Delete Operation only supports Append mode
       if (mode != SaveMode.Append) {
-        throw new HoodieException(s"Append is the only save mode applicable for $operation operation")
+        throw new HoodieException(s"Append is the only save mode applicable for ${operation.toString} operation")
       }
     }
   }
@@ -384,7 +382,7 @@ private[hudi] object HoodieSparkSqlWriter {
                               tableConfig: HoodieTableConfig,
                               instantTime: String,
                               basePath: Path,
-                              operation: String,
+                              operation: WriteOperationType,
                               jsc: JavaSparkContext): (Boolean, common.util.Option[java.lang.String]) = {
     val errorCount = writeStatuses.rdd.filter(ws => ws.hasErrors).count()
     if (errorCount == 0) {
@@ -422,7 +420,7 @@ private[hudi] object HoodieSparkSqlWriter {
       }
       (commitSuccess && metaSyncSuccess, compactionInstant)
     } else {
-      log.error(s"$operation failed with $errorCount errors :")
+      log.error(s"${operation.toString} failed with $errorCount errors :")
       if (log.isTraceEnabled) {
         log.trace("Printing out the top 100 errors")
         writeStatuses.rdd.filter(ws => ws.hasErrors)
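
The net effect at the DataFrame API: a typo in hoodie.datasource.write.operation is now expected to fail the write instead of being treated as upsert. A hedged end-to-end sketch; the table name, path, columns and the misspelled value are illustrative, and the Kryo serializer setting is required by HoodieSparkSqlWriter as shown in the hunk above:

import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.hudi.DataSourceWriteOptions.OPERATION_OPT_KEY

object MisconfiguredOperationWriteSketch extends App {
  val spark = SparkSession.builder()
    .appName("hudi-operation-check")
    .master("local[1]")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
  import spark.implicits._

  val df = Seq((1, "a", "2020/09/04")).toDF("id", "name", "partition")

  // The misspelled operation below should now be rejected by WriteOperationType.fromValue
  // when HoodieSparkSqlWriter parses the options, rather than falling back to upsert.
  df.write.format("hudi")
    .option("hoodie.table.name", "operation_check_tbl")
    .option("hoodie.datasource.write.recordkey.field", "id")
    .option("hoodie.datasource.write.partitionpath.field", "partition")
    .option("hoodie.datasource.write.precombine.field", "name")
    .option(OPERATION_OPT_KEY, "upserttt") // hypothetical typo
    .mode(SaveMode.Append)
    .save("/tmp/operation_check_tbl") // illustrative path
}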

TestDataSourceUtils.java

@@ -22,6 +22,7 @@ import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.client.HoodieWriteClient;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.model.WriteOperationType;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieException;
@@ -108,7 +109,7 @@ public class TestDataSourceUtils {
     when(hoodieWriteClient.getConfig()).thenReturn(config);
 
     DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time",
-        DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL());
+        WriteOperationType.BULK_INSERT);
 
     verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(),
         optionCaptor.capture());
@@ -121,7 +122,7 @@ public class TestDataSourceUtils {
     Exception exception = assertThrows(HoodieException.class, () -> {
       DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time",
-          DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL());
+          WriteOperationType.BULK_INSERT);
     });
 
     assertThat(exception.getMessage(), containsString("Could not create UserDefinedBulkInsertPartitioner"));
@@ -132,7 +133,7 @@ public class TestDataSourceUtils {
     setAndVerifyHoodieWriteClientWith(NoOpBulkInsertPartitioner.class.getName());
 
     DataSourceUtils.doWriteOperation(hoodieWriteClient, hoodieRecords, "test-time",
-        DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL());
+        WriteOperationType.BULK_INSERT);
 
     verify(hoodieWriteClient, times(1)).bulkInsert(any(hoodieRecords.getClass()), anyString(),
         optionCaptor.capture());