1
0

[HUDI-1105] Adding dedup support for Bulk Insert w/ Rows (#2206)

This commit is contained in:
Sivabalan Narayanan
2021-07-07 17:38:26 -04:00
committed by GitHub
parent 8f7ad8b178
commit 16e90d30ea
7 changed files with 265 additions and 23 deletions

View File

@@ -18,6 +18,12 @@
package org.apache.hudi;
import static org.apache.spark.sql.functions.callUDF;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.ReflectionUtils;
@@ -35,16 +41,8 @@ import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import scala.collection.JavaConverters;
import static org.apache.spark.sql.functions.callUDF;
/**
* Helper class to assist in preparing {@link Dataset<Row>}s for bulk insert with datasource implementation.
*/
@@ -69,7 +67,8 @@ public class HoodieDatasetBulkInsertHelper {
*/
public static Dataset<Row> prepareHoodieDatasetForBulkInsert(SQLContext sqlContext,
HoodieWriteConfig config, Dataset<Row> rows, String structName, String recordNamespace,
BulkInsertPartitioner<Dataset<Row>> bulkInsertPartitionerRows) {
BulkInsertPartitioner<Dataset<Row>> bulkInsertPartitionerRows,
boolean isGlobalIndex) {
List<Column> originalFields =
Arrays.stream(rows.schema().fields()).map(f -> new Column(f.name())).collect(Collectors.toList());
@@ -100,9 +99,15 @@ public class HoodieDatasetBulkInsertHelper {
functions.lit("").cast(DataTypes.StringType))
.withColumn(HoodieRecord.FILENAME_METADATA_FIELD,
functions.lit("").cast(DataTypes.StringType));
Dataset<Row> dedupedDf = rowDatasetWithHoodieColumns;
if (config.shouldCombineBeforeInsert()) {
dedupedDf = SparkRowWriteHelper.newInstance().deduplicateRows(rowDatasetWithHoodieColumns, config.getPreCombineField(), isGlobalIndex);
}
List<Column> orderedFields = Stream.concat(HoodieRecord.HOODIE_META_COLUMNS.stream().map(Column::new),
originalFields.stream()).collect(Collectors.toList());
Dataset<Row> colOrderedDataset = rowDatasetWithHoodieColumns.select(
Dataset<Row> colOrderedDataset = dedupedDf.select(
JavaConverters.collectionAsScalaIterableConverter(orderedFields).asScala().toSeq());
return bulkInsertPartitionerRows.repartitionRecords(colOrderedDataset, config.getBulkInsertShuffleParallelism());

View File

@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.catalyst.expressions.Attribute;
import org.apache.spark.sql.types.StructType;
import java.util.List;
import java.util.stream.Collectors;
import scala.Tuple2;
import scala.collection.JavaConversions;
import scala.collection.JavaConverters;
/**
 * Helper class to assist in deduplicating Rows for BulkInsert with Rows.
 *
 * <p>A single shared instance is handed out by {@link #newInstance()}. The functions passed to Spark only call
 * static helpers and capture plain values (a boolean and a String), never this helper instance.
 */
public class SparkRowWriteHelper {

  private SparkRowWriteHelper() {
  }

  /** Holder that creates the shared instance when the holder class is first initialized. */
  private static class WriteHelperHolder {
    private static final SparkRowWriteHelper SPARK_WRITE_HELPER = new SparkRowWriteHelper();
  }

  /**
   * Returns the shared helper. Despite the name, no new object is created per call.
   */
  public static SparkRowWriteHelper newInstance() {
    return WriteHelperHolder.SPARK_WRITE_HELPER;
  }

  /**
   * Collapses rows that refer to the same record into a single row.
   *
   * <p>Rows are grouped by the record key meta column when {@code isGlobalIndex} is true, otherwise by the
   * partition path meta column together with the record key meta column. Within a group, the row whose
   * {@code preCombineField} value compares greatest is kept; on a tie the left operand of the pairwise reduce wins
   * (which row that is depends on the order in which Spark feeds the reduce).
   *
   * @param inputDf         rows that already carry the Hoodie record key and partition path meta columns
   * @param preCombineField name of the column used to pick a winner; its values must implement {@link Comparable}
   * @param isGlobalIndex   whether duplicates are detected across partition paths
   * @return one row per group, with the same schema as {@code inputDf}
   */
  public Dataset<Row> deduplicateRows(Dataset<Row> inputDf, String preCombineField, boolean isGlobalIndex) {
    ExpressionEncoder<Row> rowEncoder = getEncoder(inputDf.schema());
    return inputDf
        .groupByKey((MapFunction<Row, String>) row -> groupingKey(row, isGlobalIndex), Encoders.STRING())
        .reduceGroups((ReduceFunction<Row>) (left, right) -> pickLatest(left, right, preCombineField))
        // the grouping key was only needed to form the groups; keep just the surviving row
        .map((MapFunction<Tuple2<String, Row>, Row>) keyAndRow -> keyAndRow._2, rowEncoder);
  }

  /**
   * Builds the key used to group duplicate rows.
   *
   * <p>For the non-global case the partition path is prefixed with its length. A plain
   * {@code partitionPath + "+" + recordKey} is ambiguous when either part contains '+': partition "a+b" with key "c"
   * and partition "a" with key "b+c" would both map to "a+b+c" and be merged. With the length prefix, the boundary
   * between the two parts is fixed, so distinct (partition path, record key) pairs always yield distinct keys.
   */
  private static String groupingKey(Row row, boolean isGlobalIndex) {
    if (isGlobalIndex) {
      return row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD);
    }
    Object partitionPathValue = row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD);
    Object recordKeyValue = row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD);
    String partitionPath = String.valueOf(partitionPathValue);
    return partitionPath.length() + "+" + partitionPath + "+" + recordKeyValue;
  }

  /**
   * Returns whichever of the two rows has the greater {@code preCombineField} value, preferring {@code left} on a
   * tie. A null value on {@code left} results in a {@link NullPointerException}.
   */
  private static Row pickLatest(Row left, Row right, String preCombineField) {
    Comparable<Object> leftOrdering = left.getAs(preCombineField);
    Comparable<Object> rightOrdering = right.getAs(preCombineField);
    return leftOrdering.compareTo(rightOrdering) >= 0 ? left : right;
  }

  /**
   * Creates a Row encoder for {@code schema}, resolved and bound against the schema's own attributes.
   */
  private ExpressionEncoder<Row> getEncoder(StructType schema) {
    List<Attribute> attributes = JavaConversions.asJavaCollection(schema.toAttributes()).stream()
        .map(Attribute::toAttribute).collect(Collectors.toList());
    return RowEncoder.apply(schema)
        .resolveAndBind(JavaConverters.asScalaBufferConverter(attributes).asScala().toSeq(),
            SimpleAnalyzer$.MODULE$);
  }
}

View File

@@ -38,6 +38,7 @@ import org.apache.hudi.exception.HoodieException
import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerWithRowsFactory
import org.apache.hudi.hive.util.ConfigUtils
import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool}
import org.apache.hudi.index.SparkHoodieIndex
import org.apache.hudi.internal.DataSourceInternalWriterHelper
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory
import org.apache.hudi.sync.common.AbstractSyncTool
@@ -345,8 +346,9 @@ object HoodieSparkSqlWriter {
}
val arePartitionRecordsSorted = bulkInsertPartitionerRows.arePartitionRecordsSorted();
parameters.updated(HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED, arePartitionRecordsSorted.toString)
val isGlobalIndex = SparkHoodieIndex.isGlobalIndex(writeConfig)
val hoodieDF = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, writeConfig, df, structName, nameSpace,
bulkInsertPartitionerRows)
bulkInsertPartitionerRows, isGlobalIndex)
if (SPARK_VERSION.startsWith("2.")) {
hoodieDF.write.format("org.apache.hudi.internal")
.option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime)

View File

@@ -25,15 +25,32 @@ import org.apache.hudi.testutils.DataSourceTestUtils;
import org.apache.hudi.testutils.HoodieClientTestBase;
import org.apache.avro.Schema;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.catalyst.expressions.Attribute;
import org.apache.spark.sql.types.StructType;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import scala.Tuple2;
import scala.collection.JavaConversions;
import scala.collection.JavaConverters;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -45,13 +62,22 @@ import static org.junit.jupiter.api.Assertions.fail;
public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
private String schemaStr;
private Schema schema;
private transient Schema schema;
private StructType structType;
public TestHoodieDatasetBulkInsertHelper() throws IOException {
init();
}
/**
 * Arguments for the pre-combine test: one run with the flag set to {@code false} and one with {@code true}.
 */
private static Stream<Arguments> providePreCombineArgs() {
  return Stream.of(false, true).map(Arguments::of);
}
private void init() throws IOException {
schemaStr = FileIOUtils.readAsUTFString(getClass().getResourceAsStream("/exampleSchema.txt"));
schema = DataSourceTestUtils.getStructTypeExampleSchema();
@@ -59,12 +85,12 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
}
@Test
public void testBulkInsertHelper() throws IOException {
HoodieWriteConfig config = getConfigBuilder(schemaStr).withProps(getPropsAllSet()).build();
public void testBulkInsertHelper() {
HoodieWriteConfig config = getConfigBuilder(schemaStr).withProps(getPropsAllSet()).combineInput(false, false).build();
List<Row> rows = DataSourceTestUtils.generateRandomRows(10);
Dataset<Row> dataset = sqlContext.createDataFrame(rows, structType);
Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName", "testNamespace",
new NonSortPartitionerWithRows());
Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
"testNamespace", new NonSortPartitionerWithRows(), false);
StructType resultSchema = result.schema();
assertEquals(result.count(), 10);
@@ -74,6 +100,42 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
assertTrue(resultSchema.fieldIndex(entry.getKey()) == entry.getValue());
}
result.toJavaRDD().foreach(entry -> {
assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD)).equals(entry.getAs("_row_key")));
assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).equals(entry.getAs("partition")));
assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD)).equals(""));
assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.COMMIT_TIME_METADATA_FIELD)).equals(""));
assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.FILENAME_METADATA_FIELD)).equals(""));
});
Dataset<Row> trimmedOutput = result.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
.drop(HoodieRecord.FILENAME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
assertTrue(dataset.except(trimmedOutput).count() == 0);
}
@ParameterizedTest
@MethodSource("providePreCombineArgs")
public void testBulkInsertPreCombine(boolean enablePreCombine) {
HoodieWriteConfig config = getConfigBuilder(schemaStr).withProps(getPropsAllSet()).combineInput(enablePreCombine, enablePreCombine)
.withPreCombineField("ts").build();
List<Row> inserts = DataSourceTestUtils.generateRandomRows(10);
Dataset<Row> toUpdateDataset = sqlContext.createDataFrame(inserts.subList(0, 5), structType);
List<Row> updates = DataSourceTestUtils.updateRowsWithHigherTs(toUpdateDataset);
List<Row> rows = new ArrayList<>();
rows.addAll(inserts);
rows.addAll(updates);
Dataset<Row> dataset = sqlContext.createDataFrame(rows, structType);
Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
"testNamespace", new NonSortPartitionerWithRows(), false);
StructType resultSchema = result.schema();
assertEquals(result.count(), enablePreCombine ? 10 : 15);
assertEquals(resultSchema.fieldNames().length, structType.fieldNames().length + HoodieRecord.HOODIE_META_COLUMNS.size());
for (Map.Entry<String, Integer> entry : HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.entrySet()) {
assertTrue(resultSchema.fieldIndex(entry.getKey()) == entry.getValue());
}
int metadataRecordKeyIndex = resultSchema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD);
int metadataParitionPathIndex = resultSchema.fieldIndex(HoodieRecord.PARTITION_PATH_METADATA_FIELD);
int metadataCommitTimeIndex = resultSchema.fieldIndex(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
@@ -87,6 +149,30 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
assertTrue(entry.get(metadataCommitTimeIndex).equals(""));
assertTrue(entry.get(metadataFilenameIndex).equals(""));
});
Dataset<Row> trimmedOutput = result.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
.drop(HoodieRecord.FILENAME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
// find resolved input snapshot
ExpressionEncoder encoder = getEncoder(dataset.schema());
if (enablePreCombine) {
Dataset<Row> inputSnapshotDf = dataset.groupByKey(
(MapFunction<Row, String>) value -> value.getAs("partition") + "+" + value.getAs("_row_key"), Encoders.STRING())
.reduceGroups((ReduceFunction<Row>) (v1, v2) -> {
long ts1 = v1.getAs("ts");
long ts2 = v2.getAs("ts");
if (ts1 >= ts2) {
return v1;
} else {
return v2;
}
})
.map((MapFunction<Tuple2<String, Row>, Row>) value -> value._2, encoder);
assertTrue(inputSnapshotDf.except(trimmedOutput).count() == 0);
} else {
assertTrue(dataset.except(trimmedOutput).count() == 0);
}
}
private Map<String, String> getPropsAllSet() {
@@ -120,7 +206,7 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
Dataset<Row> dataset = sqlContext.createDataFrame(rows, structType);
try {
HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
"testNamespace", new NonSortPartitionerWithRows());
"testNamespace", new NonSortPartitionerWithRows(), false);
fail("Should have thrown exception");
} catch (Exception e) {
// ignore
@@ -131,7 +217,7 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
dataset = sqlContext.createDataFrame(rows, structType);
try {
HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
"testNamespace", new NonSortPartitionerWithRows());
"testNamespace", new NonSortPartitionerWithRows(), false);
fail("Should have thrown exception");
} catch (Exception e) {
// ignore
@@ -142,7 +228,7 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
dataset = sqlContext.createDataFrame(rows, structType);
try {
HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
"testNamespace", new NonSortPartitionerWithRows());
"testNamespace", new NonSortPartitionerWithRows(), false);
fail("Should have thrown exception");
} catch (Exception e) {
// ignore
@@ -153,10 +239,18 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
dataset = sqlContext.createDataFrame(rows, structType);
try {
HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
"testNamespace", new NonSortPartitionerWithRows());
"testNamespace", new NonSortPartitionerWithRows(), false);
fail("Should have thrown exception");
} catch (Exception e) {
// ignore
}
}
/**
 * Creates a Row encoder for {@code schema}, resolved and bound against the schema's own attributes.
 */
private ExpressionEncoder getEncoder(StructType schema) {
  Stream<Attribute> attributeStream =
      JavaConversions.asJavaCollection(schema.toAttributes()).stream().map(Attribute::toAttribute);
  List<Attribute> attributes = attributeStream.collect(Collectors.toList());
  ExpressionEncoder<Row> rowEncoder = RowEncoder.apply(schema);
  return rowEncoder.resolveAndBind(
      JavaConverters.asScalaBufferConverter(attributes).asScala().toSeq(),
      SimpleAnalyzer$.MODULE$);
}
}

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.testutils;
import org.apache.hudi.common.util.FileIOUtils;
import org.apache.avro.Schema;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
@@ -75,6 +76,20 @@ public class DataSourceTestUtils {
return toReturn;
}
/**
 * Picks the first {@code count} distinct rows from {@code inserts}, preserving their order of appearance.
 * Distinctness is decided by {@code Row.equals}.
 *
 * @param inserts candidate rows, possibly containing duplicates
 * @param count   number of distinct rows wanted; a value of zero or less yields an empty list
 * @return a new list holding exactly {@code count} distinct rows (empty when {@code count <= 0})
 * @throws IllegalArgumentException if {@code inserts} holds fewer than {@code count} distinct rows
 */
public static List<Row> getUniqueRows(List<Row> inserts, int count) {
  List<Row> uniqueRows = new ArrayList<>();
  for (Row candidate : inserts) {
    if (uniqueRows.size() >= count) {
      break;
    }
    if (!uniqueRows.contains(candidate)) {
      uniqueRows.add(candidate);
    }
  }
  // Fail with a clear message instead of walking past the end of the input list.
  if (uniqueRows.size() < count) {
    throw new IllegalArgumentException("Requested " + count + " unique rows but input contains only "
        + uniqueRows.size() + " distinct rows");
  }
  return uniqueRows;
}
public static List<Row> generateRandomRowsEvolvedSchema(int count) {
Random random = new Random();
List<Row> toReturn = new ArrayList<>();
@@ -89,4 +104,18 @@ public class DataSourceTestUtils {
}
return toReturn;
}
/**
 * Produces one "update" row per input row: same {@code _row_key} and {@code partition}, with {@code ts} raised by
 * a random amount between 1 and 1000 (inclusive) so the update is always strictly newer than its source row.
 *
 * @param inputDf rows exposing the {@code _row_key}, {@code partition} and {@code ts} (Long) columns
 * @return newly created three-column rows, in the order returned by {@code inputDf.collectAsList()}
 */
public static List<Row> updateRowsWithHigherTs(Dataset<Row> inputDf) {
  Random random = new Random();
  List<Row> input = inputDf.collectAsList();
  List<Row> rows = new ArrayList<>(input.size());
  for (Row row : input) {
    Object[] values = new Object[3];
    values[0] = row.getAs("_row_key");
    values[1] = row.getAs("partition");
    // nextInt(1000) may return 0; the extra 1 guarantees a strictly higher ts.
    values[2] = ((Long) row.getAs("ts")) + 1 + random.nextInt(1000);
    rows.add(RowFactory.create(values));
  }
  return rows;
}
}

View File

@@ -144,7 +144,13 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
// generate the inserts
val schema = DataSourceTestUtils.getStructTypeExampleSchema
val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema)
val records = DataSourceTestUtils.generateRandomRows(1000)
val inserts = DataSourceTestUtils.generateRandomRows(1000)
// add some updates so that preCombine kicks in
val toUpdateDataset = sqlContext.createDataFrame(DataSourceTestUtils.getUniqueRows(inserts, 40), structType)
val updates = DataSourceTestUtils.updateRowsWithHigherTs(toUpdateDataset)
val records = inserts.union(updates)
val recordsSeq = convertRowListToSeq(records)
val df = spark.createDataFrame(sc.parallelize(recordsSeq), structType)
// write to Hudi
@@ -161,6 +167,7 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
// fetch all records from parquet files generated from write to hudi
val actualDf = sqlContext.read.parquet(fullPartitionPaths(0), fullPartitionPaths(1), fullPartitionPaths(2))
val resultRows = actualDf.collectAsList()
// remove metadata columns so that expected and actual DFs can be compared as is
val trimmedDf = actualDf.drop(HoodieRecord.HOODIE_META_COLUMNS.get(0)).drop(HoodieRecord.HOODIE_META_COLUMNS.get(1))
@@ -448,9 +455,9 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
.foreach(tableType => {
test("test schema evolution for " + tableType) {
initSparkContext("test_schema_evolution")
val path = java.nio.file.Files.createTempDirectory("hoodie_test_path")
val path = java.nio.file.Files.createTempDirectory("hoodie_test_path_schema_evol")
try {
val hoodieFooTableName = "hoodie_foo_tbl_" + tableType
val hoodieFooTableName = "hoodie_foo_tbl_schema_evolution_" + tableType
//create a new table
val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,