1
0

[HUDI-4039] Make sure all builtin KeyGenerators properly implement Spark specific APIs (#5523)

This set of changes makes sure that all builtin KeyGenerators properly implement Spark-specific APIs in a performant way (minimizing key-generator overhead)
This commit is contained in:
Alexey Kudinkin
2022-07-22 08:35:07 -07:00
committed by GitHub
parent d5c904e10e
commit eea4a692c0
52 changed files with 1507 additions and 1363 deletions

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.io.storage.row;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.io.storage.HoodieParquetConfig;
@@ -115,6 +116,6 @@ public class TestHoodieInternalRowParquetWriter extends HoodieClientTestHarness
writeConfig.getBloomFilterFPP(),
writeConfig.getDynamicBloomFilterMaxNumEntries(),
writeConfig.getBloomFilterType());
return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, filter, writeConfig);
return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, Option.of(filter), writeConfig);
}
}

View File

@@ -80,9 +80,11 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
@ValueSource(booleans = { true, false })
public void testRowCreateHandle(boolean populateMetaFields) throws Exception {
// init config and table
HoodieWriteConfig cfg =
SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort).build();
HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
HoodieWriteConfig config = SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort)
.withPopulateMetaFields(populateMetaFields)
.build();
HoodieTable table = HoodieSparkTable.create(config, context, metaClient);
List<String> fileNames = new ArrayList<>();
List<String> fileAbsPaths = new ArrayList<>();
@@ -95,8 +97,8 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
String fileId = UUID.randomUUID().toString();
String instantTime = "000";
HoodieRowCreateHandle handle = new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime,
RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE, populateMetaFields);
HoodieRowCreateHandle handle = new HoodieRowCreateHandle(table, config, partitionPath, fileId, instantTime,
RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE);
int size = 10 + RANDOM.nextInt(1000);
// Generate inputs
Dataset<Row> inputRows = SparkDatasetTestUtils.getRandomRows(sqlContext, size, partitionPath, false);
@@ -133,7 +135,7 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
String instantTime = "000";
HoodieRowCreateHandle handle =
new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE, true);
new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE);
int size = 10 + RANDOM.nextInt(1000);
int totalFailures = 5;
// Generate first batch of valid rows
@@ -186,7 +188,7 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
try {
HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
new HoodieRowCreateHandle(table, cfg, " def", UUID.randomUUID().toString(), "001", RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE, true);
new HoodieRowCreateHandle(table, cfg, " def", UUID.randomUUID().toString(), "001", RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE);
fail("Should have thrown exception");
} catch (HoodieInsertException ioe) {
// expected without metadata table

View File

@@ -179,6 +179,7 @@ public class SparkDatasetTestUtils {
public static HoodieWriteConfig.Builder getConfigBuilder(String basePath, int timelineServicePort) {
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
.withPopulateMetaFields(true)
.withParallelism(2, 2)
.withDeleteParallelism(2)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())

View File

@@ -1,102 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen
import java.sql.Timestamp
import org.apache.spark.sql.Row
import org.apache.hudi.keygen.RowKeyGeneratorHelper._
import org.apache.spark.sql.types.{DataType, DataTypes}
import org.junit.jupiter.api.{Assertions, Test}
import scala.collection.JavaConverters._
/**
 * Tests for [[RowKeyGeneratorHelper.getPartitionPathFromRow]], covering plain,
 * multi-field, timestamp/instant-typed and nested partition-path extraction,
 * each with and without Hive-style ("field=value") encoding.
 */
class TestRowGeneratorHelper {

  // Class-local import keeps the repeated fully-qualified Pair references short.
  import org.apache.hudi.common.util.collection.Pair

  @Test
  def testGetPartitionPathFromRow(): Unit = {
    /** single plain partition */
    val row1 = Row.fromSeq(Seq(1, "z3", 10.0, "20220108"))
    val ptField1 = List("dt").asJava
    // NOTE: Integer.valueOf is used instead of the deprecated `new Integer(...)` ctor
    val mapValue = Pair.of(List(Integer.valueOf(3)).asJava, DataTypes.LongType)
    val ptPos1 = Map("dt" -> mapValue).asJava

    Assertions.assertEquals("20220108",
      getPartitionPathFromRow(row1, ptField1, false, ptPos1))
    Assertions.assertEquals("dt=20220108",
      getPartitionPathFromRow(row1, ptField1, true, ptPos1))

    /** multiple plain partitions */
    val row2 = Row.fromSeq(Seq(1, "z3", 10.0, "2022", "01", "08"))
    val ptField2 = List("year", "month", "day").asJava
    val ptPos2 = Map(
      "year" -> Pair.of(List(Integer.valueOf(3)).asJava, DataTypes.StringType),
      "month" -> Pair.of(List(Integer.valueOf(4)).asJava, DataTypes.StringType),
      "day" -> Pair.of(List(Integer.valueOf(5)).asJava, DataTypes.StringType)
    ).asJava

    Assertions.assertEquals("2022/01/08",
      getPartitionPathFromRow(row2, ptField2, false, ptPos2))
    Assertions.assertEquals("year=2022/month=01/day=08",
      getPartitionPathFromRow(row2, ptField2, true, ptPos2))

    /** multiple partitions which contains TimeStamp type or Instant type */
    val timestamp = Timestamp.valueOf("2020-01-08 10:00:00")
    val instant = timestamp.toInstant
    val ptField3 = List("event", "event_time").asJava
    val ptPos3 = Map(
      "event" -> Pair.of(List(Integer.valueOf(3)).asJava, DataTypes.StringType),
      "event_time" -> Pair.of(List(Integer.valueOf(4)).asJava, DataTypes.TimestampType)
    ).asJava

    // with timeStamp type
    val row2_ts = Row.fromSeq(Seq(1, "z3", 10.0, "click", timestamp))
    Assertions.assertEquals("click/2020-01-08 10:00:00.0",
      getPartitionPathFromRow(row2_ts, ptField3, false, ptPos3))
    Assertions.assertEquals("event=click/event_time=2020-01-08 10:00:00.0",
      getPartitionPathFromRow(row2_ts, ptField3, true, ptPos3))

    // with instant type (expected to render identically to the Timestamp case)
    val row2_instant = Row.fromSeq(Seq(1, "z3", 10.0, "click", instant))
    Assertions.assertEquals("click/2020-01-08 10:00:00.0",
      getPartitionPathFromRow(row2_instant, ptField3, false, ptPos3))
    Assertions.assertEquals("event=click/event_time=2020-01-08 10:00:00.0",
      getPartitionPathFromRow(row2_instant, ptField3, true, ptPos3))

    /** mixed case with plain and nested partitions */
    // Nested positions are expressed as a path, e.g. List(3, 0) == field 0 of the
    // struct sitting at top-level position 3.
    val nestedRow4 = Row.fromSeq(Seq(instant, "ad"))
    val ptField4 = List("event_time").asJava
    val ptPos4 = Map(
      "event_time" -> Pair.of(List(Integer.valueOf(3), Integer.valueOf(0)).asJava, DataTypes.TimestampType)
    ).asJava

    // with instant type
    val row4 = Row.fromSeq(Seq(1, "z3", 10.0, nestedRow4, "click"))
    Assertions.assertEquals("2020-01-08 10:00:00.0",
      getPartitionPathFromRow(row4, ptField4, false, ptPos4))
    Assertions.assertEquals("event_time=2020-01-08 10:00:00.0",
      getPartitionPathFromRow(row4, ptField4, true, ptPos4))

    val nestedRow5 = Row.fromSeq(Seq(timestamp, "ad"))
    val ptField5 = List("event", "event_time").asJava
    val ptPos5 = Map(
      "event_time" -> Pair.of(List(Integer.valueOf(3), Integer.valueOf(0)).asJava, DataTypes.TimestampType),
      "event" -> Pair.of(List(Integer.valueOf(4)).asJava, DataTypes.StringType)
    ).asJava

    val row5 = Row.fromSeq(Seq(1, "z3", 10.0, nestedRow5, "click"))
    Assertions.assertEquals("click/2020-01-08 10:00:00.0",
      getPartitionPathFromRow(row5, ptField5, false, ptPos5))
    Assertions.assertEquals("event=click/event_time=2020-01-08 10:00:00.0",
      getPartitionPathFromRow(row5, ptField5, true, ptPos5))
  }
}

View File

@@ -41,7 +41,7 @@ class TestHoodieUnsafeRowUtils {
assertEquals(
Seq((1, schema(1)), (0, schema(1).dataType.asInstanceOf[StructType](0))),
composeNestedFieldPath(schema, "bar.baz").toSeq)
composeNestedFieldPath(schema, "bar.baz").parts.toSeq)
assertThrows(classOf[IllegalArgumentException]) { () =>
composeNestedFieldPath(schema, "foo.baz")
@@ -148,6 +148,7 @@ class TestHoodieUnsafeRowUtils {
}
}
// TODO rebase on ScalaAssertionSupport
private def assertThrows[T <: Throwable](expectedExceptionClass: Class[T])(f: () => Unit): T = {
try {
f.apply()