[HUDI-4039] Make sure all builtin KeyGenerators properly implement Spark specific APIs (#5523)
This set of changes makes sure that all built-in KeyGenerators properly implement the Spark-specific APIs in a performant way (minimizing key-generator overhead).
This commit is contained in:
@@ -21,6 +21,7 @@ package org.apache.hudi.io.storage.row;
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.config.HoodieStorageConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.io.storage.HoodieParquetConfig;
|
||||
@@ -115,6 +116,6 @@ public class TestHoodieInternalRowParquetWriter extends HoodieClientTestHarness
|
||||
writeConfig.getBloomFilterFPP(),
|
||||
writeConfig.getDynamicBloomFilterMaxNumEntries(),
|
||||
writeConfig.getBloomFilterType());
|
||||
return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, filter, writeConfig);
|
||||
return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, Option.of(filter), writeConfig);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,9 +80,11 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
|
||||
@ValueSource(booleans = { true, false })
|
||||
public void testRowCreateHandle(boolean populateMetaFields) throws Exception {
|
||||
// init config and table
|
||||
HoodieWriteConfig cfg =
|
||||
SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort).build();
|
||||
HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
|
||||
HoodieWriteConfig config = SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort)
|
||||
.withPopulateMetaFields(populateMetaFields)
|
||||
.build();
|
||||
|
||||
HoodieTable table = HoodieSparkTable.create(config, context, metaClient);
|
||||
List<String> fileNames = new ArrayList<>();
|
||||
List<String> fileAbsPaths = new ArrayList<>();
|
||||
|
||||
@@ -95,8 +97,8 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
|
||||
String fileId = UUID.randomUUID().toString();
|
||||
String instantTime = "000";
|
||||
|
||||
HoodieRowCreateHandle handle = new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime,
|
||||
RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE, populateMetaFields);
|
||||
HoodieRowCreateHandle handle = new HoodieRowCreateHandle(table, config, partitionPath, fileId, instantTime,
|
||||
RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE);
|
||||
int size = 10 + RANDOM.nextInt(1000);
|
||||
// Generate inputs
|
||||
Dataset<Row> inputRows = SparkDatasetTestUtils.getRandomRows(sqlContext, size, partitionPath, false);
|
||||
@@ -133,7 +135,7 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
|
||||
String instantTime = "000";
|
||||
|
||||
HoodieRowCreateHandle handle =
|
||||
new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE, true);
|
||||
new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE);
|
||||
int size = 10 + RANDOM.nextInt(1000);
|
||||
int totalFailures = 5;
|
||||
// Generate first batch of valid rows
|
||||
@@ -186,7 +188,7 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
|
||||
|
||||
try {
|
||||
HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
|
||||
new HoodieRowCreateHandle(table, cfg, " def", UUID.randomUUID().toString(), "001", RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE, true);
|
||||
new HoodieRowCreateHandle(table, cfg, " def", UUID.randomUUID().toString(), "001", RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE);
|
||||
fail("Should have thrown exception");
|
||||
} catch (HoodieInsertException ioe) {
|
||||
// expected without metadata table
|
||||
|
||||
@@ -179,6 +179,7 @@ public class SparkDatasetTestUtils {
|
||||
|
||||
public static HoodieWriteConfig.Builder getConfigBuilder(String basePath, int timelineServicePort) {
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
|
||||
.withPopulateMetaFields(true)
|
||||
.withParallelism(2, 2)
|
||||
.withDeleteParallelism(2)
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).build())
|
||||
|
||||
@@ -1,102 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.keygen
|
||||
|
||||
import java.sql.Timestamp
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.hudi.keygen.RowKeyGeneratorHelper._
|
||||
import org.apache.spark.sql.types.{DataType, DataTypes}
|
||||
import org.junit.jupiter.api.{Assertions, Test}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
class TestRowGeneratorHelper {
|
||||
|
||||
@Test
|
||||
def testGetPartitionPathFromRow(): Unit = {
|
||||
|
||||
/** single plain partition */
|
||||
val row1 = Row.fromSeq(Seq(1, "z3", 10.0, "20220108"))
|
||||
val ptField1 = List("dt").asJava
|
||||
val mapValue = org.apache.hudi.common.util.collection.Pair.of(List(new Integer(3)).asJava, DataTypes.LongType)
|
||||
val ptPos1 = Map("dt" -> mapValue).asJava
|
||||
|
||||
Assertions.assertEquals("20220108",
|
||||
getPartitionPathFromRow(row1, ptField1, false, ptPos1))
|
||||
Assertions.assertEquals("dt=20220108",
|
||||
getPartitionPathFromRow(row1, ptField1, true, ptPos1))
|
||||
|
||||
/** multiple plain partitions */
|
||||
val row2 = Row.fromSeq(Seq(1, "z3", 10.0, "2022", "01", "08"))
|
||||
val ptField2 = List("year", "month", "day").asJava
|
||||
val ptPos2 = Map("year" -> org.apache.hudi.common.util.collection.Pair.of(List(new Integer(3)).asJava, DataTypes.StringType),
|
||||
"month" -> org.apache.hudi.common.util.collection.Pair.of(List(new Integer(4)).asJava, DataTypes.StringType),
|
||||
"day" -> org.apache.hudi.common.util.collection.Pair.of(List(new Integer(5)).asJava, DataTypes.StringType)
|
||||
).asJava
|
||||
Assertions.assertEquals("2022/01/08",
|
||||
getPartitionPathFromRow(row2, ptField2, false, ptPos2))
|
||||
Assertions.assertEquals("year=2022/month=01/day=08",
|
||||
getPartitionPathFromRow(row2, ptField2, true, ptPos2))
|
||||
|
||||
/** multiple partitions which contains TimeStamp type or Instant type */
|
||||
val timestamp = Timestamp.valueOf("2020-01-08 10:00:00")
|
||||
val instant = timestamp.toInstant
|
||||
val ptField3 = List("event", "event_time").asJava
|
||||
val ptPos3 = Map("event" -> org.apache.hudi.common.util.collection.Pair.of(List(new Integer(3)).asJava, DataTypes.StringType),
|
||||
"event_time" -> org.apache.hudi.common.util.collection.Pair.of(List(new Integer(4)).asJava, DataTypes.TimestampType)
|
||||
).asJava
|
||||
|
||||
// with timeStamp type
|
||||
val row2_ts = Row.fromSeq(Seq(1, "z3", 10.0, "click", timestamp))
|
||||
Assertions.assertEquals("click/2020-01-08 10:00:00.0",
|
||||
getPartitionPathFromRow(row2_ts, ptField3, false, ptPos3))
|
||||
Assertions.assertEquals("event=click/event_time=2020-01-08 10:00:00.0",
|
||||
getPartitionPathFromRow(row2_ts, ptField3, true, ptPos3))
|
||||
|
||||
// with instant type
|
||||
val row2_instant = Row.fromSeq(Seq(1, "z3", 10.0, "click", instant))
|
||||
Assertions.assertEquals("click/2020-01-08 10:00:00.0",
|
||||
getPartitionPathFromRow(row2_instant, ptField3, false, ptPos3))
|
||||
Assertions.assertEquals("event=click/event_time=2020-01-08 10:00:00.0",
|
||||
getPartitionPathFromRow(row2_instant, ptField3, true, ptPos3))
|
||||
|
||||
/** mixed case with plain and nested partitions */
|
||||
val nestedRow4 = Row.fromSeq(Seq(instant, "ad"))
|
||||
val ptField4 = List("event_time").asJava
|
||||
val ptPos4 = Map("event_time" -> org.apache.hudi.common.util.collection.Pair.of(List(new Integer(3), new Integer(0)).asJava, DataTypes.TimestampType)).asJava
|
||||
// with instant type
|
||||
val row4 = Row.fromSeq(Seq(1, "z3", 10.0, nestedRow4, "click"))
|
||||
Assertions.assertEquals("2020-01-08 10:00:00.0",
|
||||
getPartitionPathFromRow(row4, ptField4, false, ptPos4))
|
||||
Assertions.assertEquals("event_time=2020-01-08 10:00:00.0",
|
||||
getPartitionPathFromRow(row4, ptField4, true, ptPos4))
|
||||
|
||||
val nestedRow5 = Row.fromSeq(Seq(timestamp, "ad"))
|
||||
val ptField5 = List("event", "event_time").asJava
|
||||
val ptPos5 = Map(
|
||||
"event_time" -> org.apache.hudi.common.util.collection.Pair.of(List(new Integer(3), new Integer(0)).asJava, DataTypes.TimestampType),
|
||||
"event" -> org.apache.hudi.common.util.collection.Pair.of(List(new Integer(4)).asJava, DataTypes.StringType)
|
||||
).asJava
|
||||
val row5 = Row.fromSeq(Seq(1, "z3", 10.0, nestedRow5, "click"))
|
||||
Assertions.assertEquals("click/2020-01-08 10:00:00.0",
|
||||
getPartitionPathFromRow(row5, ptField5, false, ptPos5))
|
||||
Assertions.assertEquals("event=click/event_time=2020-01-08 10:00:00.0",
|
||||
getPartitionPathFromRow(row5, ptField5, true, ptPos5))
|
||||
}
|
||||
}
|
||||
@@ -41,7 +41,7 @@ class TestHoodieUnsafeRowUtils {
|
||||
|
||||
assertEquals(
|
||||
Seq((1, schema(1)), (0, schema(1).dataType.asInstanceOf[StructType](0))),
|
||||
composeNestedFieldPath(schema, "bar.baz").toSeq)
|
||||
composeNestedFieldPath(schema, "bar.baz").parts.toSeq)
|
||||
|
||||
assertThrows(classOf[IllegalArgumentException]) { () =>
|
||||
composeNestedFieldPath(schema, "foo.baz")
|
||||
@@ -148,6 +148,7 @@ class TestHoodieUnsafeRowUtils {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO rebase on ScalaAssertionSupport
|
||||
private def assertThrows[T <: Throwable](expectedExceptionClass: Class[T])(f: () => Unit): T = {
|
||||
try {
|
||||
f.apply()
|
||||
|
||||
Reference in New Issue
Block a user