1
0

[HUDI-4039] Make sure all builtin KeyGenerators properly implement Spark specific APIs (#5523)

This set of changes makes sure that all builtin KeyGenerators properly implement Spark-specific APIs in a performant way (minimizing key-generator overhead).
This commit is contained in:
Alexey Kudinkin
2022-07-22 08:35:07 -07:00
committed by GitHub
parent d5c904e10e
commit eea4a692c0
52 changed files with 1507 additions and 1363 deletions

View File

@@ -23,7 +23,6 @@ import org.apache.hudi.client.HoodieInternalWriteStatus;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.PartitionPathEncodeUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.io.storage.row.HoodieRowCreateHandle;
@@ -44,6 +43,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
import java.util.UUID;
@@ -74,7 +74,7 @@ public class BulkInsertDataInternalWriterHelper {
* NOTE: This is stored as Catalyst's internal {@link UTF8String} to avoid
* conversion (deserialization) b/w {@link UTF8String} and {@link String}
*/
private String lastKnownPartitionPath = null;
private UTF8String lastKnownPartitionPath = null;
private HoodieRowCreateHandle handle;
private int numFilesWritten = 0;
@@ -133,11 +133,13 @@ public class BulkInsertDataInternalWriterHelper {
public void write(InternalRow row) throws IOException {
try {
String partitionPath = extractPartitionPath(row);
if (lastKnownPartitionPath == null || !lastKnownPartitionPath.equals(partitionPath) || !handle.canWrite()) {
UTF8String partitionPath = extractPartitionPath(row);
if (lastKnownPartitionPath == null || !Objects.equals(lastKnownPartitionPath, partitionPath) || !handle.canWrite()) {
LOG.info("Creating new file for partition path " + partitionPath);
handle = getRowCreateHandle(partitionPath);
lastKnownPartitionPath = partitionPath;
handle = getRowCreateHandle(partitionPath.toString());
// NOTE: It's crucial to make a copy here, since [[UTF8String]] could be pointing into
// a mutable underlying buffer
lastKnownPartitionPath = partitionPath.clone();
}
handle.write(row);
@@ -162,31 +164,19 @@ public class BulkInsertDataInternalWriterHelper {
handle = null;
}
private String extractPartitionPath(InternalRow row) {
String partitionPath;
private UTF8String extractPartitionPath(InternalRow row) {
if (populateMetaFields) {
// In case meta-fields are materialized w/in the table itself, we can just simply extract
// partition path from there
//
// NOTE: Helper keeps track of [[lastKnownPartitionPath]] as [[UTF8String]] to avoid
// conversion from Catalyst internal representation into a [[String]]
partitionPath = row.getString(HoodieRecord.PARTITION_PATH_META_FIELD_POS);
return row.getUTF8String(HoodieRecord.PARTITION_PATH_META_FIELD_ORD);
} else if (keyGeneratorOpt.isPresent()) {
// TODO(HUDI-4039) this should be handled by the SimpleKeyGenerator itself
if (simpleKeyGen) {
String partitionPathValue = row.get(simplePartitionFieldIndex, simplePartitionFieldDataType).toString();
partitionPath = partitionPathValue != null ? partitionPathValue : PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH;
if (writeConfig.isHiveStylePartitioningEnabled()) {
partitionPath = (keyGeneratorOpt.get()).getPartitionPathFields().get(0) + "=" + partitionPath;
}
} else {
// only BuiltIn key generators are supported if meta fields are disabled.
partitionPath = keyGeneratorOpt.get().getPartitionPath(row, structType);
}
return keyGeneratorOpt.get().getPartitionPath(row, structType);
} else {
partitionPath = "";
return UTF8String.EMPTY_UTF8;
}
return partitionPath;
}
private HoodieRowCreateHandle getRowCreateHandle(String partitionPath) throws IOException {
@@ -209,7 +199,7 @@ public class BulkInsertDataInternalWriterHelper {
private HoodieRowCreateHandle createHandle(String partitionPath) {
return new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
instantTime, taskPartitionId, taskId, taskEpochId, structType, populateMetaFields);
instantTime, taskPartitionId, taskId, taskEpochId, structType);
}
private String getNextFileId() {

View File

@@ -23,7 +23,7 @@ import org.apache.hudi.common.model.HoodieRecord
import org.apache.hudi.common.util.ReflectionUtils
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.index.SparkHoodieIndexFactory
import org.apache.hudi.keygen.BuiltinKeyGenerator
import org.apache.hudi.keygen.{BuiltinKeyGenerator, SparkKeyGeneratorInterface}
import org.apache.hudi.table.BulkInsertPartitioner
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
@@ -63,13 +63,12 @@ object HoodieDatasetBulkInsertHelper extends Logging {
df.queryExecution.toRdd.mapPartitions { iter =>
val keyGenerator =
ReflectionUtils.loadClass(keyGeneratorClassName, new TypedProperties(config.getProps))
.asInstanceOf[BuiltinKeyGenerator]
.asInstanceOf[SparkKeyGeneratorInterface]
iter.map { row =>
val (recordKey, partitionPath) =
if (populateMetaFields) {
(UTF8String.fromString(keyGenerator.getRecordKey(row, schema)),
UTF8String.fromString(keyGenerator.getPartitionPath(row, schema)))
(keyGenerator.getRecordKey(row, schema), keyGenerator.getPartitionPath(row, schema))
} else {
(UTF8String.EMPTY_UTF8, UTF8String.EMPTY_UTF8)
}