[HUDI-4039] Make sure all builtin KeyGenerators properly implement Spark specific APIs (#5523)
This set of changes makes sure that all builtin KeyGenerators properly implement Spark-specific APIs in a performant way (minimizing key-generators overhead)
This commit is contained in:
@@ -23,7 +23,6 @@ import org.apache.hudi.client.HoodieInternalWriteStatus;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.PartitionPathEncodeUtils;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.io.storage.row.HoodieRowCreateHandle;
|
||||
@@ -44,6 +43,7 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Properties;
|
||||
import java.util.UUID;
|
||||
|
||||
@@ -74,7 +74,7 @@ public class BulkInsertDataInternalWriterHelper {
|
||||
* NOTE: This is stored as Catalyst's internal {@link UTF8String} to avoid
|
||||
* conversion (deserialization) b/w {@link UTF8String} and {@link String}
|
||||
*/
|
||||
private String lastKnownPartitionPath = null;
|
||||
private UTF8String lastKnownPartitionPath = null;
|
||||
private HoodieRowCreateHandle handle;
|
||||
private int numFilesWritten = 0;
|
||||
|
||||
@@ -133,11 +133,13 @@ public class BulkInsertDataInternalWriterHelper {
|
||||
|
||||
public void write(InternalRow row) throws IOException {
|
||||
try {
|
||||
String partitionPath = extractPartitionPath(row);
|
||||
if (lastKnownPartitionPath == null || !lastKnownPartitionPath.equals(partitionPath) || !handle.canWrite()) {
|
||||
UTF8String partitionPath = extractPartitionPath(row);
|
||||
if (lastKnownPartitionPath == null || !Objects.equals(lastKnownPartitionPath, partitionPath) || !handle.canWrite()) {
|
||||
LOG.info("Creating new file for partition path " + partitionPath);
|
||||
handle = getRowCreateHandle(partitionPath);
|
||||
lastKnownPartitionPath = partitionPath;
|
||||
handle = getRowCreateHandle(partitionPath.toString());
|
||||
// NOTE: It's crucial to make a copy here, since [[UTF8String]] could be pointing into
|
||||
// a mutable underlying buffer
|
||||
lastKnownPartitionPath = partitionPath.clone();
|
||||
}
|
||||
|
||||
handle.write(row);
|
||||
@@ -162,31 +164,19 @@ public class BulkInsertDataInternalWriterHelper {
|
||||
handle = null;
|
||||
}
|
||||
|
||||
private String extractPartitionPath(InternalRow row) {
|
||||
String partitionPath;
|
||||
private UTF8String extractPartitionPath(InternalRow row) {
|
||||
if (populateMetaFields) {
|
||||
// In case meta-fields are materialized w/in the table itself, we can just simply extract
|
||||
// partition path from there
|
||||
//
|
||||
// NOTE: Helper keeps track of [[lastKnownPartitionPath]] as [[UTF8String]] to avoid
|
||||
// conversion from Catalyst internal representation into a [[String]]
|
||||
partitionPath = row.getString(HoodieRecord.PARTITION_PATH_META_FIELD_POS);
|
||||
return row.getUTF8String(HoodieRecord.PARTITION_PATH_META_FIELD_ORD);
|
||||
} else if (keyGeneratorOpt.isPresent()) {
|
||||
// TODO(HUDI-4039) this should be handled by the SimpleKeyGenerator itself
|
||||
if (simpleKeyGen) {
|
||||
String partitionPathValue = row.get(simplePartitionFieldIndex, simplePartitionFieldDataType).toString();
|
||||
partitionPath = partitionPathValue != null ? partitionPathValue : PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH;
|
||||
if (writeConfig.isHiveStylePartitioningEnabled()) {
|
||||
partitionPath = (keyGeneratorOpt.get()).getPartitionPathFields().get(0) + "=" + partitionPath;
|
||||
}
|
||||
} else {
|
||||
// only BuiltIn key generators are supported if meta fields are disabled.
|
||||
partitionPath = keyGeneratorOpt.get().getPartitionPath(row, structType);
|
||||
}
|
||||
return keyGeneratorOpt.get().getPartitionPath(row, structType);
|
||||
} else {
|
||||
partitionPath = "";
|
||||
return UTF8String.EMPTY_UTF8;
|
||||
}
|
||||
return partitionPath;
|
||||
}
|
||||
|
||||
private HoodieRowCreateHandle getRowCreateHandle(String partitionPath) throws IOException {
|
||||
@@ -209,7 +199,7 @@ public class BulkInsertDataInternalWriterHelper {
|
||||
|
||||
private HoodieRowCreateHandle createHandle(String partitionPath) {
|
||||
return new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
|
||||
instantTime, taskPartitionId, taskId, taskEpochId, structType, populateMetaFields);
|
||||
instantTime, taskPartitionId, taskId, taskEpochId, structType);
|
||||
}
|
||||
|
||||
private String getNextFileId() {
|
||||
|
||||
@@ -23,7 +23,7 @@ import org.apache.hudi.common.model.HoodieRecord
|
||||
import org.apache.hudi.common.util.ReflectionUtils
|
||||
import org.apache.hudi.config.HoodieWriteConfig
|
||||
import org.apache.hudi.index.SparkHoodieIndexFactory
|
||||
import org.apache.hudi.keygen.BuiltinKeyGenerator
|
||||
import org.apache.hudi.keygen.{BuiltinKeyGenerator, SparkKeyGeneratorInterface}
|
||||
import org.apache.hudi.table.BulkInsertPartitioner
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.rdd.RDD
|
||||
@@ -63,13 +63,12 @@ object HoodieDatasetBulkInsertHelper extends Logging {
|
||||
df.queryExecution.toRdd.mapPartitions { iter =>
|
||||
val keyGenerator =
|
||||
ReflectionUtils.loadClass(keyGeneratorClassName, new TypedProperties(config.getProps))
|
||||
.asInstanceOf[BuiltinKeyGenerator]
|
||||
.asInstanceOf[SparkKeyGeneratorInterface]
|
||||
|
||||
iter.map { row =>
|
||||
val (recordKey, partitionPath) =
|
||||
if (populateMetaFields) {
|
||||
(UTF8String.fromString(keyGenerator.getRecordKey(row, schema)),
|
||||
UTF8String.fromString(keyGenerator.getPartitionPath(row, schema)))
|
||||
(keyGenerator.getRecordKey(row, schema), keyGenerator.getPartitionPath(row, schema))
|
||||
} else {
|
||||
(UTF8String.EMPTY_UTF8, UTF8String.EMPTY_UTF8)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user