[HUDI-2161] Adding support to disable meta columns with bulk insert operation (#3247)
This commit is contained in:
committed by
GitHub
parent
2099bf41db
commit
d5026e9a24
@@ -403,6 +403,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
|
||||
upgradeDowngrade.run(metaClient, HoodieTableVersion.current(), config, context, instantTime);
|
||||
}
|
||||
}
|
||||
metaClient.validateTableProperties(config.getProps(), operationType);
|
||||
return getTableAndInitCtx(metaClient, operationType, instantTime);
|
||||
}
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
|
||||
@@ -33,12 +33,19 @@ public interface HoodieInternalRowFileWriter {
|
||||
boolean canWrite();
|
||||
|
||||
/**
|
||||
* Writes an {@link InternalRow} to the HoodieInternalRowFileWriter.
|
||||
* Writes an {@link InternalRow} to the HoodieInternalRowFileWriter. Also takes in associated record key to be added to bloom filter if required.
|
||||
*
|
||||
* @throws IOException on any exception while writing.
|
||||
*/
|
||||
void writeRow(String key, InternalRow row) throws IOException;
|
||||
|
||||
/**
|
||||
* Writes an {@link InternalRow} to the HoodieInternalRowFileWriter.
|
||||
*
|
||||
* @throws IOException on any exception while writing.
|
||||
*/
|
||||
void writeRow(InternalRow row) throws IOException;
|
||||
|
||||
/**
|
||||
* Closes the {@link HoodieInternalRowFileWriter} and may not take in any more writes.
|
||||
*/
|
||||
@@ -16,21 +16,22 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
|
||||
|
||||
/**
|
||||
* Factory to assist in instantiating a new {@link HoodieInternalRowFileWriter}.
|
||||
*/
|
||||
@@ -76,4 +77,29 @@ public class HoodieInternalRowFileWriterFactory {
|
||||
writeSupport.getHadoopConf(),
|
||||
writeConfig.getParquetCompressionRatio()));
|
||||
}
|
||||
|
||||
public static HoodieInternalRowFileWriter getInternalRowFileWriterWithoutMetaFields(
|
||||
Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema)
|
||||
throws IOException {
|
||||
if (PARQUET.getFileExtension().equals(hoodieTable.getBaseFileExtension())) {
|
||||
return newParquetInternalRowFileWriterWithoutMetaFields(path, config, schema, hoodieTable);
|
||||
}
|
||||
throw new HoodieIOException(hoodieTable.getBaseFileExtension() + " format not supported yet in row writer path");
|
||||
}
|
||||
|
||||
private static HoodieInternalRowFileWriter newParquetInternalRowFileWriterWithoutMetaFields(
|
||||
Path path, HoodieWriteConfig writeConfig, StructType structType, HoodieTable table)
|
||||
throws IOException {
|
||||
HoodieRowParquetWriteSupport writeSupport =
|
||||
new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, null);
|
||||
return new HoodieInternalRowParquetWriter(
|
||||
path, new HoodieRowParquetConfig(
|
||||
writeSupport,
|
||||
writeConfig.getParquetCompressionCodec(),
|
||||
writeConfig.getParquetBlockSize(),
|
||||
writeConfig.getParquetPageSize(),
|
||||
writeConfig.getParquetMaxFileSize(),
|
||||
writeSupport.getHadoopConf(),
|
||||
writeConfig.getParquetCompressionRatio()));
|
||||
}
|
||||
}
|
||||
@@ -16,7 +16,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
@@ -65,6 +65,11 @@ public class HoodieInternalRowParquetWriter extends ParquetWriter<InternalRow>
|
||||
writeSupport.add(key);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeRow(InternalRow row) throws IOException {
|
||||
super.write(row);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
@@ -16,7 +16,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io;
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hudi.client.HoodieInternalWriteStatus;
|
||||
import org.apache.hudi.client.model.HoodieInternalRow;
|
||||
@@ -30,8 +30,6 @@ import org.apache.hudi.common.util.HoodieTimer;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.HoodieInsertException;
|
||||
import org.apache.hudi.io.storage.HoodieInternalRowFileWriter;
|
||||
import org.apache.hudi.io.storage.HoodieInternalRowFileWriterFactory;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.MarkerFiles;
|
||||
|
||||
@@ -61,12 +59,12 @@ public class HoodieRowCreateHandle implements Serializable {
|
||||
private final long taskEpochId;
|
||||
private final HoodieTable table;
|
||||
private final HoodieWriteConfig writeConfig;
|
||||
private final HoodieInternalRowFileWriter fileWriter;
|
||||
protected final HoodieInternalRowFileWriter fileWriter;
|
||||
private final String partitionPath;
|
||||
private final Path path;
|
||||
private final String fileId;
|
||||
private final FileSystem fs;
|
||||
private final HoodieInternalWriteStatus writeStatus;
|
||||
protected final HoodieInternalWriteStatus writeStatus;
|
||||
private final HoodieTimer currTimer;
|
||||
|
||||
public HoodieRowCreateHandle(HoodieTable table, HoodieWriteConfig writeConfig, String partitionPath, String fileId,
|
||||
@@ -197,7 +195,7 @@ public class HoodieRowCreateHandle implements Serializable {
|
||||
return taskPartitionId + "-" + taskId + "-" + taskEpochId;
|
||||
}
|
||||
|
||||
private HoodieInternalRowFileWriter createNewFileWriter(
|
||||
protected HoodieInternalRowFileWriter createNewFileWriter(
|
||||
Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema)
|
||||
throws IOException {
|
||||
return HoodieInternalRowFileWriterFactory.getInternalRowFileWriter(
|
||||
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* RowCreateHandle to be used when meta fields are disabled.
|
||||
*/
|
||||
public class HoodieRowCreateHandleWithoutMetaFields extends HoodieRowCreateHandle {
|
||||
|
||||
public HoodieRowCreateHandleWithoutMetaFields(HoodieTable table, HoodieWriteConfig writeConfig, String partitionPath, String fileId, String instantTime,
|
||||
int taskPartitionId, long taskId, long taskEpochId, StructType structType) {
|
||||
super(table, writeConfig, partitionPath, fileId, instantTime, taskPartitionId, taskId, taskEpochId, structType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the incoming InternalRow as is.
|
||||
*
|
||||
* @param record instance of {@link InternalRow} that needs to be written to the fileWriter.
|
||||
* @throws IOException
|
||||
*/
|
||||
@Override
|
||||
public void write(InternalRow record) throws IOException {
|
||||
try {
|
||||
fileWriter.writeRow(record);
|
||||
writeStatus.markSuccess();
|
||||
} catch (Throwable ge) {
|
||||
writeStatus.setGlobalError(ge);
|
||||
throw new HoodieException("Exception thrown while writing spark InternalRows to file ", ge);
|
||||
}
|
||||
}
|
||||
|
||||
protected HoodieInternalRowFileWriter createNewFileWriter(
|
||||
Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema)
|
||||
throws IOException {
|
||||
return HoodieInternalRowFileWriterFactory.getInternalRowFileWriterWithoutMetaFields(
|
||||
path, hoodieTable, config, schema);
|
||||
}
|
||||
}
|
||||
@@ -16,7 +16,9 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hudi.io.storage.HoodieBaseParquetConfig;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
@@ -16,7 +16,7 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
@@ -18,21 +18,28 @@
|
||||
|
||||
package org.apache.hudi.keygen;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.ApiMaturityLevel;
|
||||
import org.apache.hudi.AvroConversionHelper;
|
||||
import org.apache.hudi.HoodieSparkUtils;
|
||||
import org.apache.hudi.PublicAPIMethod;
|
||||
import org.apache.hudi.client.utils.SparkRowSerDe;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.HoodieKeyException;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.DataType;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
import scala.Function1;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import scala.Function1;
|
||||
|
||||
/**
|
||||
* Base class for the built-in key generators. Contains methods structured for
|
||||
* code reuse amongst them.
|
||||
@@ -42,10 +49,12 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
|
||||
private static final String STRUCT_NAME = "hoodieRowTopLevelField";
|
||||
private static final String NAMESPACE = "hoodieRow";
|
||||
private transient Function1<Object, Object> converterFn = null;
|
||||
private SparkRowSerDe sparkRowSerDe;
|
||||
protected StructType structType;
|
||||
|
||||
protected Map<String, List<Integer>> recordKeyPositions = new HashMap<>();
|
||||
protected Map<String, List<Integer>> partitionPathPositions = new HashMap<>();
|
||||
protected Map<String, List<DataType>> partitionPathDataTypes = null;
|
||||
|
||||
protected BuiltinKeyGenerator(TypedProperties config) {
|
||||
super(config);
|
||||
@@ -81,6 +90,29 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
|
||||
return getKey(genericRecord).getPartitionPath();
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch partition path from {@link InternalRow}.
|
||||
*
|
||||
* @param internalRow {@link InternalRow} instance from which partition path needs to be fetched from.
|
||||
* @param structType schema of the internalRow.
|
||||
* @return the partition path.
|
||||
*/
|
||||
public String getPartitionPath(InternalRow internalRow, StructType structType) {
|
||||
try {
|
||||
initDeserializer(structType);
|
||||
Row row = sparkRowSerDe.deserializeRow(internalRow);
|
||||
return getPartitionPath(row);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieIOException("Conversion of InternalRow to Row failed with exception " + e);
|
||||
}
|
||||
}
|
||||
|
||||
private void initDeserializer(StructType structType) {
|
||||
if (sparkRowSerDe == null) {
|
||||
sparkRowSerDe = HoodieSparkUtils.getDeserializer(structType);
|
||||
}
|
||||
}
|
||||
|
||||
void buildFieldPositionMapIfNeeded(StructType structType) {
|
||||
if (this.structType == null) {
|
||||
// parse simple fields
|
||||
@@ -116,5 +148,39 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
|
||||
this.structType = structType;
|
||||
}
|
||||
}
|
||||
|
||||
protected String getPartitionPathInternal(InternalRow row, StructType structType) {
|
||||
buildFieldDataTypesMapIfNeeded(structType);
|
||||
validatePartitionFieldsForInternalRow();
|
||||
return RowKeyGeneratorHelper.getPartitionPathFromInternalRow(row, getPartitionPathFields(),
|
||||
hiveStylePartitioning, partitionPathPositions, partitionPathDataTypes);
|
||||
}
|
||||
|
||||
protected void validatePartitionFieldsForInternalRow() {
|
||||
partitionPathPositions.entrySet().forEach(entry -> {
|
||||
if (entry.getValue().size() > 1) {
|
||||
throw new IllegalArgumentException("Nested column for partitioning is not supported with disabling meta columns");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void buildFieldDataTypesMapIfNeeded(StructType structType) {
|
||||
buildFieldPositionMapIfNeeded(structType);
|
||||
if (this.partitionPathDataTypes == null) {
|
||||
this.partitionPathDataTypes = new HashMap<>();
|
||||
if (getPartitionPathFields() != null) {
|
||||
// populating simple fields are good enough
|
||||
getPartitionPathFields().stream().filter(f -> !f.isEmpty()).filter(f -> !(f.contains(".")))
|
||||
.forEach(f -> {
|
||||
if (structType.getFieldIndex(f).isDefined()) {
|
||||
partitionPathDataTypes.put(f,
|
||||
Collections.singletonList((structType.fields()[structType.fieldIndex(f)].dataType())));
|
||||
} else {
|
||||
partitionPathDataTypes.put(f, Collections.singletonList(null));
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -17,10 +17,13 @@
|
||||
|
||||
package org.apache.hudi.keygen;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Collectors;
|
||||
@@ -64,4 +67,9 @@ public class ComplexKeyGenerator extends BuiltinKeyGenerator {
|
||||
hiveStylePartitioning, partitionPathPositions);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPartitionPath(InternalRow row, StructType structType) {
|
||||
return getPartitionPathInternal(row, structType);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -18,13 +18,17 @@
|
||||
|
||||
package org.apache.hudi.keygen;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieKeyException;
|
||||
import org.apache.hudi.exception.HoodieKeyGeneratorException;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
@@ -74,10 +78,15 @@ public class CustomKeyGenerator extends BuiltinKeyGenerator {
|
||||
|
||||
@Override
|
||||
public String getPartitionPath(Row row) {
|
||||
return getPartitionPath(Option.empty(), Option.of(row));
|
||||
return getPartitionPath(Option.empty(), Option.of(row), Option.empty());
|
||||
}
|
||||
|
||||
private String getPartitionPath(Option<GenericRecord> record, Option<Row> row) {
|
||||
@Override
|
||||
public String getPartitionPath(InternalRow row, StructType structType) {
|
||||
return getPartitionPath(Option.empty(), Option.empty(), Option.of(Pair.of(row, structType)));
|
||||
}
|
||||
|
||||
private String getPartitionPath(Option<GenericRecord> record, Option<Row> row, Option<Pair<InternalRow, StructType>> internalRowStructTypePair) {
|
||||
if (getPartitionPathFields() == null) {
|
||||
throw new HoodieKeyException("Unable to find field names for partition path in cfg");
|
||||
}
|
||||
@@ -101,16 +110,22 @@ public class CustomKeyGenerator extends BuiltinKeyGenerator {
|
||||
case SIMPLE:
|
||||
if (record.isPresent()) {
|
||||
partitionPath.append(new SimpleKeyGenerator(config, partitionPathField).getPartitionPath(record.get()));
|
||||
} else {
|
||||
} else if (row.isPresent()) {
|
||||
partitionPath.append(new SimpleKeyGenerator(config, partitionPathField).getPartitionPath(row.get()));
|
||||
} else {
|
||||
partitionPath.append(new SimpleKeyGenerator(config, partitionPathField).getPartitionPath(internalRowStructTypePair.get().getKey(),
|
||||
internalRowStructTypePair.get().getValue()));
|
||||
}
|
||||
break;
|
||||
case TIMESTAMP:
|
||||
try {
|
||||
if (record.isPresent()) {
|
||||
partitionPath.append(new TimestampBasedKeyGenerator(config, partitionPathField).getPartitionPath(record.get()));
|
||||
} else {
|
||||
} else if (row.isPresent()) {
|
||||
partitionPath.append(new TimestampBasedKeyGenerator(config, partitionPathField).getPartitionPath(row.get()));
|
||||
} else {
|
||||
partitionPath.append(new TimestampBasedKeyGenerator(config, partitionPathField).getPartitionPath(internalRowStructTypePair.get().getKey(),
|
||||
internalRowStructTypePair.get().getValue()));
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
throw new HoodieKeyGeneratorException("Unable to initialise TimestampBasedKeyGenerator class", ioe);
|
||||
|
||||
@@ -18,10 +18,13 @@
|
||||
|
||||
package org.apache.hudi.keygen;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
@@ -65,5 +68,10 @@ public class GlobalDeleteKeyGenerator extends BuiltinKeyGenerator {
|
||||
public String getPartitionPath(Row row) {
|
||||
return globalAvroDeleteKeyGenerator.getEmptyPartition();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPartitionPath(InternalRow row, StructType structType) {
|
||||
return globalAvroDeleteKeyGenerator.getEmptyPartition();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -22,6 +22,8 @@ import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
@@ -63,5 +65,9 @@ public class NonpartitionedKeyGenerator extends BuiltinKeyGenerator {
|
||||
return nonpartitionedAvroKeyGenerator.getEmptyPartition();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPartitionPath(InternalRow internalRow, StructType structType) {
|
||||
return nonpartitionedAvroKeyGenerator.getEmptyPartition();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,10 +19,13 @@
|
||||
package org.apache.hudi.keygen;
|
||||
|
||||
import org.apache.hudi.exception.HoodieKeyException;
|
||||
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.DataType;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.sql.types.StructField;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
import scala.Option;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
@@ -33,6 +36,8 @@ import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import scala.Option;
|
||||
|
||||
import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH;
|
||||
import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR;
|
||||
import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER;
|
||||
@@ -121,6 +126,54 @@ public class RowKeyGeneratorHelper {
|
||||
}).collect(Collectors.joining(DEFAULT_PARTITION_PATH_SEPARATOR));
|
||||
}
|
||||
|
||||
public static String getPartitionPathFromInternalRow(InternalRow row, List<String> partitionPathFields, boolean hiveStylePartitioning,
|
||||
Map<String, List<Integer>> partitionPathPositions,
|
||||
Map<String, List<DataType>> partitionPathDataTypes) {
|
||||
return IntStream.range(0, partitionPathFields.size()).mapToObj(idx -> {
|
||||
String field = partitionPathFields.get(idx);
|
||||
String val = null;
|
||||
List<Integer> fieldPositions = partitionPathPositions.get(field);
|
||||
if (fieldPositions.size() == 1) { // simple
|
||||
Integer fieldPos = fieldPositions.get(0);
|
||||
// for partition path, if field is not found, index will be set to -1
|
||||
if (fieldPos == -1 || row.isNullAt(fieldPos)) {
|
||||
val = DEFAULT_PARTITION_PATH;
|
||||
} else {
|
||||
Object value = row.get(fieldPos, partitionPathDataTypes.get(field).get(0));
|
||||
if (value == null || value.toString().isEmpty()) {
|
||||
val = DEFAULT_PARTITION_PATH;
|
||||
} else {
|
||||
val = value.toString();
|
||||
}
|
||||
}
|
||||
if (hiveStylePartitioning) {
|
||||
val = field + "=" + val;
|
||||
}
|
||||
} else { // nested
|
||||
throw new IllegalArgumentException("Nested partitioning is not supported with disabling meta columns.");
|
||||
}
|
||||
return val;
|
||||
}).collect(Collectors.joining(DEFAULT_PARTITION_PATH_SEPARATOR));
|
||||
}
|
||||
|
||||
public static Object getFieldValFromInternalRow(InternalRow internalRow,
|
||||
Integer partitionPathPosition,
|
||||
DataType partitionPathDataType) {
|
||||
Object val = null;
|
||||
if (internalRow.isNullAt(partitionPathPosition)) {
|
||||
return DEFAULT_PARTITION_PATH;
|
||||
} else {
|
||||
Object value = partitionPathDataType == DataTypes.StringType ? internalRow.getString(partitionPathPosition) : internalRow.get(partitionPathPosition, partitionPathDataType);
|
||||
if (value == null || value.toString().isEmpty()) {
|
||||
val = DEFAULT_PARTITION_PATH;
|
||||
} else {
|
||||
val = value;
|
||||
}
|
||||
}
|
||||
return val;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Fetch the field value located at the positions requested for.
|
||||
*
|
||||
|
||||
@@ -18,10 +18,13 @@
|
||||
|
||||
package org.apache.hudi.keygen;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.util.Collections;
|
||||
|
||||
@@ -72,4 +75,9 @@ public class SimpleKeyGenerator extends BuiltinKeyGenerator {
|
||||
return RowKeyGeneratorHelper.getPartitionPathFromRow(row, getPartitionPathFields(),
|
||||
hiveStylePartitioning, partitionPathPositions);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPartitionPath(InternalRow row, StructType structType) {
|
||||
return getPartitionPathInternal(row, structType);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,11 +18,14 @@
|
||||
|
||||
package org.apache.hudi.keygen;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.exception.HoodieKeyGeneratorException;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.catalyst.InternalRow;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@@ -64,9 +67,23 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator {
|
||||
|
||||
@Override
|
||||
public String getPartitionPath(Row row) {
|
||||
Object fieldVal = null;
|
||||
buildFieldPositionMapIfNeeded(row.schema());
|
||||
Object partitionPathFieldVal = RowKeyGeneratorHelper.getNestedFieldVal(row, partitionPathPositions.get(getPartitionPathFields().get(0)));
|
||||
return getTimestampBasedPartitionPath(partitionPathFieldVal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPartitionPath(InternalRow internalRow, StructType structType) {
|
||||
buildFieldDataTypesMapIfNeeded(structType);
|
||||
validatePartitionFieldsForInternalRow();
|
||||
Object partitionPathFieldVal = RowKeyGeneratorHelper.getFieldValFromInternalRow(internalRow,
|
||||
partitionPathPositions.get(getPartitionPathFields().get(0)).get(0),
|
||||
partitionPathDataTypes.get(getPartitionPathFields().get(0)).get(0));
|
||||
return getTimestampBasedPartitionPath(partitionPathFieldVal);
|
||||
}
|
||||
|
||||
private String getTimestampBasedPartitionPath(Object partitionPathFieldVal) {
|
||||
Object fieldVal = null;
|
||||
try {
|
||||
if (partitionPathFieldVal == null || partitionPathFieldVal.toString().contains(DEFAULT_PARTITION_PATH) || partitionPathFieldVal.toString().contains(NULL_RECORDKEY_PLACEHOLDER)
|
||||
|| partitionPathFieldVal.toString().contains(EMPTY_RECORDKEY_PLACEHOLDER)) {
|
||||
|
||||
Reference in New Issue
Block a user