
[HUDI-2161] Adding support to disable meta columns with bulk insert operation (#3247)

Author: Sivabalan Narayanan
Date: 2021-07-19 20:43:48 -04:00
Committed by: GitHub
Parent: 2099bf41db
Commit: d5026e9a24
53 changed files with 1063 additions and 269 deletions

View File

@@ -62,6 +62,10 @@ public class HoodieInternalWriteStatus implements Serializable {
     totalRecords++;
   }
 
+  public void markSuccess() {
+    totalRecords++;
+  }
+
   public void markFailure(String recordKey, Throwable t) {
     if (failedRecordKeys.isEmpty() || (random.nextDouble() <= failureFraction)) {
       failedRecordKeys.add(Pair.of(recordKey, t));

View File

@@ -31,6 +31,7 @@ import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
 import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
 import org.apache.hudi.common.model.WriteConcurrencyMode;
+import org.apache.hudi.common.table.HoodieTableConfig;
 import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
 import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
 import org.apache.hudi.common.util.ReflectionUtils;
@@ -602,6 +603,11 @@ public class HoodieWriteConfig extends HoodieConfig {
     return engineType;
   }
 
+  public boolean populateMetaFields() {
+    return Boolean.parseBoolean(getStringOrDefault(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS,
+        HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()));
+  }
+
   /**
    * compaction properties.
    */
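Note: populateMetaFields() is the switch the rest of this commit keys off. A minimal standalone sketch of its lookup semantics, assuming the table-level key is "hoodie.populate.meta.fields" with a default of true (the exact key string and default live in HoodieTableConfig):

import java.util.Properties;

// Sketch of the getter above: parse the flag, falling back to the default when unset.
public class PopulateMetaFieldsSketch {
  static boolean populateMetaFields(Properties props) {
    return Boolean.parseBoolean(props.getProperty("hoodie.populate.meta.fields", "true"));
  }

  public static void main(String[] args) {
    Properties props = new Properties();
    System.out.println(populateMetaFields(props)); // true: meta columns written by default
    props.setProperty("hoodie.populate.meta.fields", "false");
    System.out.println(populateMetaFields(props)); // false: bulk insert skips meta columns
  }
}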

View File

@@ -403,6 +403,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
         upgradeDowngrade.run(metaClient, HoodieTableVersion.current(), config, context, instantTime);
       }
     }
+    metaClient.validateTableProperties(config.getProps(), operationType);
     return getTableAndInitCtx(metaClient, operationType, instantTime);
   }

View File

@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.hudi.io.storage;
+package org.apache.hudi.io.storage.row;
 
 import org.apache.spark.sql.catalyst.InternalRow;
@@ -33,12 +33,19 @@ public interface HoodieInternalRowFileWriter {
   boolean canWrite();
 
   /**
-   * Writes an {@link InternalRow} to the HoodieInternalRowFileWriter.
+   * Writes an {@link InternalRow} to the HoodieInternalRowFileWriter. Also takes in associated record key to be added to bloom filter if required.
    *
    * @throws IOException on any exception while writing.
    */
   void writeRow(String key, InternalRow row) throws IOException;
 
+  /**
+   * Writes an {@link InternalRow} to the HoodieInternalRowFileWriter.
+   *
+   * @throws IOException on any exception while writing.
+   */
+  void writeRow(InternalRow row) throws IOException;
+
   /**
    * Closes the {@link HoodieInternalRowFileWriter} and may not take in any more writes.
    */

View File

@@ -16,21 +16,22 @@
  * limitations under the License.
  */
 
-package org.apache.hudi.io.storage;
+package org.apache.hudi.io.storage.row;
 
 import org.apache.hudi.common.bloom.BloomFilter;
 import org.apache.hudi.common.bloom.BloomFilterFactory;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.table.HoodieTable;
 
 import org.apache.hadoop.fs.Path;
 import org.apache.spark.sql.types.StructType;
 
-import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
-
 import java.io.IOException;
 
+import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
+
 /**
  * Factory to assist in instantiating a new {@link HoodieInternalRowFileWriter}.
  */
@@ -76,4 +77,29 @@ public class HoodieInternalRowFileWriterFactory {
             writeSupport.getHadoopConf(),
             writeConfig.getParquetCompressionRatio()));
   }
 
+  public static HoodieInternalRowFileWriter getInternalRowFileWriterWithoutMetaFields(
+      Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema)
+      throws IOException {
+    if (PARQUET.getFileExtension().equals(hoodieTable.getBaseFileExtension())) {
+      return newParquetInternalRowFileWriterWithoutMetaFields(path, config, schema, hoodieTable);
+    }
+    throw new HoodieIOException(hoodieTable.getBaseFileExtension() + " format not supported yet in row writer path");
+  }
+
+  private static HoodieInternalRowFileWriter newParquetInternalRowFileWriterWithoutMetaFields(
+      Path path, HoodieWriteConfig writeConfig, StructType structType, HoodieTable table)
+      throws IOException {
+    HoodieRowParquetWriteSupport writeSupport =
+        new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, null);
+    return new HoodieInternalRowParquetWriter(
+        path, new HoodieRowParquetConfig(
+            writeSupport,
+            writeConfig.getParquetCompressionCodec(),
+            writeConfig.getParquetBlockSize(),
+            writeConfig.getParquetPageSize(),
+            writeConfig.getParquetMaxFileSize(),
+            writeSupport.getHadoopConf(),
+            writeConfig.getParquetCompressionRatio()));
+  }
 }
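The new factory method mirrors the existing meta-field path: check the table's base-file extension, wire a Parquet write support with a null bloom filter (no record keys are tracked without meta fields), and refuse anything else. A condensed sketch of that dispatch, with stand-in types rather than Hudi's:

import java.io.IOException;

// Stand-in: RowFileWriter plays the role of HoodieInternalRowFileWriter.
class RowWriterFactorySketch {
  interface RowFileWriter { }

  static final String PARQUET_EXTENSION = ".parquet";

  static RowFileWriter writerWithoutMetaFields(String baseFileExtension) throws IOException {
    if (PARQUET_EXTENSION.equals(baseFileExtension)) {
      // the real code constructs HoodieInternalRowParquetWriter from a HoodieRowParquetConfig
      return new RowFileWriter() { };
    }
    // only Parquet is wired up in the row-writer path so far; fail fast on anything else
    throw new IOException(baseFileExtension + " format not supported yet in row writer path");
  }
}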

View File

@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.hudi.io.storage;
+package org.apache.hudi.io.storage.row;
 
 import org.apache.hadoop.fs.Path;
 import org.apache.hudi.common.fs.FSUtils;
@@ -65,6 +65,11 @@ public class HoodieInternalRowParquetWriter extends ParquetWriter<InternalRow>
     writeSupport.add(key);
   }
 
+  @Override
+  public void writeRow(InternalRow row) throws IOException {
+    super.write(row);
+  }
+
   @Override
   public void close() throws IOException {
     super.close();

View File

@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.hudi.io;
+package org.apache.hudi.io.storage.row;
 
 import org.apache.hudi.client.HoodieInternalWriteStatus;
 import org.apache.hudi.client.model.HoodieInternalRow;
@@ -30,8 +30,6 @@ import org.apache.hudi.common.util.HoodieTimer;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.exception.HoodieInsertException;
-import org.apache.hudi.io.storage.HoodieInternalRowFileWriter;
-import org.apache.hudi.io.storage.HoodieInternalRowFileWriterFactory;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.MarkerFiles;
@@ -61,12 +59,12 @@ public class HoodieRowCreateHandle implements Serializable {
   private final long taskEpochId;
   private final HoodieTable table;
   private final HoodieWriteConfig writeConfig;
-  private final HoodieInternalRowFileWriter fileWriter;
+  protected final HoodieInternalRowFileWriter fileWriter;
   private final String partitionPath;
   private final Path path;
   private final String fileId;
   private final FileSystem fs;
-  private final HoodieInternalWriteStatus writeStatus;
+  protected final HoodieInternalWriteStatus writeStatus;
   private final HoodieTimer currTimer;
 
   public HoodieRowCreateHandle(HoodieTable table, HoodieWriteConfig writeConfig, String partitionPath, String fileId,
@@ -197,7 +195,7 @@ public class HoodieRowCreateHandle implements Serializable {
     return taskPartitionId + "-" + taskId + "-" + taskEpochId;
   }
 
-  private HoodieInternalRowFileWriter createNewFileWriter(
+  protected HoodieInternalRowFileWriter createNewFileWriter(
       Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema)
       throws IOException {
     return HoodieInternalRowFileWriterFactory.getInternalRowFileWriter(

View File

@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.io.storage.row;
+
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.table.HoodieTable;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.StructType;
+
+import java.io.IOException;
+
+/**
+ * RowCreateHandle to be used when meta fields are disabled.
+ */
+public class HoodieRowCreateHandleWithoutMetaFields extends HoodieRowCreateHandle {
+
+  public HoodieRowCreateHandleWithoutMetaFields(HoodieTable table, HoodieWriteConfig writeConfig, String partitionPath, String fileId, String instantTime,
+      int taskPartitionId, long taskId, long taskEpochId, StructType structType) {
+    super(table, writeConfig, partitionPath, fileId, instantTime, taskPartitionId, taskId, taskEpochId, structType);
+  }
+
+  /**
+   * Write the incoming InternalRow as is.
+   *
+   * @param record instance of {@link InternalRow} that needs to be written to the fileWriter.
+   * @throws IOException
+   */
+  @Override
+  public void write(InternalRow record) throws IOException {
+    try {
+      fileWriter.writeRow(record);
+      writeStatus.markSuccess();
+    } catch (Throwable ge) {
+      writeStatus.setGlobalError(ge);
+      throw new HoodieException("Exception thrown while writing spark InternalRows to file ", ge);
+    }
+  }
+
+  protected HoodieInternalRowFileWriter createNewFileWriter(
+      Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema)
+      throws IOException {
+    return HoodieInternalRowFileWriterFactory.getInternalRowFileWriterWithoutMetaFields(
+        path, hoodieTable, config, schema);
+  }
+}
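This override is the heart of the meta-field-free path: the incoming InternalRow goes to the file writer untouched, with no meta columns prepended and no record key handed to the bloom filter. It also explains the keyless markSuccess() overload added to HoodieInternalWriteStatus at the top of the commit: with no record key to track, only the counter is bumped. A self-contained sketch of that write contract, with stand-in types rather than Hudi's:

import java.io.IOException;

class WriteWithoutMetaFieldsSketch {
  interface RowWriter { void writeRow(Object row) throws IOException; }

  static class WriteStatus {
    long totalRecords;
    Throwable globalError;
    void markSuccess() { totalRecords++; }             // mirrors the new keyless markSuccess()
    void setGlobalError(Throwable t) { globalError = t; }
  }

  static void write(Object row, RowWriter writer, WriteStatus status) {
    try {
      writer.writeRow(row);  // row written exactly as received
      status.markSuccess();
    } catch (Throwable t) {
      // record the cause on the status, then fail the task loudly
      status.setGlobalError(t);
      throw new RuntimeException("Exception thrown while writing spark InternalRows to file", t);
    }
  }
}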

View File

@@ -16,7 +16,9 @@
  * limitations under the License.
  */
 
-package org.apache.hudi.io.storage;
+package org.apache.hudi.io.storage.row;
+
+import org.apache.hudi.io.storage.HoodieBaseParquetConfig;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;

View File

@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.hudi.io.storage;
+package org.apache.hudi.io.storage.row;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hudi.common.bloom.BloomFilter;

View File

@@ -18,21 +18,28 @@
 package org.apache.hudi.keygen;
 
-import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.ApiMaturityLevel;
 import org.apache.hudi.AvroConversionHelper;
+import org.apache.hudi.HoodieSparkUtils;
 import org.apache.hudi.PublicAPIMethod;
+import org.apache.hudi.client.utils.SparkRowSerDe;
 import org.apache.hudi.common.config.TypedProperties;
+import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.exception.HoodieKeyException;
+
+import org.apache.avro.generic.GenericRecord;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.StructType;
-import scala.Function1;
 
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+
+import scala.Function1;
 
 /**
  * Base class for the built-in key generators. Contains methods structured for
  * code reuse amongst them.
@@ -42,10 +49,12 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
   private static final String STRUCT_NAME = "hoodieRowTopLevelField";
   private static final String NAMESPACE = "hoodieRow";
   private transient Function1<Object, Object> converterFn = null;
+  private SparkRowSerDe sparkRowSerDe;
 
   protected StructType structType;
   protected Map<String, List<Integer>> recordKeyPositions = new HashMap<>();
   protected Map<String, List<Integer>> partitionPathPositions = new HashMap<>();
+  protected Map<String, List<DataType>> partitionPathDataTypes = null;
 
   protected BuiltinKeyGenerator(TypedProperties config) {
     super(config);
@@ -81,6 +90,29 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
     return getKey(genericRecord).getPartitionPath();
   }
 
+  /**
+   * Fetch partition path from {@link InternalRow}.
+   *
+   * @param internalRow {@link InternalRow} instance from which partition path needs to be fetched from.
+   * @param structType  schema of the internalRow.
+   * @return the partition path.
+   */
+  public String getPartitionPath(InternalRow internalRow, StructType structType) {
+    try {
+      initDeserializer(structType);
+      Row row = sparkRowSerDe.deserializeRow(internalRow);
+      return getPartitionPath(row);
+    } catch (Exception e) {
+      throw new HoodieIOException("Conversion of InternalRow to Row failed with exception " + e);
+    }
+  }
+
+  private void initDeserializer(StructType structType) {
+    if (sparkRowSerDe == null) {
+      sparkRowSerDe = HoodieSparkUtils.getDeserializer(structType);
+    }
+  }
+
   void buildFieldPositionMapIfNeeded(StructType structType) {
     if (this.structType == null) {
       // parse simple fields
@@ -116,5 +148,39 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
       this.structType = structType;
     }
   }
 
+  protected String getPartitionPathInternal(InternalRow row, StructType structType) {
+    buildFieldDataTypesMapIfNeeded(structType);
+    validatePartitionFieldsForInternalRow();
+    return RowKeyGeneratorHelper.getPartitionPathFromInternalRow(row, getPartitionPathFields(),
+        hiveStylePartitioning, partitionPathPositions, partitionPathDataTypes);
+  }
+
+  protected void validatePartitionFieldsForInternalRow() {
+    partitionPathPositions.entrySet().forEach(entry -> {
+      if (entry.getValue().size() > 1) {
+        throw new IllegalArgumentException("Nested column for partitioning is not supported with disabling meta columns");
+      }
+    });
+  }
+
+  void buildFieldDataTypesMapIfNeeded(StructType structType) {
+    buildFieldPositionMapIfNeeded(structType);
+    if (this.partitionPathDataTypes == null) {
+      this.partitionPathDataTypes = new HashMap<>();
+      if (getPartitionPathFields() != null) {
+        // populating simple fields are good enough
+        getPartitionPathFields().stream().filter(f -> !f.isEmpty()).filter(f -> !(f.contains(".")))
+            .forEach(f -> {
+              if (structType.getFieldIndex(f).isDefined()) {
+                partitionPathDataTypes.put(f,
+                    Collections.singletonList((structType.fields()[structType.fieldIndex(f)].dataType())));
+              } else {
+                partitionPathDataTypes.put(f, Collections.singletonList(null));
+              }
+            });
+      }
+    }
+  }
 }
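The base-class getPartitionPath(InternalRow, StructType) above is a fallback: it lazily builds a serde, converts the InternalRow back to a Row, and reuses the existing Row logic, while subclasses such as SimpleKeyGenerator and ComplexKeyGenerator override it to read fields positionally and skip the conversion entirely. A minimal sketch of the lazy-deserializer pattern, with a stand-in serde type:

import java.util.function.Supplier;

// RowSerDe stands in for Hudi's SparkRowSerDe; built once from the schema, then cached.
class LazyDeserializerSketch {
  interface RowSerDe { Object deserializeRow(Object internalRow); }

  private final Supplier<RowSerDe> factory;
  private RowSerDe serde;

  LazyDeserializerSketch(Supplier<RowSerDe> factory) {
    this.factory = factory;
  }

  Object toRow(Object internalRow) {
    if (serde == null) {
      serde = factory.get(); // one-time init, mirrors initDeserializer(structType)
    }
    return serde.deserializeRow(internalRow);
  }
}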

View File

@@ -17,10 +17,13 @@
 package org.apache.hudi.keygen;
 
-import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+
+import org.apache.avro.generic.GenericRecord;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.StructType;
 
 import java.util.Arrays;
 import java.util.stream.Collectors;
@@ -64,4 +67,9 @@ public class ComplexKeyGenerator extends BuiltinKeyGenerator {
         hiveStylePartitioning, partitionPathPositions);
   }
 
+  @Override
+  public String getPartitionPath(InternalRow row, StructType structType) {
+    return getPartitionPathInternal(row, structType);
+  }
 }

View File

@@ -18,13 +18,17 @@
 package org.apache.hudi.keygen;
 
-import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.exception.HoodieKeyException;
 import org.apache.hudi.exception.HoodieKeyGeneratorException;
 import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+
+import org.apache.avro.generic.GenericRecord;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.StructType;
 
 import java.io.IOException;
 import java.util.Arrays;
@@ -74,10 +78,15 @@ public class CustomKeyGenerator extends BuiltinKeyGenerator {
   @Override
   public String getPartitionPath(Row row) {
-    return getPartitionPath(Option.empty(), Option.of(row));
+    return getPartitionPath(Option.empty(), Option.of(row), Option.empty());
   }
 
-  private String getPartitionPath(Option<GenericRecord> record, Option<Row> row) {
+  @Override
+  public String getPartitionPath(InternalRow row, StructType structType) {
+    return getPartitionPath(Option.empty(), Option.empty(), Option.of(Pair.of(row, structType)));
+  }
+
+  private String getPartitionPath(Option<GenericRecord> record, Option<Row> row, Option<Pair<InternalRow, StructType>> internalRowStructTypePair) {
     if (getPartitionPathFields() == null) {
       throw new HoodieKeyException("Unable to find field names for partition path in cfg");
     }
@@ -101,16 +110,22 @@ public class CustomKeyGenerator extends BuiltinKeyGenerator {
         case SIMPLE:
           if (record.isPresent()) {
             partitionPath.append(new SimpleKeyGenerator(config, partitionPathField).getPartitionPath(record.get()));
-          } else {
+          } else if (row.isPresent()) {
             partitionPath.append(new SimpleKeyGenerator(config, partitionPathField).getPartitionPath(row.get()));
+          } else {
+            partitionPath.append(new SimpleKeyGenerator(config, partitionPathField).getPartitionPath(internalRowStructTypePair.get().getKey(),
+                internalRowStructTypePair.get().getValue()));
           }
           break;
         case TIMESTAMP:
           try {
             if (record.isPresent()) {
               partitionPath.append(new TimestampBasedKeyGenerator(config, partitionPathField).getPartitionPath(record.get()));
-            } else {
+            } else if (row.isPresent()) {
               partitionPath.append(new TimestampBasedKeyGenerator(config, partitionPathField).getPartitionPath(row.get()));
+            } else {
+              partitionPath.append(new TimestampBasedKeyGenerator(config, partitionPathField).getPartitionPath(internalRowStructTypePair.get().getKey(),
+                  internalRowStructTypePair.get().getValue()));
             }
           } catch (IOException ioe) {
             throw new HoodieKeyGeneratorException("Unable to initialise TimestampBasedKeyGenerator class", ioe);

View File

@@ -18,10 +18,13 @@
 package org.apache.hudi.keygen;
 
-import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+
+import org.apache.avro.generic.GenericRecord;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.StructType;
 
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -65,5 +68,10 @@ public class GlobalDeleteKeyGenerator extends BuiltinKeyGenerator {
   public String getPartitionPath(Row row) {
     return globalAvroDeleteKeyGenerator.getEmptyPartition();
   }
 
+  @Override
+  public String getPartitionPath(InternalRow row, StructType structType) {
+    return globalAvroDeleteKeyGenerator.getEmptyPartition();
+  }
 }

View File

@@ -22,6 +22,8 @@ import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.StructType;
 
 import java.util.Arrays;
 import java.util.Collections;
@@ -63,5 +65,9 @@ public class NonpartitionedKeyGenerator extends BuiltinKeyGenerator {
     return nonpartitionedAvroKeyGenerator.getEmptyPartition();
   }
 
+  @Override
+  public String getPartitionPath(InternalRow internalRow, StructType structType) {
+    return nonpartitionedAvroKeyGenerator.getEmptyPartition();
+  }
 }

View File

@@ -19,10 +19,13 @@
 package org.apache.hudi.keygen;
 
 import org.apache.hudi.exception.HoodieKeyException;
 
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
-import scala.Option;
 
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -33,6 +36,8 @@ import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
+import scala.Option;
+
 import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH;
 import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR;
 import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER;
@@ -121,6 +126,54 @@
     }).collect(Collectors.joining(DEFAULT_PARTITION_PATH_SEPARATOR));
   }
 
+  public static String getPartitionPathFromInternalRow(InternalRow row, List<String> partitionPathFields, boolean hiveStylePartitioning,
+                                                       Map<String, List<Integer>> partitionPathPositions,
+                                                       Map<String, List<DataType>> partitionPathDataTypes) {
+    return IntStream.range(0, partitionPathFields.size()).mapToObj(idx -> {
+      String field = partitionPathFields.get(idx);
+      String val = null;
+      List<Integer> fieldPositions = partitionPathPositions.get(field);
+      if (fieldPositions.size() == 1) { // simple
+        Integer fieldPos = fieldPositions.get(0);
+        // for partition path, if field is not found, index will be set to -1
+        if (fieldPos == -1 || row.isNullAt(fieldPos)) {
+          val = DEFAULT_PARTITION_PATH;
+        } else {
+          Object value = row.get(fieldPos, partitionPathDataTypes.get(field).get(0));
+          if (value == null || value.toString().isEmpty()) {
+            val = DEFAULT_PARTITION_PATH;
+          } else {
+            val = value.toString();
+          }
+        }
+        if (hiveStylePartitioning) {
+          val = field + "=" + val;
+        }
+      } else { // nested
+        throw new IllegalArgumentException("Nested partitioning is not supported with disabling meta columns.");
+      }
+      return val;
+    }).collect(Collectors.joining(DEFAULT_PARTITION_PATH_SEPARATOR));
+  }
+
+  public static Object getFieldValFromInternalRow(InternalRow internalRow,
+                                                  Integer partitionPathPosition,
+                                                  DataType partitionPathDataType) {
+    Object val = null;
+    if (internalRow.isNullAt(partitionPathPosition)) {
+      return DEFAULT_PARTITION_PATH;
+    } else {
+      Object value = partitionPathDataType == DataTypes.StringType ? internalRow.getString(partitionPathPosition) : internalRow.get(partitionPathPosition, partitionPathDataType);
+      if (value == null || value.toString().isEmpty()) {
+        val = DEFAULT_PARTITION_PATH;
+      } else {
+        val = value;
+      }
+    }
+    return val;
+  }
+
   /**
    * Fetch the field value located at the positions requested for.
    *

View File

@@ -18,10 +18,13 @@
 package org.apache.hudi.keygen;
 
-import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+
+import org.apache.avro.generic.GenericRecord;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.StructType;
 
 import java.util.Collections;
@@ -72,4 +75,9 @@ public class SimpleKeyGenerator extends BuiltinKeyGenerator {
     return RowKeyGeneratorHelper.getPartitionPathFromRow(row, getPartitionPathFields(),
         hiveStylePartitioning, partitionPathPositions);
   }
 
+  @Override
+  public String getPartitionPath(InternalRow row, StructType structType) {
+    return getPartitionPathInternal(row, structType);
+  }
 }

View File

@@ -18,11 +18,14 @@
 package org.apache.hudi.keygen;
 
-import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.exception.HoodieKeyGeneratorException;
 import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
+
+import org.apache.avro.generic.GenericRecord;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.StructType;
 
 import java.io.IOException;
@@ -64,9 +67,23 @@ public class TimestampBasedKeyGenerator extends SimpleKeyGenerator {
   @Override
   public String getPartitionPath(Row row) {
-    Object fieldVal = null;
     buildFieldPositionMapIfNeeded(row.schema());
     Object partitionPathFieldVal = RowKeyGeneratorHelper.getNestedFieldVal(row, partitionPathPositions.get(getPartitionPathFields().get(0)));
+    return getTimestampBasedPartitionPath(partitionPathFieldVal);
+  }
+
+  @Override
+  public String getPartitionPath(InternalRow internalRow, StructType structType) {
+    buildFieldDataTypesMapIfNeeded(structType);
+    validatePartitionFieldsForInternalRow();
+    Object partitionPathFieldVal = RowKeyGeneratorHelper.getFieldValFromInternalRow(internalRow,
+        partitionPathPositions.get(getPartitionPathFields().get(0)).get(0),
+        partitionPathDataTypes.get(getPartitionPathFields().get(0)).get(0));
+    return getTimestampBasedPartitionPath(partitionPathFieldVal);
+  }
+
+  private String getTimestampBasedPartitionPath(Object partitionPathFieldVal) {
+    Object fieldVal = null;
     try {
       if (partitionPathFieldVal == null || partitionPathFieldVal.toString().contains(DEFAULT_PARTITION_PATH) || partitionPathFieldVal.toString().contains(NULL_RECORDKEY_PLACEHOLDER)
           || partitionPathFieldVal.toString().contains(EMPTY_RECORDKEY_PLACEHOLDER)) {

View File

@@ -21,21 +21,24 @@ package org.apache.hudi
 import org.apache.avro.Schema
 import org.apache.avro.generic.GenericRecord
 import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hudi.client.utils.SparkRowSerDe
 import org.apache.hudi.common.model.HoodieRecord
+import org.apache.spark.SPARK_VERSION
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, SparkSession}
 import org.apache.spark.sql.avro.SchemaConverters
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal}
-import org.apache.spark.sql.sources.{And, EqualNullSafe, EqualTo, Filter, GreaterThan, GreaterThanOrEqual, In, IsNotNull, IsNull, LessThan, LessThanOrEqual, Not, Or, StringContains, StringEndsWith, StringStartsWith}
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, Literal}
 import org.apache.spark.sql.execution.datasources.{FileStatusCache, InMemoryFileIndex}
+import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.{StringType, StructField, StructType}
+import org.apache.spark.sql.{DataFrame, SparkSession}
 
-import scala.collection.JavaConverters._
+import scala.collection.JavaConverters.asScalaBufferConverter
 
 object HoodieSparkUtils extends SparkAdapterSupport {
 
+  def isSpark3: Boolean = SPARK_VERSION.startsWith("3.")
+
   def getMetaSchema: StructType = {
     StructType(HoodieRecord.HOODIE_META_COLUMNS.asScala.map(col => {
       StructField(col, StringType, nullable = true)
@@ -110,6 +113,11 @@ object HoodieSparkUtils extends SparkAdapterSupport {
     }
   }
 
+  def getDeserializer(structType: StructType) : SparkRowSerDe = {
+    val encoder = RowEncoder.apply(structType).resolveAndBind()
+    sparkAdapter.createSparkRowSerDe(encoder)
+  }
+
   /**
    * Convert Filters to Catalyst Expressions and joined by And. If convert success return an
    * Non-Empty Option[Expression],or else return None.

View File

@@ -1,10 +1,11 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -17,7 +18,7 @@
 package org.apache.hudi
 
-import org.apache.spark.sql.hudi.{HoodieSqlUtils, SparkAdapter}
+import org.apache.spark.sql.hudi.SparkAdapter
 
 /**
  * Use the SparkAdapterSupport trait to get the SparkAdapter when we
@@ -26,7 +27,7 @@ import org.apache.spark.sql.hudi.SparkAdapter
 trait SparkAdapterSupport {
 
   lazy val sparkAdapter: SparkAdapter = {
-    val adapterClass = if (HoodieSqlUtils.isSpark3) {
+    val adapterClass = if (HoodieSparkUtils.isSpark3) {
       "org.apache.spark.sql.adapter.Spark3Adapter"
     } else {
       "org.apache.spark.sql.adapter.Spark2Adapter"

View File

@@ -1,10 +1,11 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -18,16 +19,16 @@
 package org.apache.spark.sql.hudi
 
 import org.apache.hudi.client.utils.SparkRowSerDe
-import org.apache.spark.sql.{Row, SparkSession}
 import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.parser.ParserInterface
 import org.apache.spark.sql.catalyst.plans.JoinType
-import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier}
 import org.apache.spark.sql.catalyst.plans.logical.{Join, LogicalPlan}
+import org.apache.spark.sql.catalyst.{AliasIdentifier, TableIdentifier}
 import org.apache.spark.sql.execution.datasources.SparkParsePartitionUtil
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.{Row, SparkSession}
 
 /**
  * An interface to adapter the difference between spark2 and spark3
/** /**
* An interface to adapter the difference between spark2 and spark3 * An interface to adapter the difference between spark2 and spark3

View File

@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.hudi.io.storage;
+package org.apache.hudi.io.storage.row;
 
 import org.apache.hudi.common.bloom.BloomFilter;
 import org.apache.hudi.common.bloom.BloomFilterFactory;

View File

@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.hudi.io;
+package org.apache.hudi.io.storage.row;
 
 import org.apache.hudi.client.HoodieInternalWriteStatus;
 import org.apache.hudi.common.model.HoodieRecord;

View File

@@ -18,6 +18,7 @@
 package org.apache.hudi.keygen;
 
+import org.apache.hudi.AvroConversionUtils;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
@@ -27,6 +28,7 @@ import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
 import org.apache.hudi.testutils.KeyGeneratorTestUtilities;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -95,6 +97,9 @@ public class TestComplexKeyGenerator extends KeyGeneratorTestUtilities {
     Row row = KeyGeneratorTestUtilities.getRow(record);
     Assertions.assertEquals(keyGenerator.getRecordKey(row), "_row_key:key1,pii_col:pi");
     Assertions.assertEquals(keyGenerator.getPartitionPath(row), "timestamp=4357686/ts_ms=2020-03-21");
+
+    InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(row);
+    Assertions.assertEquals(keyGenerator.getPartitionPath(internalRow, row.schema()), "timestamp=4357686/ts_ms=2020-03-21");
   }
 
   @Test
@@ -112,6 +117,12 @@ public class TestComplexKeyGenerator extends KeyGeneratorTestUtilities {
     HoodieKey hoodieKey = compositeKeyGenerator.getKey(record);
     assertEquals("_row_key:" + rowKey, hoodieKey.getRecordKey());
     assertEquals(partitionPath, hoodieKey.getPartitionPath());
+
+    Row row = KeyGeneratorTestUtilities.getRow(record, HoodieTestDataGenerator.AVRO_SCHEMA,
+        AvroConversionUtils.convertAvroSchemaToStructType(HoodieTestDataGenerator.AVRO_SCHEMA));
+    Assertions.assertEquals(compositeKeyGenerator.getPartitionPath(row), partitionPath);
+
+    InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(row);
+    Assertions.assertEquals(compositeKeyGenerator.getPartitionPath(internalRow, row.schema()), partitionPath);
   }
 
   @Test
@@ -131,6 +142,13 @@ public class TestComplexKeyGenerator extends KeyGeneratorTestUtilities {
     HoodieKey hoodieKey = compositeKeyGenerator.getKey(record);
     assertEquals(rowKey, hoodieKey.getRecordKey());
     assertEquals(partitionPath, hoodieKey.getPartitionPath());
+
+    Row row = KeyGeneratorTestUtilities.getRow(record, HoodieTestDataGenerator.AVRO_SCHEMA,
+        AvroConversionUtils.convertAvroSchemaToStructType(HoodieTestDataGenerator.AVRO_SCHEMA));
+    Assertions.assertEquals(compositeKeyGenerator.getPartitionPath(row), partitionPath);
+
+    InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(row);
+    Assertions.assertEquals(compositeKeyGenerator.getPartitionPath(internalRow, row.schema()), partitionPath);
   }
 
   @Test
@@ -150,5 +168,12 @@ public class TestComplexKeyGenerator extends KeyGeneratorTestUtilities {
     HoodieKey hoodieKey = compositeKeyGenerator.getKey(record);
     assertEquals(rowKey, hoodieKey.getRecordKey());
     assertEquals(partitionPath, hoodieKey.getPartitionPath());
+
+    Row row = KeyGeneratorTestUtilities.getRow(record, HoodieTestDataGenerator.AVRO_SCHEMA,
+        AvroConversionUtils.convertAvroSchemaToStructType(HoodieTestDataGenerator.AVRO_SCHEMA));
+    Assertions.assertEquals(compositeKeyGenerator.getPartitionPath(row), partitionPath);
+
+    InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(row);
+    Assertions.assertEquals(compositeKeyGenerator.getPartitionPath(internalRow, row.schema()), partitionPath);
   }
 }

View File

@@ -28,6 +28,7 @@ import org.apache.hudi.keygen.constant.KeyGeneratorType;
 import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
 import org.apache.hudi.testutils.KeyGeneratorTestUtilities;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -141,6 +142,8 @@ public class TestCustomKeyGenerator extends KeyGeneratorTestUtilities {
     Row row = KeyGeneratorTestUtilities.getRow(record);
     Assertions.assertEquals(keyGenerator.getRecordKey(row), "key1");
     Assertions.assertEquals(keyGenerator.getPartitionPath(row), "timestamp=4357686");
+    InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(row);
+    Assertions.assertEquals(keyGenerator.getPartitionPath(internalRow, row.schema()), "timestamp=4357686");
   }
 
   @Test
@@ -164,6 +167,8 @@ public class TestCustomKeyGenerator extends KeyGeneratorTestUtilities {
     Row row = KeyGeneratorTestUtilities.getRow(record);
     Assertions.assertEquals(keyGenerator.getRecordKey(row), "key1");
     Assertions.assertEquals(keyGenerator.getPartitionPath(row), "ts_ms=20200321");
+    InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(row);
+    Assertions.assertEquals(keyGenerator.getPartitionPath(internalRow, row.schema()), "ts_ms=20200321");
   }
 
   @Test
@@ -187,6 +192,9 @@ public class TestCustomKeyGenerator extends KeyGeneratorTestUtilities {
     Row row = KeyGeneratorTestUtilities.getRow(record);
     Assertions.assertEquals(keyGenerator.getRecordKey(row), "key1");
     Assertions.assertTrue(keyGenerator.getPartitionPath(row).isEmpty());
+
+    InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(row);
+    Assertions.assertTrue(keyGenerator.getPartitionPath(internalRow, row.schema()).isEmpty());
   }
 
   @Test
@@ -335,6 +343,9 @@ public class TestCustomKeyGenerator extends KeyGeneratorTestUtilities {
     Row row = KeyGeneratorTestUtilities.getRow(record);
     Assertions.assertEquals(keyGenerator.getRecordKey(row), "_row_key:key1,pii_col:pi");
     Assertions.assertEquals(keyGenerator.getPartitionPath(row), "timestamp=4357686");
+
+    InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(row);
+    Assertions.assertEquals(keyGenerator.getPartitionPath(internalRow, row.schema()), "timestamp=4357686");
   }
 
   @Test
@@ -359,5 +370,8 @@ public class TestCustomKeyGenerator extends KeyGeneratorTestUtilities {
     Row row = KeyGeneratorTestUtilities.getRow(record);
     Assertions.assertEquals(keyGenerator.getRecordKey(row), "_row_key:key1,pii_col:pi");
     Assertions.assertEquals(keyGenerator.getPartitionPath(row), "timestamp=4357686/ts_ms=20200321");
+    InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(row);
+    Assertions.assertEquals(keyGenerator.getPartitionPath(internalRow, row.schema()), "timestamp=4357686/ts_ms=20200321");
   }
 }

View File

@@ -21,11 +21,12 @@ package org.apache.hudi.keygen;
 import org.apache.hudi.common.config.TypedProperties;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.exception.HoodieKeyException;
-import org.apache.avro.generic.GenericRecord;
 import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
 import org.apache.hudi.testutils.KeyGeneratorTestUtilities;
+
+import org.apache.avro.generic.GenericRecord;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
@@ -130,6 +131,9 @@ public class TestSimpleKeyGenerator extends KeyGeneratorTestUtilities {
     Row row = KeyGeneratorTestUtilities.getRow(record);
     Assertions.assertEquals(keyGenerator.getRecordKey(row), "key1");
     Assertions.assertEquals(keyGenerator.getPartitionPath(row), "timestamp=4357686");
+
+    InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(row);
+    Assertions.assertEquals(keyGenerator.getPartitionPath(internalRow, row.schema()), "timestamp=4357686");
   }
 
   private static Stream<GenericRecord> nestedColTestRecords() {

View File

@@ -28,7 +28,10 @@ import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.exception.HoodieKeyGeneratorException; import org.apache.hudi.exception.HoodieKeyGeneratorException;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions; import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.hudi.testutils.KeyGeneratorTestUtilities;
import org.apache.spark.sql.Row; import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
@@ -50,6 +53,7 @@ public class TestTimestampBasedKeyGenerator {
private Schema schema; private Schema schema;
private StructType structType; private StructType structType;
private Row baseRow; private Row baseRow;
private InternalRow internalRow;
@BeforeEach @BeforeEach
public void initialize() throws IOException { public void initialize() throws IOException {
@@ -58,6 +62,7 @@ public class TestTimestampBasedKeyGenerator {
baseRecord = SchemaTestUtil baseRecord = SchemaTestUtil
.generateAvroRecordFromJson(schema, 1, "001", "f1"); .generateAvroRecordFromJson(schema, 1, "001", "f1");
baseRow = genericRecordToRow(baseRecord); baseRow = genericRecordToRow(baseRecord);
internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow);
properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY.key(), "field1"); properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_OPT_KEY.key(), "field1");
properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY.key(), "createTime"); properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_OPT_KEY.key(), "createTime");
@@ -77,7 +82,7 @@ public class TestTimestampBasedKeyGenerator {
} }
private Row genericRecordToRow(GenericRecord baseRecord) { private Row genericRecordToRow(GenericRecord baseRecord) {
Function1<Object, Object> convertor = AvroConversionHelper.createConverterToRow(schema, structType); Function1<Object, Object> convertor = AvroConversionHelper.createConverterToRow(baseRecord.getSchema(), structType);
Row row = (Row) convertor.apply(baseRecord); Row row = (Row) convertor.apply(baseRecord);
int fieldCount = structType.fieldNames().length; int fieldCount = structType.fieldNames().length;
Object[] values = new Object[fieldCount]; Object[] values = new Object[fieldCount];
@@ -117,6 +122,10 @@ public class TestTimestampBasedKeyGenerator {
TimestampBasedKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties);
HoodieKey hk1 = keyGen.getKey(baseRecord);
assertEquals("2020-01-06 12", hk1.getPartitionPath());
+baseRow = genericRecordToRow(baseRecord);
+assertEquals("2020-01-06 12", keyGen.getPartitionPath(baseRow));
+internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow);
+assertEquals("2020-01-06 12", keyGen.getPartitionPath(internalRow, baseRow.schema()));
// timezone is GMT+8:00, createTime is BigDecimal
baseRecord.put("createTime", new BigDecimal(1578283932000.00001));
@@ -169,6 +178,8 @@ public class TestTimestampBasedKeyGenerator {
// test w/ Row
baseRow = genericRecordToRow(baseRecord);
assertEquals("1970-01-01 08", keyGen.getPartitionPath(baseRow));
+internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow);
+assertEquals("1970-01-01 08", keyGen.getPartitionPath(internalRow, baseRow.schema()));
// timestamp is DATE_STRING, timezone is GMT, createTime is null
baseRecord.put("createTime", null);
@@ -181,6 +192,8 @@ public class TestTimestampBasedKeyGenerator {
// test w/ Row
baseRow = genericRecordToRow(baseRecord);
assertEquals("1970-01-01 12:00:00", keyGen.getPartitionPath(baseRow));
+internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow);
+assertEquals("1970-01-01 12:00:00", keyGen.getPartitionPath(internalRow, baseRow.schema()));
}
@Test
@@ -197,6 +210,8 @@ public class TestTimestampBasedKeyGenerator {
// test w/ Row
baseRow = genericRecordToRow(baseRecord);
assertEquals("2024-10-04 12", keyGen.getPartitionPath(baseRow));
+internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow);
+assertEquals("2024-10-04 12", keyGen.getPartitionPath(internalRow, baseRow.schema()));
// timezone is GMT, createTime is null
baseRecord.put("createTime", null);
@@ -208,6 +223,8 @@ public class TestTimestampBasedKeyGenerator {
// test w/ Row
baseRow = genericRecordToRow(baseRecord);
assertEquals("1970-01-02 12", keyGen.getPartitionPath(baseRow));
+internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow);
+assertEquals("1970-01-02 12", keyGen.getPartitionPath(internalRow, baseRow.schema()));
// timezone is GMT. number of days store integer in mysql
baseRecord.put("createTime", 18736);
@@ -216,6 +233,9 @@ public class TestTimestampBasedKeyGenerator {
HoodieKey scalarSecondsKey = keyGen.getKey(baseRecord);
assertEquals("2021-04-19", scalarSecondsKey.getPartitionPath());
+// test w/ Row
+baseRow = genericRecordToRow(baseRecord);
+assertEquals("2021-04-19", keyGen.getPartitionPath(baseRow));
}
@Test
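For context on the test matrix above: a minimal sketch of the properties that drive TimestampBasedKeyGenerator, using the deltastreamer-era config keys; the exact keys and values here are illustrative assumptions, not taken from this patch.

TypedProperties properties = new TypedProperties();
properties.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
properties.setProperty("hoodie.datasource.write.partitionpath.field", "createTime");
// EPOCHMILLISECONDS input rendered into an hourly partition path such as "2020-01-06 12"
properties.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.type", "EPOCHMILLISECONDS");
properties.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy-MM-dd HH");
TimestampBasedKeyGenerator keyGen = new TimestampBasedKeyGenerator(properties);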
@@ -107,6 +107,11 @@ public class HoodieClientTestUtils {
public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
String instantTime) {
+return readCommit(basePath, sqlContext, commitTimeline, instantTime, true);
+}
+public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
+String instantTime, boolean filterByCommitTime) {
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime);
if (!commitTimeline.containsInstant(commitInstant)) {
throw new HoodieException("No commit exists at " + instantTime);
@@ -115,14 +120,21 @@ public class HoodieClientTestUtils {
HashMap<String, String> paths =
getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
LOG.info("Path :" + paths.values());
+Dataset<Row> unFilteredRows = null;
if (HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP.defaultValue().equals(HoodieFileFormat.PARQUET)) {
-return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
-.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, instantTime));
+unFilteredRows = sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]));
} else if (HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP.defaultValue().equals(HoodieFileFormat.ORC)) {
-return sqlContext.read().orc(paths.values().toArray(new String[paths.size()]))
-.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, instantTime));
+unFilteredRows = sqlContext.read().orc(paths.values().toArray(new String[paths.size()]));
}
+if (unFilteredRows != null) {
+if (filterByCommitTime) {
+return unFilteredRows.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, instantTime));
+} else {
+return unFilteredRows;
+}
+} else {
return sqlContext.emptyDataFrame();
+}
} catch (Exception e) {
throw new HoodieException("Error reading commit " + instantTime, e);
}
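How the two overloads are meant to be used (the single-argument form now simply delegates with filterByCommitTime = true): a hedged sketch, since tables written without meta fields carry an empty _hoodie_commit_time and the commit-time filter would drop every row.

// meta fields populated: narrow the rows down to the given commit
Dataset<Row> committed = HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, instantTime);
// meta fields disabled: skip the commit-time filter
Dataset<Row> all = HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, instantTime, false);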
@@ -24,11 +24,24 @@ import org.apache.hudi.AvroConversionUtils;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
+import org.apache.spark.package$;
import org.apache.spark.sql.Row;
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$;
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
+import org.apache.spark.sql.catalyst.encoders.RowEncoder;
+import org.apache.spark.sql.catalyst.expressions.Attribute;
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema;
import org.apache.spark.sql.types.StructType;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.List;
+import java.util.stream.Collectors;
import scala.Function1;
+import scala.collection.JavaConversions;
+import scala.collection.JavaConverters;
public class KeyGeneratorTestUtilities {
@@ -60,7 +73,7 @@ public class KeyGeneratorTestUtilities {
public static GenericRecord getRecord(GenericRecord nestedColRecord) {
GenericRecord record = new GenericData.Record(new Schema.Parser().parse(EXAMPLE_SCHEMA));
-record.put("timestamp", 4357686);
+record.put("timestamp", 4357686L);
record.put("_row_key", "key1");
record.put("ts_ms", "2020-03-21");
record.put("pii_col", "pi");
@@ -82,4 +95,39 @@ public class KeyGeneratorTestUtilities {
}
return new GenericRowWithSchema(values, structType);
}
+public static InternalRow getInternalRow(Row row) {
+try {
+return getInternalRow(row, getEncoder(row.schema()));
+} catch (Exception e) {
+throw new IllegalStateException("Exception thrown while converting Row to InternalRow", e);
+}
+}
+private static ExpressionEncoder getEncoder(StructType schema) {
+List<Attribute> attributes = JavaConversions.asJavaCollection(schema.toAttributes()).stream()
+.map(Attribute::toAttribute).collect(Collectors.toList());
+return RowEncoder.apply(schema)
+.resolveAndBind(JavaConverters.asScalaBufferConverter(attributes).asScala().toSeq(),
+SimpleAnalyzer$.MODULE$);
+}
+public static InternalRow getInternalRow(Row row, ExpressionEncoder<Row> encoder) throws ClassNotFoundException, InvocationTargetException, IllegalAccessException, NoSuchMethodException {
+return serializeRow(encoder, row);
+}
+private static InternalRow serializeRow(ExpressionEncoder encoder, Row row)
+throws InvocationTargetException, IllegalAccessException, NoSuchMethodException, ClassNotFoundException {
+// TODO remove reflection if Spark 2.x support is dropped
+if (package$.MODULE$.SPARK_VERSION().startsWith("2.")) {
+Method spark2method = encoder.getClass().getMethod("toRow", Object.class);
+return (InternalRow) spark2method.invoke(encoder, row);
+} else {
+Class<?> serializerClass = Class.forName("org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer");
+Object serializer = encoder.getClass().getMethod("createSerializer").invoke(encoder);
+Method aboveSpark2method = serializerClass.getMethod("apply", Object.class);
+return (InternalRow) aboveSpark2method.invoke(serializer, row);
+}
+}
}
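Intended usage, mirroring the TimestampBasedKeyGenerator tests earlier in this commit (genericRecordToRow is a test-local helper there, not part of this utility class):

Row baseRow = genericRecordToRow(baseRecord);
InternalRow internalRow = KeyGeneratorTestUtilities.getInternalRow(baseRow);
// Row-based and InternalRow-based partition path extraction should agree
assertEquals(keyGen.getPartitionPath(baseRow), keyGen.getPartitionPath(internalRow, baseRow.schema()));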
@@ -59,12 +59,17 @@ import static org.apache.hudi.common.testutils.FileSystemTestUtils.RANDOM;
*/
public class SparkDatasetTestUtils {
+public static final String RECORD_KEY_FIELD_NAME = "record_key";
+public static final String PARTITION_PATH_FIELD_NAME = "partition_path";
public static final StructType STRUCT_TYPE = new StructType(new StructField[] {
new StructField(HoodieRecord.COMMIT_TIME_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()),
new StructField(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()),
new StructField(HoodieRecord.RECORD_KEY_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()),
new StructField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()),
new StructField(HoodieRecord.FILENAME_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()),
+new StructField(RECORD_KEY_FIELD_NAME, DataTypes.StringType, false, Metadata.empty()),
+new StructField(PARTITION_PATH_FIELD_NAME, DataTypes.StringType, false, Metadata.empty()),
new StructField("randomInt", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("randomLong", DataTypes.LongType, false, Metadata.empty())});
@@ -74,6 +79,8 @@ public class SparkDatasetTestUtils {
new StructField(HoodieRecord.RECORD_KEY_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()),
new StructField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()),
new StructField(HoodieRecord.FILENAME_METADATA_FIELD, DataTypes.StringType, false, Metadata.empty()),
+new StructField(RECORD_KEY_FIELD_NAME, DataTypes.StringType, false, Metadata.empty()),
+new StructField(PARTITION_PATH_FIELD_NAME, DataTypes.StringType, false, Metadata.empty()),
new StructField("randomInt", DataTypes.IntegerType, false, Metadata.empty()),
new StructField("randomStr", DataTypes.StringType, false, Metadata.empty())});
@@ -117,7 +124,7 @@ public class SparkDatasetTestUtils {
*/
public static Row getRandomValue(String partitionPath, boolean isError) {
// order commit time, seq no, record key, partition path, file name
-Object[] values = new Object[7];
+Object[] values = new Object[9];
values[0] = ""; //commit time
if (!isError) {
values[1] = ""; // commit seq no
@@ -127,11 +134,13 @@ public class SparkDatasetTestUtils {
values[2] = UUID.randomUUID().toString();
values[3] = partitionPath;
values[4] = ""; // filename
-values[5] = RANDOM.nextInt();
+values[5] = UUID.randomUUID().toString();
+values[6] = partitionPath;
+values[7] = RANDOM.nextInt();
if (!isError) {
-values[6] = RANDOM.nextLong();
+values[8] = RANDOM.nextLong();
} else {
-values[6] = UUID.randomUUID().toString();
+values[8] = UUID.randomUUID().toString();
}
return new GenericRow(values);
}
@@ -154,14 +163,16 @@ public class SparkDatasetTestUtils {
public static InternalRow getInternalRowWithError(String partitionPath) {
// order commit time, seq no, record key, partition path, file name
String recordKey = UUID.randomUUID().toString();
-Object[] values = new Object[7];
+Object[] values = new Object[9];
values[0] = "";
values[1] = "";
values[2] = recordKey;
values[3] = partitionPath;
values[4] = "";
-values[5] = RANDOM.nextInt();
+values[5] = recordKey;
-values[6] = RANDOM.nextBoolean();
+values[6] = partitionPath;
+values[7] = RANDOM.nextInt();
+values[8] = RANDOM.nextBoolean();
return new GenericInternalRow(values);
}
@@ -141,6 +141,12 @@ public class HoodieTableConfig extends HoodieConfig implements Serializable {
.noDefaultValue()
.withDocumentation("Base path of the dataset that needs to be bootstrapped as a Hudi table");
+public static final ConfigProperty<String> HOODIE_POPULATE_META_FIELDS = ConfigProperty
+.key("hoodie.populate.meta.fields")
+.defaultValue("true")
+.withDocumentation("When enabled, populates all meta fields. When disabled, no meta fields are populated "
++ "and incremental queries will not be functional. This is only meant to be used for append-only/immutable data for batch processing");
public static final String NO_OP_BOOTSTRAP_INDEX_CLASS = NoOpBootstrapIndex.class.getName();
public HoodieTableConfig(FileSystem fs, String metaPath, String payloadClassName) {
@@ -313,6 +319,13 @@ public class HoodieTableConfig extends HoodieConfig implements Serializable {
return getStringOrDefault(HOODIE_ARCHIVELOG_FOLDER_PROP);
}
+/**
+ * @return true if meta fields need to be populated, else false.
+ */
+public boolean populateMetaFields() {
+return Boolean.parseBoolean(getStringOrDefault(HOODIE_POPULATE_META_FIELDS));
+}
public Map<String, String> propsMap() {
return props.entrySet().stream()
.collect(Collectors.toMap(e -> String.valueOf(e.getKey()), e -> String.valueOf(e.getValue())));
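A hedged example of how a user would surface this table config from the Spark datasource path; the datasource option keys are the ones used elsewhere in this commit, while the table name and dataframe are illustrative.

df.write().format("hudi")
    .option("hoodie.table.name", "hoodie_foo_tbl")
    .option("hoodie.datasource.write.operation", "bulk_insert")
    .option("hoodie.datasource.write.row.writer.enable", "true")
    .option("hoodie.populate.meta.fields", "false") // append-only/immutable batch data only
    .mode(SaveMode.Append)
    .save(basePath);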
@@ -27,6 +27,7 @@ import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.apache.hudi.common.fs.NoOpConsistencyGuard;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieTableType;
+import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieArchivedTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
@@ -312,6 +313,25 @@ public class HoodieTableMetaClient implements Serializable {
return archivedTimeline;
}
+/**
+ * Validate table properties.
+ * @param properties Properties from writeConfig.
+ * @param operationType operation type to be executed.
+ */
+public void validateTableProperties(Properties properties, WriteOperationType operationType) {
+// disabling meta fields is allowed only for the bulk_insert operation
+if (!Boolean.parseBoolean((String) properties.getOrDefault(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()))
+&& operationType != WriteOperationType.BULK_INSERT) {
+throw new HoodieException(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key() + " can only be disabled for " + WriteOperationType.BULK_INSERT
++ " operation");
+}
+// once meta fields are disabled, they can't be re-enabled for a given table.
+if (!getTableConfig().populateMetaFields()
+&& Boolean.parseBoolean((String) properties.getOrDefault(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()))) {
+throw new HoodieException(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key() + " already disabled for the table. Can't be re-enabled back");
+}
+}
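The two rules above as a caller would hit them; a sketch assuming a table whose hoodie.properties already carries hoodie.populate.meta.fields=false.

Properties props = writeConfig.getProps(); // still carries hoodie.populate.meta.fields=false
metaClient.validateTableProperties(props, WriteOperationType.BULK_INSERT); // passes
metaClient.validateTableProperties(props, WriteOperationType.UPSERT); // throws HoodieException: only bulk_insert may disable meta fields
// flipping the flag back to true in props would also throw: the choice is permanent for a table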
/**
* Helper method to initialize a given path as a hoodie table with configs passed in as Properties.
*
@@ -602,6 +622,7 @@ public class HoodieTableMetaClient implements Serializable {
private String partitionColumns;
private String bootstrapIndexClass;
private String bootstrapBasePath;
+private Boolean populateMetaFields;
private PropertyBuilder() {
@@ -675,6 +696,11 @@ public class HoodieTableMetaClient implements Serializable {
return this;
}
+public PropertyBuilder setPopulateMetaFields(boolean populateMetaFields) {
+this.populateMetaFields = populateMetaFields;
+return this;
+}
public PropertyBuilder fromMetaClient(HoodieTableMetaClient metaClient) {
return setTableType(metaClient.getTableType())
.setTableName(metaClient.getTableConfig().getTableName())
@@ -725,6 +751,9 @@ public class HoodieTableMetaClient implements Serializable {
if (hoodieConfig.contains(HoodieTableConfig.HOODIE_TABLE_CREATE_SCHEMA)) {
setTableCreateSchema(hoodieConfig.getString(HoodieTableConfig.HOODIE_TABLE_CREATE_SCHEMA));
}
+if (hoodieConfig.contains(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS)) {
+setPopulateMetaFields(hoodieConfig.getBoolean(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS));
+}
return this;
}
@@ -778,6 +807,9 @@ public class HoodieTableMetaClient implements Serializable {
if (null != recordKeyFields) {
tableConfig.setValue(HoodieTableConfig.HOODIE_TABLE_RECORDKEY_FIELDS, recordKeyFields);
}
+if (null != populateMetaFields) {
+tableConfig.setValue(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS, Boolean.toString(populateMetaFields));
+}
return tableConfig.getProps();
}
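A hedged sketch of the builder in use when initializing a table; the withPropertyBuilder() entry point and initTable(...) are assumed from the surrounding Hudi code of this era rather than shown in this hunk.

HoodieTableMetaClient.withPropertyBuilder()
    .setTableType(HoodieTableType.COPY_ON_WRITE)
    .setTableName("hoodie_foo_tbl")
    .setPopulateMetaFields(false) // persisted into hoodie.properties at table-create time
    .initTable(hadoopConf, basePath);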
@@ -18,15 +18,25 @@
package org.apache.hudi.internal;
+import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.client.HoodieInternalWriteStatus;
+import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
-import org.apache.hudi.io.HoodieRowCreateHandle;
+import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.io.storage.row.HoodieRowCreateHandleWithoutMetaFields;
+import org.apache.hudi.io.storage.row.HoodieRowCreateHandle;
+import org.apache.hudi.keygen.BuiltinKeyGenerator;
+import org.apache.hudi.keygen.NonpartitionedKeyGenerator;
+import org.apache.hudi.keygen.SimpleKeyGenerator;
+import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.table.HoodieTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StructType;
import java.io.IOException;
@@ -34,6 +44,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Properties;
import java.util.UUID;
/**
@@ -52,15 +63,20 @@ public class BulkInsertDataInternalWriterHelper {
private final StructType structType;
private final Boolean arePartitionRecordsSorted;
private final List<HoodieInternalWriteStatus> writeStatusList = new ArrayList<>();
private HoodieRowCreateHandle handle;
private String lastKnownPartitionPath = null;
private String fileIdPrefix;
private int numFilesWritten = 0;
private Map<String, HoodieRowCreateHandle> handles = new HashMap<>();
+private final boolean populateMetaFields;
+private Option<BuiltinKeyGenerator> keyGeneratorOpt = null;
+private boolean simpleKeyGen = false;
+private int simplePartitionFieldIndex = -1;
+private DataType simplePartitionFieldDataType;
public BulkInsertDataInternalWriterHelper(HoodieTable hoodieTable, HoodieWriteConfig writeConfig,
-String instantTime, int taskPartitionId, long taskId, long taskEpochId, StructType structType, boolean arePartitionRecordsSorted) {
+String instantTime, int taskPartitionId, long taskId, long taskEpochId, StructType structType,
+boolean populateMetaFields, boolean arePartitionRecordsSorted) {
this.hoodieTable = hoodieTable;
this.writeConfig = writeConfig;
this.instantTime = instantTime;
@@ -68,14 +84,57 @@ public class BulkInsertDataInternalWriterHelper {
this.taskId = taskId;
this.taskEpochId = taskEpochId;
this.structType = structType;
+this.populateMetaFields = populateMetaFields;
this.arePartitionRecordsSorted = arePartitionRecordsSorted;
this.fileIdPrefix = UUID.randomUUID().toString();
+if (!populateMetaFields) {
+this.keyGeneratorOpt = getKeyGenerator(writeConfig.getProps());
+if (keyGeneratorOpt.isPresent() && keyGeneratorOpt.get() instanceof SimpleKeyGenerator) {
+simpleKeyGen = true;
+simplePartitionFieldIndex = (Integer) structType.getFieldIndex((keyGeneratorOpt.get()).getPartitionPathFields().get(0)).get();
+simplePartitionFieldDataType = structType.fields()[simplePartitionFieldIndex].dataType();
+}
+}
+}
+/**
+ * Instantiate {@link BuiltinKeyGenerator}.
+ *
+ * @param properties properties map.
+ * @return the key generator thus instantiated.
+ */
+private Option<BuiltinKeyGenerator> getKeyGenerator(Properties properties) {
+TypedProperties typedProperties = new TypedProperties();
+typedProperties.putAll(properties);
+if (properties.get(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY().key()).equals(NonpartitionedKeyGenerator.class.getName())) {
+return Option.empty(); // do not instantiate NonpartitionedKeyGenerator
+} else {
+try {
+return Option.of((BuiltinKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(typedProperties));
+} catch (ClassCastException cce) {
+throw new HoodieIOException("Only key generators implementing the BuiltinKeyGenerator interface are supported with virtual keys");
+} catch (IOException e) {
+throw new HoodieIOException("Key generator instantiation failed ", e);
+}
+}
}
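A sketch of the write-config properties this lookup expects when meta fields are disabled; the field names are the test fixture's, not fixed requirements.

TypedProperties props = new TypedProperties();
props.setProperty("hoodie.datasource.write.keygenerator.class", SimpleKeyGenerator.class.getName());
props.setProperty("hoodie.datasource.write.recordkey.field", "record_key");
props.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path");
// any BuiltinKeyGenerator works; NonpartitionedKeyGenerator short-circuits to Option.empty() above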
public void write(InternalRow record) throws IOException {
try {
-String partitionPath = record.getUTF8String(
+String partitionPath = null;
+if (populateMetaFields) { // usual path, where meta fields are pre-populated in the prep step.
+partitionPath = record.getUTF8String(
HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toString();
+} else { // if meta columns are disabled.
+if (!keyGeneratorOpt.isPresent()) { // non-partitioned key generator
+partitionPath = "";
+} else if (simpleKeyGen) { // SimpleKeyGenerator
+partitionPath = (record.get(simplePartitionFieldIndex, simplePartitionFieldDataType)).toString();
+} else {
+// only BuiltinKeyGenerator implementations are supported if meta fields are disabled.
+partitionPath = keyGeneratorOpt.get().getPartitionPath(record, structType);
+}
+}
if ((lastKnownPartitionPath == null) || !lastKnownPartitionPath.equals(partitionPath) || !handle.canWrite()) {
LOG.info("Creating new file for partition path " + partitionPath);
@@ -103,14 +162,18 @@ public class BulkInsertDataInternalWriterHelper {
if (arePartitionRecordsSorted) {
close();
}
-handles.put(partitionPath, new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
-instantTime, taskPartitionId, taskId, taskEpochId, structType));
+HoodieRowCreateHandle rowCreateHandle = populateMetaFields ? new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
+instantTime, taskPartitionId, taskId, taskEpochId, structType) : new HoodieRowCreateHandleWithoutMetaFields(hoodieTable, writeConfig, partitionPath, getNextFileId(),
+instantTime, taskPartitionId, taskId, taskEpochId, structType);
+handles.put(partitionPath, rowCreateHandle);
} else if (!handles.get(partitionPath).canWrite()) {
// even if there is a handle to the partition path, it could have reached its max size threshold. So, we close the handle here and
// create a new one.
writeStatusList.add(handles.remove(partitionPath).close());
-handles.put(partitionPath, new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
-instantTime, taskPartitionId, taskId, taskEpochId, structType));
+HoodieRowCreateHandle rowCreateHandle = populateMetaFields ? new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
+instantTime, taskPartitionId, taskId, taskEpochId, structType) : new HoodieRowCreateHandleWithoutMetaFields(hoodieTable, writeConfig, partitionPath, getNextFileId(),
+instantTime, taskPartitionId, taskId, taskEpochId, structType);
+handles.put(partitionPath, rowCreateHandle);
}
return handles.get(partitionPath);
}
@@ -66,6 +66,7 @@ public class DataSourceInternalWriterHelper {
writeClient.startCommitWithTime(instantTime);
this.metaClient = HoodieTableMetaClient.builder().setConf(configuration).setBasePath(writeConfig.getBasePath()).build();
+this.metaClient.validateTableProperties(writeConfig.getProps(), WriteOperationType.BULK_INSERT);
this.hoodieTable = HoodieSparkTable.create(writeConfig, new HoodieSparkEngineContext(new JavaSparkContext(sparkSession.sparkContext())), metaClient);
}
@@ -18,12 +18,17 @@
package org.apache.hudi.internal;
+import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.client.HoodieInternalWriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieWriteStat;
+import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.util.Option;
+import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.apache.hudi.testutils.HoodieClientTestHarness;
+import org.apache.hudi.testutils.SparkDatasetTestUtils;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
@@ -33,8 +38,10 @@ import org.junit.jupiter.api.BeforeEach;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Properties;
import java.util.Random;
+import static org.apache.hudi.testutils.SparkDatasetTestUtils.getConfigBuilder;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -62,6 +69,17 @@ public class HoodieBulkInsertInternalWriterTestBase extends HoodieClientTestHarn
cleanupResources();
}
+protected HoodieWriteConfig getWriteConfig(boolean populateMetaFields) {
+Properties properties = new Properties();
+if (!populateMetaFields) {
+properties.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY().key(), SimpleKeyGenerator.class.getName());
+properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY().key(), SparkDatasetTestUtils.RECORD_KEY_FIELD_NAME);
+properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY().key(), SparkDatasetTestUtils.PARTITION_PATH_FIELD_NAME);
+properties.setProperty(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), "false");
+}
+return getConfigBuilder(basePath).withProperties(properties).build();
+}
protected void assertWriteStatuses(List<HoodieInternalWriteStatus> writeStatuses, int batches, int size,
Option<List<String>> fileAbsPaths, Option<List<String>> fileNames) {
assertWriteStatuses(writeStatuses, batches, size, false, fileAbsPaths, fileNames);
@@ -126,7 +144,9 @@ public class HoodieBulkInsertInternalWriterTestBase extends HoodieClientTestHarn
}
}
-protected void assertOutput(Dataset<Row> expectedRows, Dataset<Row> actualRows, String instantTime, Option<List<String>> fileNames) {
+protected void assertOutput(Dataset<Row> expectedRows, Dataset<Row> actualRows, String instantTime, Option<List<String>> fileNames,
+boolean populateMetaColumns) {
+if (populateMetaColumns) {
// verify 3 meta fields that are filled in within create handle
actualRows.collectAsList().forEach(entry -> {
assertEquals(entry.get(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)).toString(), instantTime);
@@ -142,5 +162,9 @@ public class HoodieBulkInsertInternalWriterTestBase extends HoodieClientTestHarn
Dataset<Row> trimmedExpected = expectedRows.drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HoodieRecord.FILENAME_METADATA_FIELD);
Dataset<Row> trimmedActual = actualRows.drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, HoodieRecord.COMMIT_TIME_METADATA_FIELD, HoodieRecord.FILENAME_METADATA_FIELD);
assertEquals(0, trimmedActual.except(trimmedExpected).count());
+} else { // operation = BULK_INSERT_APPEND_ONLY
+// all meta columns are untouched
+assertEquals(0, expectedRows.except(actualRows).count());
+}
}
}
@@ -18,12 +18,6 @@
package org.apache.hudi;
-import static org.apache.spark.sql.functions.callUDF;
-import java.util.Arrays;
-import java.util.List;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.ReflectionUtils;
@@ -41,8 +35,17 @@ import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
import scala.collection.JavaConverters;
+import static org.apache.spark.sql.functions.callUDF;
/**
* Helper class to assist in preparing {@link Dataset<Row>}s for bulk insert with datasource implementation.
*/
@@ -112,4 +115,40 @@ public class HoodieDatasetBulkInsertHelper {
return bulkInsertPartitionerRows.repartitionRecords(colOrderedDataset, config.getBulkInsertShuffleParallelism());
}
+/**
+ * Adds empty meta fields and reorders the columns so that the meta fields are at the beginning.
+ *
+ * @param rows input {@link Dataset} of {@link Row}s, without Hudi meta columns.
+ * @return the dataset with all five meta columns added (as empty strings) and moved to the front.
+ */
+public static Dataset<Row> prepareHoodieDatasetForBulkInsertWithoutMetaFields(Dataset<Row> rows) {
+// add empty meta cols.
+Dataset<Row> rowsWithMetaCols = rows
+.withColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD,
+functions.lit("").cast(DataTypes.StringType))
+.withColumn(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD,
+functions.lit("").cast(DataTypes.StringType))
+.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD,
+functions.lit("").cast(DataTypes.StringType))
+.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD,
+functions.lit("").cast(DataTypes.StringType))
+.withColumn(HoodieRecord.FILENAME_METADATA_FIELD,
+functions.lit("").cast(DataTypes.StringType));
+List<Column> originalFields =
+Arrays.stream(rowsWithMetaCols.schema().fields()).filter(field -> !field.name().contains("_hoodie_")).map(f -> new Column(f.name())).collect(Collectors.toList());
+List<Column> metaFields =
+Arrays.stream(rowsWithMetaCols.schema().fields()).filter(field -> field.name().contains("_hoodie_")).map(f -> new Column(f.name())).collect(Collectors.toList());
+// reorder such that all meta columns are at the beginning, followed by the original columns
+List<Column> allCols = new ArrayList<>();
+allCols.addAll(metaFields);
+allCols.addAll(originalFields);
+return rowsWithMetaCols.select(
+JavaConverters.collectionAsScalaIterableConverter(allCols).asScala().toSeq());
+}
}
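Minimal usage sketch: the returned dataset carries all five meta columns (as empty strings) at the front of the schema, so files written this way stay schema-compatible with tables that do populate meta fields.

Dataset<Row> prepped = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsertWithoutMetaFields(df);
assert prepped.schema().fieldIndex(HoodieRecord.COMMIT_TIME_METADATA_FIELD) == 0; // meta columns lead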
@@ -32,7 +32,7 @@ import org.apache.hudi.common.util.{CommitUtils, ReflectionUtils}
import org.apache.hudi.config.HoodieBootstrapConfig.{BOOTSTRAP_BASE_PATH_PROP, BOOTSTRAP_INDEX_CLASS_PROP}
import org.apache.hudi.config.{HoodieInternalConfig, HoodieWriteConfig}
import org.apache.hudi.exception.HoodieException
-import org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerWithRowsFactory
+import org.apache.hudi.execution.bulkinsert.{BulkInsertInternalPartitionerWithRowsFactory, NonSortPartitionerWithRows}
import org.apache.hudi.hive.{HiveSyncConfig, HiveSyncTool}
import org.apache.hudi.index.SparkHoodieIndex
import org.apache.hudi.internal.DataSourceInternalWriterHelper
@@ -128,6 +128,7 @@ object HoodieSparkSqlWriter {
.setPayloadClassName(hoodieConfig.getString(PAYLOAD_CLASS_OPT_KEY))
.setPreCombineField(hoodieConfig.getStringOrDefault(PRECOMBINE_FIELD_OPT_KEY, null))
.setPartitionColumns(partitionColumns)
+.setPopulateMetaFields(parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(), HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean)
.initTable(sparkContext.hadoopConfiguration, path.get)
tableConfig = tableMetaClient.getTableConfig
}
@@ -139,7 +140,8 @@ object HoodieSparkSqlWriter {
if (hoodieConfig.getBoolean(ENABLE_ROW_WRITER_OPT_KEY) &&
operation == WriteOperationType.BULK_INSERT) {
val (success, commitTime: common.util.Option[String]) = bulkInsertAsRow(sqlContext, parameters, df, tblName,
-basePath, path, instantTime)
+basePath, path, instantTime, parameters.getOrElse(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(),
+HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()).toBoolean)
return (success, commitTime, common.util.Option.empty(), common.util.Option.empty(), hoodieWriteClient.orNull, tableConfig)
}
// scalastyle:on
@@ -330,7 +332,8 @@ object HoodieSparkSqlWriter {
tblName: String,
basePath: Path,
path: Option[String],
-instantTime: String): (Boolean, common.util.Option[String]) = {
+instantTime: String,
+populateMetaFields: Boolean): (Boolean, common.util.Option[String]) = {
val sparkContext = sqlContext.sparkContext
// register classes & schemas
val (structName, nameSpace) = AvroConversionUtils.getAvroRecordNameAndNamespace(tblName)
@@ -345,22 +348,36 @@ object HoodieSparkSqlWriter {
}
val params = parameters.updated(HoodieWriteConfig.AVRO_SCHEMA.key, schema.toString)
val writeConfig = DataSourceUtils.createHoodieConfig(schema.toString, path.get, tblName, mapAsJavaMap(params))
+val bulkInsertPartitionerRows : BulkInsertPartitioner[Dataset[Row]] = if (populateMetaFields) {
val userDefinedBulkInsertPartitionerOpt = DataSourceUtils.createUserDefinedBulkInsertPartitionerWithRows(writeConfig)
-val bulkInsertPartitionerRows : BulkInsertPartitioner[Dataset[Row]] = if (userDefinedBulkInsertPartitionerOpt.isPresent) {
+if (userDefinedBulkInsertPartitionerOpt.isPresent) {
userDefinedBulkInsertPartitionerOpt.get
}
else {
BulkInsertInternalPartitionerWithRowsFactory.get(writeConfig.getBulkInsertSortMode)
}
+} else {
+// Sort modes are not yet supported when meta fields are disabled
+new NonSortPartitionerWithRows()
+}
val arePartitionRecordsSorted = bulkInsertPartitionerRows.arePartitionRecordsSorted();
parameters.updated(HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED, arePartitionRecordsSorted.toString)
-val isGlobalIndex = SparkHoodieIndex.isGlobalIndex(writeConfig)
-val hoodieDF = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, writeConfig, df, structName, nameSpace,
+val isGlobalIndex = if (populateMetaFields) {
+SparkHoodieIndex.isGlobalIndex(writeConfig)
+} else {
+false
+}
+val hoodieDF = if (populateMetaFields) {
+HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, writeConfig, df, structName, nameSpace,
bulkInsertPartitionerRows, isGlobalIndex)
+} else {
+HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsertWithoutMetaFields(df)
+}
if (SPARK_VERSION.startsWith("2.")) {
hoodieDF.write.format("org.apache.hudi.internal")
.option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime)
.options(params)
+.mode(SaveMode.Append)
.save()
} else if (SPARK_VERSION.startsWith("3.")) {
hoodieDF.write.format("org.apache.hudi.spark3.internal")
@@ -113,6 +113,33 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
assertTrue(dataset.except(trimmedOutput).count() == 0);
}
+@Test
+public void testBulkInsertHelperNoMetaFields() {
+List<Row> rows = DataSourceTestUtils.generateRandomRows(10);
+Dataset<Row> dataset = sqlContext.createDataFrame(rows, structType);
+Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsertWithoutMetaFields(dataset);
+StructType resultSchema = result.schema();
+assertEquals(result.count(), 10);
+assertEquals(resultSchema.fieldNames().length, structType.fieldNames().length + HoodieRecord.HOODIE_META_COLUMNS.size());
+for (Map.Entry<String, Integer> entry : HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.entrySet()) {
+assertTrue(resultSchema.fieldIndex(entry.getKey()) == entry.getValue());
+}
+result.toJavaRDD().foreach(entry -> {
+assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD)).equals(""));
+assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).equals(""));
+assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD)).equals(""));
+assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.COMMIT_TIME_METADATA_FIELD)).equals(""));
+assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.FILENAME_METADATA_FIELD)).equals(""));
+});
+Dataset<Row> trimmedOutput = result.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
+.drop(HoodieRecord.FILENAME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
+assertTrue(dataset.except(trimmedOutput).count() == 0);
+}
@ParameterizedTest
@MethodSource("providePreCombineArgs")
public void testBulkInsertPreCombine(boolean enablePreCombine) {
@@ -26,6 +26,7 @@ import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.client.{SparkRDDWriteClient, TestBootstrap}
import org.apache.hudi.common.config.HoodieConfig
import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord, HoodieRecordPayload}
+import org.apache.hudi.common.table.HoodieTableConfig
import org.apache.hudi.common.testutils.HoodieTestDataGenerator
import org.apache.hudi.config.{HoodieBootstrapConfig, HoodieWriteConfig}
import org.apache.hudi.exception.HoodieException
@@ -119,15 +120,37 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
}
}
-List(BulkInsertSortMode.GLOBAL_SORT.name(), BulkInsertSortMode.NONE.name(), BulkInsertSortMode.PARTITION_SORT.name())
+List(BulkInsertSortMode.GLOBAL_SORT, BulkInsertSortMode.NONE, BulkInsertSortMode.PARTITION_SORT)
.foreach(sortMode => {
test("test_bulk_insert_for_" + sortMode) {
initSparkContext("test_bulk_insert_datasource")
val path = java.nio.file.Files.createTempDirectory("hoodie_test_path")
try {
+testBulkInsertWithSortMode(sortMode, path)
+} finally {
+spark.stop()
+FileUtils.deleteDirectory(path.toFile)
+}
+}
+})
+List(true, false)
+.foreach(populateMetaFields => {
+test("test_bulk_insert_for_populate_meta_fields_" + populateMetaFields) {
+initSparkContext("test_bulk_insert_datasource_populate_meta_fields")
+val path = java.nio.file.Files.createTempDirectory("hoodie_test_path_populate_meta_fields")
+try {
+testBulkInsertWithSortMode(BulkInsertSortMode.NONE, path, populateMetaFields)
+} finally {
+spark.stop()
+FileUtils.deleteDirectory(path.toFile)
+}
+}
+})
+def testBulkInsertWithSortMode(sortMode: BulkInsertSortMode, path: java.nio.file.Path, populateMetaFields : Boolean = true) : Unit = {
val hoodieFooTableName = "hoodie_foo_tbl"
//create a new table
val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
@@ -135,8 +158,10 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
"hoodie.bulkinsert.shuffle.parallelism" -> "4",
DataSourceWriteOptions.OPERATION_OPT_KEY.key -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
DataSourceWriteOptions.ENABLE_ROW_WRITER_OPT_KEY.key -> "true",
+HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key() -> String.valueOf(populateMetaFields),
DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
+HoodieWriteConfig.BULKINSERT_SORT_MODE.key() -> sortMode.name(),
DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
@@ -166,20 +191,60 @@ class HoodieSparkSqlWriterSuite extends FunSuite with Matchers {
// fetch all records from parquet files generated from write to hudi
val actualDf = sqlContext.read.parquet(fullPartitionPaths(0), fullPartitionPaths(1), fullPartitionPaths(2))
+val resultRows = actualDf.collectAsList()
+if (!populateMetaFields) {
+assertEquals(0, actualDf.select(HoodieRecord.HOODIE_META_COLUMNS.get(0)).filter(entry => !(entry.mkString(",").equals(""))).count())
+assertEquals(0, actualDf.select(HoodieRecord.HOODIE_META_COLUMNS.get(1)).filter(entry => !(entry.mkString(",").equals(""))).count())
+assertEquals(0, actualDf.select(HoodieRecord.HOODIE_META_COLUMNS.get(2)).filter(entry => !(entry.mkString(",").equals(""))).count())
+assertEquals(0, actualDf.select(HoodieRecord.HOODIE_META_COLUMNS.get(3)).filter(entry => !(entry.mkString(",").equals(""))).count())
+assertEquals(0, actualDf.select(HoodieRecord.HOODIE_META_COLUMNS.get(4)).filter(entry => !(entry.mkString(",").equals(""))).count())
+}
// remove metadata columns so that expected and actual DFs can be compared as is
val trimmedDf = actualDf.drop(HoodieRecord.HOODIE_META_COLUMNS.get(0)).drop(HoodieRecord.HOODIE_META_COLUMNS.get(1))
.drop(HoodieRecord.HOODIE_META_COLUMNS.get(2)).drop(HoodieRecord.HOODIE_META_COLUMNS.get(3))
.drop(HoodieRecord.HOODIE_META_COLUMNS.get(4))
assert(df.except(trimmedDf).count() == 0)
+}
+test("test disable and enable meta fields") {
+initSparkContext("test_disable_enable_meta_fields")
+val path = java.nio.file.Files.createTempDirectory("hoodie_test_path")
+try {
+testBulkInsertWithSortMode(BulkInsertSortMode.NONE, path, false)
+// enabling meta fields back should throw exception
+val hoodieFooTableName = "hoodie_foo_tbl"
+//create a new table
+val fooTableModifier = Map("path" -> path.toAbsolutePath.toString,
+HoodieWriteConfig.TABLE_NAME.key -> hoodieFooTableName,
+DataSourceWriteOptions.TABLE_TYPE_OPT_KEY.key -> DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL,
+"hoodie.bulkinsert.shuffle.parallelism" -> "4",
+DataSourceWriteOptions.OPERATION_OPT_KEY.key -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
+DataSourceWriteOptions.ENABLE_ROW_WRITER_OPT_KEY.key -> "true",
+DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY.key -> "_row_key",
+DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY.key -> "partition",
+HoodieWriteConfig.BULKINSERT_SORT_MODE.key() -> BulkInsertSortMode.NONE.name(),
+DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY.key -> "org.apache.hudi.keygen.SimpleKeyGenerator")
+val fooTableParams = HoodieWriterUtils.parametersWithWriteDefaults(fooTableModifier)
+// generate the inserts
+val schema = DataSourceTestUtils.getStructTypeExampleSchema
+val structType = AvroConversionUtils.convertAvroSchemaToStructType(schema)
+val inserts = DataSourceTestUtils.generateRandomRows(1000)
+val df = spark.createDataFrame(sc.parallelize(inserts), structType)
+try {
+// write to Hudi
+HoodieSparkSqlWriter.write(sqlContext, SaveMode.Append, fooTableParams, df)
+fail("Should have thrown exception")
+} catch {
+case e: HoodieException => assertTrue(e.getMessage.contains("hoodie.populate.meta.fields already disabled for the table. Can't be re-enabled back"))
+}
} finally {
spark.stop()
FileUtils.deleteDirectory(path.toFile)
}
}
-})
test("test drop duplicates row writing for bulk_insert") {
initSparkContext("test_append_mode")
@@ -19,6 +19,7 @@
package org.apache.hudi.internal;
import org.apache.hudi.DataSourceUtils;
+import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.config.HoodieInternalConfig;
import org.apache.hudi.config.HoodieWriteConfig;
@@ -61,12 +62,14 @@ public class DefaultSource extends BaseDefaultSource implements DataSourceV2,
String instantTime = options.get(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY).get();
String path = options.get("path").get();
String tblName = options.get(HoodieWriteConfig.TABLE_NAME.key()).get();
+boolean populateMetaFields = options.getBoolean(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(),
+Boolean.parseBoolean(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()));
// 1st arg to createHoodieConfig is not really required to be set, but passing it anyway.
HoodieWriteConfig config = DataSourceUtils.createHoodieConfig(options.get(HoodieWriteConfig.AVRO_SCHEMA.key()).get(), path, tblName, options.asMap());
boolean arePartitionRecordsSorted = HoodieInternalConfig.getBulkInsertIsPartitionRecordsSorted(
options.get(HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED).isPresent()
? options.get(HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED).get() : null);
return Optional.of(new HoodieDataSourceInternalWriter(instantTime, config, schema, getSparkSession(),
-getConfiguration(), options, arePartitionRecordsSorted));
+getConfiguration(), options, populateMetaFields, arePartitionRecordsSorted));
}
}
@@ -37,9 +37,9 @@ public class HoodieBulkInsertDataInternalWriter implements DataWriter<InternalRo
public HoodieBulkInsertDataInternalWriter(HoodieTable hoodieTable, HoodieWriteConfig writeConfig,
String instantTime, int taskPartitionId, long taskId, long taskEpochId,
-StructType structType, boolean arePartitionRecordsSorted) {
+StructType structType, boolean populateMetaFields, boolean arePartitionRecordsSorted) {
this.bulkInsertWriterHelper = new BulkInsertDataInternalWriterHelper(hoodieTable,
-writeConfig, instantTime, taskPartitionId, taskId, taskEpochId, structType, arePartitionRecordsSorted);
+writeConfig, instantTime, taskPartitionId, taskId, taskEpochId, structType, populateMetaFields, arePartitionRecordsSorted);
}
@Override
@@ -36,19 +36,22 @@ public class HoodieBulkInsertDataInternalWriterFactory implements DataWriterFactory
   private final HoodieWriteConfig writeConfig;
   private final StructType structType;
   private final boolean arePartitionRecordsSorted;
+  private final boolean populateMetaFields;

   public HoodieBulkInsertDataInternalWriterFactory(HoodieTable hoodieTable, HoodieWriteConfig writeConfig,
-                                                   String instantTime, StructType structType, boolean arePartitionRecordsSorted) {
+                                                   String instantTime, StructType structType, boolean populateMetaFields,
+                                                   boolean arePartitionRecordsSorted) {
     this.hoodieTable = hoodieTable;
     this.writeConfig = writeConfig;
     this.instantTime = instantTime;
     this.structType = structType;
+    this.populateMetaFields = populateMetaFields;
     this.arePartitionRecordsSorted = arePartitionRecordsSorted;
   }

   @Override
   public DataWriter<InternalRow> createDataWriter(int partitionId, long taskId, long epochId) {
     return new HoodieBulkInsertDataInternalWriter(hoodieTable, writeConfig, instantTime, partitionId, taskId, epochId,
-        structType, arePartitionRecordsSorted);
+        structType, populateMetaFields, arePartitionRecordsSorted);
   }
 }
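The flag rides on the factory because the factory is what Spark serializes to executors; each task then builds its writer from the captured fields. A toy illustration of that pattern (not Hudi code; the class and method names are made up):

    import java.io.Serializable;

    class ExampleWriterFactory implements Serializable {
      interface RowWriter {
        void write(String row);
      }

      // Captured once on the driver, shipped to every executor with the factory.
      private final boolean populateMetaFields;

      ExampleWriterFactory(boolean populateMetaFields) {
        this.populateMetaFields = populateMetaFields;
      }

      // Spark invokes the equivalent of this once per task on each executor.
      RowWriter createWriter(int partitionId, long taskId) {
        return row -> System.out.println(
            (populateMetaFields ? "[meta] " : "") + "task-" + taskId + ": " + row);
      }
    }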

View File

@@ -49,15 +49,17 @@ public class HoodieDataSourceInternalWriter implements DataSourceWriter {
   private final HoodieWriteConfig writeConfig;
   private final StructType structType;
   private final DataSourceInternalWriterHelper dataSourceInternalWriterHelper;
+  private final boolean populateMetaFields;
   private final Boolean arePartitionRecordsSorted;
   private Map<String, String> extraMetadataMap = new HashMap<>();

   public HoodieDataSourceInternalWriter(String instantTime, HoodieWriteConfig writeConfig, StructType structType,
                                         SparkSession sparkSession, Configuration configuration, DataSourceOptions dataSourceOptions,
-                                        boolean arePartitionRecordsSorted) {
+                                        boolean populateMetaFields, boolean arePartitionRecordsSorted) {
     this.instantTime = instantTime;
     this.writeConfig = writeConfig;
     this.structType = structType;
+    this.populateMetaFields = populateMetaFields;
     this.arePartitionRecordsSorted = arePartitionRecordsSorted;
     this.extraMetadataMap = DataSourceUtils.getExtraMetadata(dataSourceOptions.asMap());
     this.dataSourceInternalWriterHelper = new DataSourceInternalWriterHelper(instantTime, writeConfig, structType,
@@ -69,7 +71,7 @@ public class HoodieDataSourceInternalWriter implements DataSourceWriter {
     dataSourceInternalWriterHelper.createInflightCommit();
     if (WriteOperationType.BULK_INSERT == dataSourceInternalWriterHelper.getWriteOperationType()) {
       return new HoodieBulkInsertDataInternalWriterFactory(dataSourceInternalWriterHelper.getHoodieTable(),
-          writeConfig, instantTime, structType, arePartitionRecordsSorted);
+          writeConfig, instantTime, structType, populateMetaFields, arePartitionRecordsSorted);
     } else {
       throw new IllegalArgumentException("Write Operation Type " + dataSourceInternalWriterHelper.getWriteOperationType() + " not supported");
     }

View File

@@ -38,7 +38,6 @@ import java.util.stream.Stream;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.ENCODER;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.STRUCT_TYPE;
-import static org.apache.hudi.testutils.SparkDatasetTestUtils.getConfigBuilder;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.getInternalRowWithError;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.getRandomRows;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.toInternalRows;
@@ -51,6 +50,16 @@ public class TestHoodieBulkInsertDataInternalWriter extends
     HoodieBulkInsertInternalWriterTestBase {

   private static Stream<Arguments> configParams() {
+    Object[][] data = new Object[][] {
+        {true, true},
+        {true, false},
+        {false, true},
+        {false, false}
+    };
+    return Stream.of(data).map(Arguments::of);
+  }
+
+  private static Stream<Arguments> bulkInsertTypeParams() {
     Object[][] data = new Object[][] {
         {true},
         {false}
@@ -60,16 +69,16 @@ public class TestHoodieBulkInsertDataInternalWriter extends
   @ParameterizedTest
   @MethodSource("configParams")
-  public void testDataInternalWriter(boolean sorted) throws Exception {
+  public void testDataInternalWriter(boolean sorted, boolean populateMetaFields) throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
     HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
     // execute N rounds
     for (int i = 0; i < 3; i++) {
       String instantTime = "00" + i;
       // init writer
-      HoodieBulkInsertDataInternalWriter writer = new HoodieBulkInsertDataInternalWriter(table, cfg, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), STRUCT_TYPE,
-          sorted);
+      HoodieBulkInsertDataInternalWriter writer = new HoodieBulkInsertDataInternalWriter(table, cfg, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(),
+          STRUCT_TYPE, populateMetaFields, sorted);

       int size = 10 + RANDOM.nextInt(1000);
       // write N rows to partition1, N rows to partition2 and N rows to partition3 ... each batch should create a new RowCreateHandle and a new file
@@ -96,11 +105,10 @@ public class TestHoodieBulkInsertDataInternalWriter extends
       // verify rows
       Dataset<Row> result = sqlContext.read().parquet(fileAbsPaths.get().toArray(new String[0]));
-      assertOutput(totalInputRows, result, instantTime, fileNames);
+      assertOutput(totalInputRows, result, instantTime, fileNames, populateMetaFields);
     }
   }

   /**
    * Issue some corrupted or wrongly schematized InternalRows after a few valid InternalRows so that a global error is thrown: write batch 1 of valid records, then batch 2 of invalid records, which is expected
    * to throw a global error. Verify the global error is set appropriately and that only the first batch of records is written to disk.
@@ -108,13 +116,13 @@ public class TestHoodieBulkInsertDataInternalWriter extends
   @Test
   public void testGlobalFailure() throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(true);
     HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
     String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];

     String instantTime = "001";
-    HoodieBulkInsertDataInternalWriter writer = new HoodieBulkInsertDataInternalWriter(table, cfg, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), STRUCT_TYPE,
-        false);
+    HoodieBulkInsertDataInternalWriter writer = new HoodieBulkInsertDataInternalWriter(table, cfg, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(),
+        STRUCT_TYPE, true, false);

     int size = 10 + RANDOM.nextInt(100);
     int totalFailures = 5;
@@ -150,7 +158,7 @@ public class TestHoodieBulkInsertDataInternalWriter extends
     // verify rows
     Dataset<Row> result = sqlContext.read().parquet(fileAbsPaths.get().toArray(new String[0]));
-    assertOutput(inputRows, result, instantTime, fileNames);
+    assertOutput(inputRows, result, instantTime, fileNames, true);
   }

   private void writeRows(Dataset<Row> inputRows, HoodieBulkInsertDataInternalWriter writer)
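The configParams/bulkInsertTypeParams pattern above is plain JUnit 5 parameterization; here is a self-contained sketch of the same mechanism detached from Hudi (the test class and its assertion are illustrative only):

    import java.util.stream.Stream;
    import org.junit.jupiter.params.ParameterizedTest;
    import org.junit.jupiter.params.provider.Arguments;
    import org.junit.jupiter.params.provider.MethodSource;
    import static org.junit.jupiter.api.Assertions.assertNotNull;

    class ParamStyleTest {
      // Mirrors configParams(): the full sorted x populateMetaFields matrix.
      private static Stream<Arguments> configParams() {
        return Stream.of(new Object[][] {
            {true, true}, {true, false}, {false, true}, {false, false}
        }).map(Arguments::of);
      }

      @ParameterizedTest
      @MethodSource("configParams")
      void runsOncePerCombination(boolean sorted, boolean populateMetaFields) {
        // Each invocation receives one row of the matrix.
        assertNotNull(Boolean.valueOf(sorted && populateMetaFields));
      }
    }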

View File

@@ -31,6 +31,9 @@ import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 import org.apache.spark.sql.sources.v2.writer.DataWriter;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;

 import java.util.ArrayList;
 import java.util.Arrays;
@@ -38,10 +41,10 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Stream;

 import static org.apache.hudi.testutils.SparkDatasetTestUtils.ENCODER;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.STRUCT_TYPE;
-import static org.apache.hudi.testutils.SparkDatasetTestUtils.getConfigBuilder;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.getRandomRows;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.toInternalRows;
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -53,18 +56,28 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 public class TestHoodieDataSourceInternalWriter extends
     HoodieBulkInsertInternalWriterTestBase {

-  @Test
-  public void testDataSourceWriter() throws Exception {
-    testDataSourceWriterInternal(Collections.EMPTY_MAP, Collections.EMPTY_MAP);
+  private static Stream<Arguments> bulkInsertTypeParams() {
+    Object[][] data = new Object[][] {
+        {true},
+        {false}
+    };
+    return Stream.of(data).map(Arguments::of);
   }

-  private void testDataSourceWriterInternal(Map<String, String> extraMetadata, Map<String, String> expectedExtraMetadata) throws Exception {
+  @ParameterizedTest
+  @MethodSource("bulkInsertTypeParams")
+  public void testDataSourceWriter(boolean populateMetaFields) throws Exception {
+    testDataSourceWriterInternal(Collections.EMPTY_MAP, Collections.EMPTY_MAP, populateMetaFields);
+  }
+
+  private void testDataSourceWriterInternal(Map<String, String> extraMetadata, Map<String, String> expectedExtraMetadata, boolean populateMetaFields)
+      throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
     String instantTime = "001";
     // init writer
     HoodieDataSourceInternalWriter dataSourceInternalWriter =
-        new HoodieDataSourceInternalWriter(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(extraMetadata), false);
+        new HoodieDataSourceInternalWriter(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(extraMetadata), populateMetaFields, false);
     DataWriter<InternalRow> writer = dataSourceInternalWriter.createWriterFactory().createDataWriter(0, RANDOM.nextLong(), RANDOM.nextLong());

     String[] partitionPaths = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS;
@@ -95,7 +108,7 @@ public class TestHoodieDataSourceInternalWriter extends
     metaClient.reloadActiveTimeline();
     Dataset<Row> result = HoodieClientTestUtils.read(jsc, basePath, sqlContext, metaClient.getFs(), partitionPathsAbs.toArray(new String[0]));
     // verify output
-    assertOutput(totalInputRows, result, instantTime, Option.empty());
+    assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
     assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());

     // verify extra metadata
@@ -122,7 +135,7 @@ public class TestHoodieDataSourceInternalWriter extends
     expectedMetadata.remove(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX_OPT_KEY().key());
     expectedMetadata.remove("commit_extra_c");

-    testDataSourceWriterInternal(extraMeta, expectedMetadata);
+    testDataSourceWriterInternal(extraMeta, expectedMetadata, true);
   }

   @Test
@@ -134,13 +147,14 @@ public class TestHoodieDataSourceInternalWriter extends
     extraMeta.put("keyB", "valB");
     extraMeta.put("commit_extra_c", "valC");
     // none of the keys has the commit metadata key prefix.
-    testDataSourceWriterInternal(extraMeta, Collections.EMPTY_MAP);
+    testDataSourceWriterInternal(extraMeta, Collections.EMPTY_MAP, true);
   }

-  @Test
-  public void testMultipleDataSourceWrites() throws Exception {
+  @ParameterizedTest
+  @MethodSource("bulkInsertTypeParams")
+  public void testMultipleDataSourceWrites(boolean populateMetaFields) throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
     int partitionCounter = 0;

     // execute N rounds
@@ -148,7 +162,7 @@ public class TestHoodieDataSourceInternalWriter extends
       String instantTime = "00" + i;
       // init writer
       HoodieDataSourceInternalWriter dataSourceInternalWriter =
-          new HoodieDataSourceInternalWriter(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(Collections.EMPTY_MAP), false);
+          new HoodieDataSourceInternalWriter(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(Collections.EMPTY_MAP), populateMetaFields, false);
       List<HoodieWriterCommitMessage> commitMessages = new ArrayList<>();
       Dataset<Row> totalInputRows = null;
       DataWriter<InternalRow> writer = dataSourceInternalWriter.createWriterFactory().createDataWriter(partitionCounter++, RANDOM.nextLong(), RANDOM.nextLong());
@@ -172,18 +186,20 @@ public class TestHoodieDataSourceInternalWriter extends
       dataSourceInternalWriter.commit(commitMessages.toArray(new HoodieWriterCommitMessage[0]));
       metaClient.reloadActiveTimeline();
-      Dataset<Row> result = HoodieClientTestUtils.readCommit(basePath, sqlContext, metaClient.getCommitTimeline(), instantTime);
+      Dataset<Row> result = HoodieClientTestUtils.readCommit(basePath, sqlContext, metaClient.getCommitTimeline(), instantTime,
+          populateMetaFields);
       // verify output
-      assertOutput(totalInputRows, result, instantTime, Option.empty());
+      assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
       assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());
     }
   }

-  @Test
-  public void testLargeWrites() throws Exception {
+  @ParameterizedTest
+  @MethodSource("bulkInsertTypeParams")
+  public void testLargeWrites(boolean populateMetaFields) throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
     int partitionCounter = 0;

     // execute N rounds
@@ -191,7 +207,7 @@ public class TestHoodieDataSourceInternalWriter extends
       String instantTime = "00" + i;
       // init writer
       HoodieDataSourceInternalWriter dataSourceInternalWriter =
-          new HoodieDataSourceInternalWriter(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(Collections.EMPTY_MAP), false);
+          new HoodieDataSourceInternalWriter(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(Collections.EMPTY_MAP), populateMetaFields, false);
       List<HoodieWriterCommitMessage> commitMessages = new ArrayList<>();
       Dataset<Row> totalInputRows = null;
       DataWriter<InternalRow> writer = dataSourceInternalWriter.createWriterFactory().createDataWriter(partitionCounter++, RANDOM.nextLong(), RANDOM.nextLong());
@@ -215,10 +231,11 @@ public class TestHoodieDataSourceInternalWriter extends
       dataSourceInternalWriter.commit(commitMessages.toArray(new HoodieWriterCommitMessage[0]));
       metaClient.reloadActiveTimeline();
-      Dataset<Row> result = HoodieClientTestUtils.readCommit(basePath, sqlContext, metaClient.getCommitTimeline(), instantTime);
+      Dataset<Row> result = HoodieClientTestUtils.readCommit(basePath, sqlContext, metaClient.getCommitTimeline(), instantTime,
+          populateMetaFields);
       // verify output
-      assertOutput(totalInputRows, result, instantTime, Option.empty());
+      assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
       assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());
     }
   }
@@ -229,15 +246,16 @@ public class TestHoodieDataSourceInternalWriter extends
    * abort batch2
    * verify that only records from batch1 are available to read
    */
-  @Test
-  public void testAbort() throws Exception {
+  @ParameterizedTest
+  @MethodSource("bulkInsertTypeParams")
+  public void testAbort(boolean populateMetaFields) throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
     String instantTime0 = "00" + 0;
     // init writer
     HoodieDataSourceInternalWriter dataSourceInternalWriter =
-        new HoodieDataSourceInternalWriter(instantTime0, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(Collections.EMPTY_MAP), false);
+        new HoodieDataSourceInternalWriter(instantTime0, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(Collections.EMPTY_MAP), populateMetaFields, false);
     DataWriter<InternalRow> writer = dataSourceInternalWriter.createWriterFactory().createDataWriter(0, RANDOM.nextLong(), RANDOM.nextLong());

     List<String> partitionPaths = Arrays.asList(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS);
@@ -269,13 +287,13 @@ public class TestHoodieDataSourceInternalWriter extends
     metaClient.reloadActiveTimeline();
     Dataset<Row> result = HoodieClientTestUtils.read(jsc, basePath, sqlContext, metaClient.getFs(), partitionPathsAbs.toArray(new String[0]));
     // verify rows
-    assertOutput(totalInputRows, result, instantTime0, Option.empty());
+    assertOutput(totalInputRows, result, instantTime0, Option.empty(), populateMetaFields);
     assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());

     // 2nd batch. abort in the end
     String instantTime1 = "00" + 1;
     dataSourceInternalWriter =
-        new HoodieDataSourceInternalWriter(instantTime1, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(Collections.EMPTY_MAP), false);
+        new HoodieDataSourceInternalWriter(instantTime1, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, new DataSourceOptions(Collections.EMPTY_MAP), populateMetaFields, false);
     writer = dataSourceInternalWriter.createWriterFactory().createDataWriter(1, RANDOM.nextLong(), RANDOM.nextLong());

     for (int j = 0; j < batches; j++) {
@@ -293,7 +311,7 @@ public class TestHoodieDataSourceInternalWriter extends
     result = HoodieClientTestUtils.read(jsc, basePath, sqlContext, metaClient.getFs(), partitionPathsAbs.toArray(new String[0]));
     // verify rows
     // only rows from the first batch should be present
-    assertOutput(totalInputRows, result, instantTime0, Option.empty());
+    assertOutput(totalInputRows, result, instantTime0, Option.empty(), populateMetaFields);
   }

   private void writeRows(Dataset<Row> inputRows, DataWriter<InternalRow> writer) throws Exception {
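assertOutput's body is outside this diff. For intuition, one plausible shape for a populateMetaFields-aware check is below; the helper name is made up, "_hoodie_commit_time" is one of the standard Hudi meta columns, and whether a disabled meta column is absent or merely empty depends on writer internals this diff does not show:

    import java.util.Arrays;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    class MetaFieldAssertions {
      // Hypothetical check: meta column presence should track the flag.
      static void verifyMetaColumns(Dataset<Row> result, boolean populateMetaFields) {
        boolean hasCommitTime = Arrays.asList(result.columns()).contains("_hoodie_commit_time");
        if (populateMetaFields != hasCommitTime) {
          throw new AssertionError("meta column presence does not match populateMetaFields=" + populateMetaFields);
        }
      }
    }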

View File

@@ -19,6 +19,7 @@
 package org.apache.hudi.spark3.internal;

 import org.apache.hudi.DataSourceUtils;
+import org.apache.hudi.common.table.HoodieTableConfig;
 import org.apache.hudi.config.HoodieInternalConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.internal.BaseDefaultSource;
@@ -48,11 +49,13 @@ public class DefaultSource extends BaseDefaultSource implements TableProvider {
     String instantTime = properties.get(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY);
     String path = properties.get("path");
     String tblName = properties.get(HoodieWriteConfig.TABLE_NAME.key());
+    boolean populateMetaFields = Boolean.parseBoolean(properties.getOrDefault(HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.key(),
+        HoodieTableConfig.HOODIE_POPULATE_META_FIELDS.defaultValue()));
     boolean arePartitionRecordsSorted = Boolean.parseBoolean(properties.getOrDefault(HoodieInternalConfig.BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED,
         Boolean.toString(HoodieInternalConfig.DEFAULT_BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED)));
     // 1st arg to createHoodieConfig is not really required to be set, but passing it anyway.
     HoodieWriteConfig config = DataSourceUtils.createHoodieConfig(properties.get(HoodieWriteConfig.AVRO_SCHEMA.key()), path, tblName, properties);
     return new HoodieDataSourceInternalTable(instantTime, config, schema, getSparkSession(),
-        getConfiguration(), properties, arePartitionRecordsSorted);
+        getConfiguration(), properties, populateMetaFields, arePartitionRecordsSorted);
   }
 }
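Note that the two engines read the same flag through different option surfaces: the Spark 2 path uses DataSourceOptions with a typed getBoolean, while the Spark 3 path above receives a plain Map and parses the string itself. A minimal sketch of the Map-based variant (generic code; names are illustrative):

    import java.util.Map;

    final class FlagReading {
      private FlagReading() {}

      // Spark 3 style: plain properties map, string default, manual parse.
      static boolean readBooleanFlag(Map<String, String> properties, String key, String defaultValue) {
        return Boolean.parseBoolean(properties.getOrDefault(key, defaultValue));
      }
    }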

View File

@@ -37,9 +37,10 @@ public class HoodieBulkInsertDataInternalWriter implements DataWriter<InternalRow
   private final BulkInsertDataInternalWriterHelper bulkInsertWriterHelper;

   public HoodieBulkInsertDataInternalWriter(HoodieTable hoodieTable, HoodieWriteConfig writeConfig,
-      String instantTime, int taskPartitionId, long taskId, StructType structType, boolean arePartitionRecordsSorted) {
+      String instantTime, int taskPartitionId, long taskId, StructType structType, boolean populateMetaFields,
+      boolean arePartitionRecordsSorted) {
     this.bulkInsertWriterHelper = new BulkInsertDataInternalWriterHelper(hoodieTable,
-        writeConfig, instantTime, taskPartitionId, taskId, 0, structType, arePartitionRecordsSorted);
+        writeConfig, instantTime, taskPartitionId, taskId, 0, structType, populateMetaFields, arePartitionRecordsSorted);
   }

   @Override

View File

@@ -35,20 +35,23 @@ public class HoodieBulkInsertDataInternalWriterFactory implements DataWriterFactory
   private final HoodieTable hoodieTable;
   private final HoodieWriteConfig writeConfig;
   private final StructType structType;
+  private final boolean populateMetaFields;
   private final boolean arePartitionRecordsSorted;

   public HoodieBulkInsertDataInternalWriterFactory(HoodieTable hoodieTable, HoodieWriteConfig writeConfig,
-                                                   String instantTime, StructType structType, boolean arePartitionRecordsSorted) {
+                                                   String instantTime, StructType structType, boolean populateMetaFields,
+                                                   boolean arePartitionRecordsSorted) {
     this.hoodieTable = hoodieTable;
     this.writeConfig = writeConfig;
     this.instantTime = instantTime;
     this.structType = structType;
+    this.populateMetaFields = populateMetaFields;
     this.arePartitionRecordsSorted = arePartitionRecordsSorted;
   }

   @Override
   public DataWriter<InternalRow> createWriter(int partitionId, long taskId) {
     return new HoodieBulkInsertDataInternalWriter(hoodieTable, writeConfig, instantTime, partitionId, taskId,
-        structType, arePartitionRecordsSorted);
+        structType, populateMetaFields, arePartitionRecordsSorted);
   }
 }

View File

@@ -27,10 +27,10 @@ import org.apache.hudi.internal.DataSourceInternalWriterHelper;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.connector.write.DataWriterFactory;
-import org.apache.spark.sql.connector.write.WriterCommitMessage;
 import org.apache.spark.sql.connector.write.BatchWrite;
+import org.apache.spark.sql.connector.write.DataWriterFactory;
 import org.apache.spark.sql.connector.write.PhysicalWriteInfo;
+import org.apache.spark.sql.connector.write.WriterCommitMessage;
 import org.apache.spark.sql.types.StructType;

 import java.util.Arrays;
@@ -49,14 +49,16 @@ public class HoodieDataSourceInternalBatchWrite implements BatchWrite {
   private final HoodieWriteConfig writeConfig;
   private final StructType structType;
   private final boolean arePartitionRecordsSorted;
+  private final boolean populateMetaFields;
   private final DataSourceInternalWriterHelper dataSourceInternalWriterHelper;
   private Map<String, String> extraMetadata = new HashMap<>();

   public HoodieDataSourceInternalBatchWrite(String instantTime, HoodieWriteConfig writeConfig, StructType structType,
-      SparkSession jss, Configuration hadoopConfiguration, Map<String, String> properties, boolean arePartitionRecordsSorted) {
+      SparkSession jss, Configuration hadoopConfiguration, Map<String, String> properties, boolean populateMetaFields, boolean arePartitionRecordsSorted) {
     this.instantTime = instantTime;
     this.writeConfig = writeConfig;
     this.structType = structType;
+    this.populateMetaFields = populateMetaFields;
     this.arePartitionRecordsSorted = arePartitionRecordsSorted;
     this.extraMetadata = DataSourceUtils.getExtraMetadata(properties);
     this.dataSourceInternalWriterHelper = new DataSourceInternalWriterHelper(instantTime, writeConfig, structType,
@@ -68,7 +70,7 @@ public class HoodieDataSourceInternalBatchWrite implements BatchWrite {
     dataSourceInternalWriterHelper.createInflightCommit();
     if (WriteOperationType.BULK_INSERT == dataSourceInternalWriterHelper.getWriteOperationType()) {
       return new HoodieBulkInsertDataInternalWriterFactory(dataSourceInternalWriterHelper.getHoodieTable(),
-          writeConfig, instantTime, structType, arePartitionRecordsSorted);
+          writeConfig, instantTime, structType, populateMetaFields, arePartitionRecordsSorted);
     } else {
       throw new IllegalArgumentException("Write Operation Type " + dataSourceInternalWriterHelper.getWriteOperationType() + " not supported");
     }
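The BatchWrite contract that the abort test further below relies on, in outline: commit(messages) is called once when all tasks succeed, abort(messages) on any failure, and the abort path must clean up the inflight instant. A generic sketch of that control flow (not Spark or Hudi code):

    final class BatchWriteOutline {
      private BatchWriteOutline() {}

      // doWrite runs all tasks; commit/abort finalize or roll back the attempt.
      static void runBatch(Runnable doWrite, Runnable commit, Runnable abort) {
        try {
          doWrite.run();
          commit.run();   // success: finalize the instant exactly once
        } catch (RuntimeException e) {
          abort.run();    // failure: roll back the inflight instant
          throw e;
        }
      }
    }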

View File

@@ -39,23 +39,26 @@ public class HoodieDataSourceInternalBatchWriteBuilder implements WriteBuilder {
   private final StructType structType;
   private final SparkSession jss;
   private final Configuration hadoopConfiguration;
-  private final boolean arePartitionRecordsSorted;
   private final Map<String, String> properties;
+  private final boolean populateMetaFields;
+  private final boolean arePartitionRecordsSorted;

   public HoodieDataSourceInternalBatchWriteBuilder(String instantTime, HoodieWriteConfig writeConfig, StructType structType,
-      SparkSession jss, Configuration hadoopConfiguration, Map<String, String> properties, boolean arePartitionRecordsSorted) {
+      SparkSession jss, Configuration hadoopConfiguration, Map<String, String> properties, boolean populateMetaFields,
+      boolean arePartitionRecordsSorted) {
     this.instantTime = instantTime;
     this.writeConfig = writeConfig;
     this.structType = structType;
     this.jss = jss;
     this.hadoopConfiguration = hadoopConfiguration;
-    this.arePartitionRecordsSorted = arePartitionRecordsSorted;
     this.properties = properties;
+    this.populateMetaFields = populateMetaFields;
+    this.arePartitionRecordsSorted = arePartitionRecordsSorted;
   }

   @Override
   public BatchWrite buildForBatch() {
     return new HoodieDataSourceInternalBatchWrite(instantTime, writeConfig, structType, jss,
-        hadoopConfiguration, properties, arePartitionRecordsSorted);
+        hadoopConfiguration, properties, populateMetaFields, arePartitionRecordsSorted);
   }
 }

View File

@@ -18,9 +18,9 @@
 package org.apache.hudi.spark3.internal;

-import org.apache.hadoop.conf.Configuration;
 import org.apache.hudi.config.HoodieWriteConfig;

+import org.apache.hadoop.conf.Configuration;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.connector.catalog.SupportsWrite;
 import org.apache.spark.sql.connector.catalog.TableCapability;
@@ -44,17 +44,19 @@ class HoodieDataSourceInternalTable implements SupportsWrite {
   private final Configuration hadoopConfiguration;
   private final boolean arePartitionRecordsSorted;
   private final Map<String, String> properties;
+  private final boolean populateMetaFields;

   public HoodieDataSourceInternalTable(String instantTime, HoodieWriteConfig config,
       StructType schema, SparkSession jss, Configuration hadoopConfiguration, Map<String, String> properties,
-      boolean arePartitionRecordsSorted) {
+      boolean populateMetaFields, boolean arePartitionRecordsSorted) {
     this.instantTime = instantTime;
     this.writeConfig = config;
     this.structType = schema;
     this.jss = jss;
     this.hadoopConfiguration = hadoopConfiguration;
-    this.arePartitionRecordsSorted = arePartitionRecordsSorted;
     this.properties = properties;
+    this.populateMetaFields = populateMetaFields;
+    this.arePartitionRecordsSorted = arePartitionRecordsSorted;
   }

   @Override
@@ -69,7 +71,8 @@ class HoodieDataSourceInternalTable implements SupportsWrite {
   @Override
   public Set<TableCapability> capabilities() {
-    return new HashSet<TableCapability>() {{
+    return new HashSet<TableCapability>() {
+      {
         add(TableCapability.BATCH_WRITE);
         add(TableCapability.TRUNCATE);
       }
@@ -79,6 +82,6 @@ class HoodieDataSourceInternalTable implements SupportsWrite {
   @Override
   public WriteBuilder newWriteBuilder(LogicalWriteInfo logicalWriteInfo) {
     return new HoodieDataSourceInternalBatchWriteBuilder(instantTime, writeConfig, structType, jss,
-        hadoopConfiguration, properties, arePartitionRecordsSorted);
+        hadoopConfiguration, properties, populateMetaFields, arePartitionRecordsSorted);
   }
 }
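The capabilities() change above only reflows the double-brace (anonymous HashSet subclass) initializer. For comparison, the same set can be built without creating a subclass per call site; this is an alternative sketch, not what the patch does:

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.spark.sql.connector.catalog.TableCapability;

    final class CapabilityHelper {
      private CapabilityHelper() {}

      // Same contents as the double-brace block, without the anonymous subclass.
      static Set<TableCapability> batchWriteCapabilities() {
        return new HashSet<>(Arrays.asList(TableCapability.BATCH_WRITE, TableCapability.TRUNCATE));
      }
    }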

View File

@@ -39,7 +39,6 @@ import java.util.stream.Stream;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.ENCODER;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.STRUCT_TYPE;
-import static org.apache.hudi.testutils.SparkDatasetTestUtils.getConfigBuilder;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.getInternalRowWithError;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.getRandomRows;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.toInternalRows;
@@ -52,6 +51,16 @@ public class TestHoodieBulkInsertDataInternalWriter extends
     HoodieBulkInsertInternalWriterTestBase {

   private static Stream<Arguments> configParams() {
+    Object[][] data = new Object[][] {
+        {true, true},
+        {true, false},
+        {false, true},
+        {false, false}
+    };
+    return Stream.of(data).map(Arguments::of);
+  }
+
+  private static Stream<Arguments> bulkInsertTypeParams() {
     Object[][] data = new Object[][] {
         {true},
         {false}
@@ -61,16 +70,16 @@ public class TestHoodieBulkInsertDataInternalWriter extends
   @ParameterizedTest
   @MethodSource("configParams")
-  public void testDataInternalWriter(boolean sorted) throws Exception {
+  public void testDataInternalWriter(boolean sorted, boolean populateMetaFields) throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
     HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
     // execute N rounds
     for (int i = 0; i < 5; i++) {
       String instantTime = "00" + i;
       // init writer
-      HoodieBulkInsertDataInternalWriter writer = new HoodieBulkInsertDataInternalWriter(table, cfg, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), STRUCT_TYPE,
-          sorted);
+      HoodieBulkInsertDataInternalWriter writer = new HoodieBulkInsertDataInternalWriter(table, cfg, instantTime, RANDOM.nextInt(100000),
+          RANDOM.nextLong(), STRUCT_TYPE, populateMetaFields, sorted);

       int size = 10 + RANDOM.nextInt(1000);
       // write N rows to partition1, N rows to partition2 and N rows to partition3 ... each batch should create a new RowCreateHandle and a new file
@@ -97,7 +106,7 @@ public class TestHoodieBulkInsertDataInternalWriter extends
       // verify rows
       Dataset<Row> result = sqlContext.read().parquet(fileAbsPaths.get().toArray(new String[0]));
-      assertOutput(totalInputRows, result, instantTime, fileNames);
+      assertOutput(totalInputRows, result, instantTime, fileNames, populateMetaFields);
     }
   }
@@ -109,12 +118,13 @@ public class TestHoodieBulkInsertDataInternalWriter extends
   @Test
   public void testGlobalFailure() throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(true);
     HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
     String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];

     String instantTime = "001";
-    HoodieBulkInsertDataInternalWriter writer = new HoodieBulkInsertDataInternalWriter(table, cfg, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), STRUCT_TYPE, false);
+    HoodieBulkInsertDataInternalWriter writer = new HoodieBulkInsertDataInternalWriter(table, cfg, instantTime, RANDOM.nextInt(100000),
+        RANDOM.nextLong(), STRUCT_TYPE, true, false);

     int size = 10 + RANDOM.nextInt(100);
     int totalFailures = 5;
@@ -150,7 +160,7 @@ public class TestHoodieBulkInsertDataInternalWriter extends
     // verify rows
     Dataset<Row> result = sqlContext.read().parquet(fileAbsPaths.get().toArray(new String[0]));
-    assertOutput(inputRows, result, instantTime, fileNames);
+    assertOutput(inputRows, result, instantTime, fileNames, true);
   }

   private void writeRows(Dataset<Row> inputRows, HoodieBulkInsertDataInternalWriter writer)

View File

@@ -33,6 +33,9 @@ import org.apache.spark.sql.Row;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.connector.write.DataWriter;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;

 import java.util.ArrayList;
 import java.util.Arrays;
@@ -40,10 +43,10 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Stream;

 import static org.apache.hudi.testutils.SparkDatasetTestUtils.ENCODER;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.STRUCT_TYPE;
-import static org.apache.hudi.testutils.SparkDatasetTestUtils.getConfigBuilder;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.getRandomRows;
 import static org.apache.hudi.testutils.SparkDatasetTestUtils.toInternalRows;
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -55,19 +58,28 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 public class TestHoodieDataSourceInternalBatchWrite extends
     HoodieBulkInsertInternalWriterTestBase {

-  @Test
-  public void testDataSourceWriter() throws Exception {
-    testDataSourceWriterInternal(Collections.EMPTY_MAP, Collections.EMPTY_MAP);
+  private static Stream<Arguments> bulkInsertTypeParams() {
+    Object[][] data = new Object[][] {
+        {true},
+        {false}
+    };
+    return Stream.of(data).map(Arguments::of);
   }

-  private void testDataSourceWriterInternal(Map<String, String> extraMetadata, Map<String, String> expectedExtraMetadata) throws Exception {
+  @ParameterizedTest
+  @MethodSource("bulkInsertTypeParams")
+  public void testDataSourceWriter(boolean populateMetaFields) throws Exception {
+    testDataSourceWriterInternal(Collections.EMPTY_MAP, Collections.EMPTY_MAP, populateMetaFields);
+  }
+
+  private void testDataSourceWriterInternal(Map<String, String> extraMetadata, Map<String, String> expectedExtraMetadata, boolean populateMetaFields) throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
     HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
     String instantTime = "001";
     // init writer
     HoodieDataSourceInternalBatchWrite dataSourceInternalBatchWrite =
-        new HoodieDataSourceInternalBatchWrite(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, extraMetadata, false);
+        new HoodieDataSourceInternalBatchWrite(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, extraMetadata, populateMetaFields, false);
     DataWriter<InternalRow> writer = dataSourceInternalBatchWrite.createBatchWriterFactory(null).createWriter(0, RANDOM.nextLong());

     String[] partitionPaths = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS;
@@ -99,7 +111,7 @@ public class TestHoodieDataSourceInternalBatchWrite extends
     metaClient.reloadActiveTimeline();
     Dataset<Row> result = HoodieClientTestUtils.read(jsc, basePath, sqlContext, metaClient.getFs(), partitionPathsAbs.toArray(new String[0]));
     // verify output
-    assertOutput(totalInputRows, result, instantTime, Option.empty());
+    assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
     assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());

     // verify extra metadata
@@ -125,7 +137,7 @@ public class TestHoodieDataSourceInternalBatchWrite extends
     expectedMetadata.remove(DataSourceWriteOptions.COMMIT_METADATA_KEYPREFIX_OPT_KEY().key());
     expectedMetadata.remove("commit_extra_c");

-    testDataSourceWriterInternal(extraMeta, expectedMetadata);
+    testDataSourceWriterInternal(extraMeta, expectedMetadata, true);
   }

   @Test
@@ -137,13 +149,14 @@ public class TestHoodieDataSourceInternalBatchWrite extends
     extraMeta.put("keyB", "valB");
     extraMeta.put("commit_extra_c", "valC");
     // none of the keys has the commit metadata key prefix.
-    testDataSourceWriterInternal(extraMeta, Collections.EMPTY_MAP);
+    testDataSourceWriterInternal(extraMeta, Collections.EMPTY_MAP, true);
   }

-  @Test
-  public void testMultipleDataSourceWrites() throws Exception {
+  @ParameterizedTest
+  @MethodSource("bulkInsertTypeParams")
+  public void testMultipleDataSourceWrites(boolean populateMetaFields) throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
     HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
     int partitionCounter = 0;

@@ -152,7 +165,7 @@ public class TestHoodieDataSourceInternalBatchWrite extends
       String instantTime = "00" + i;
       // init writer
       HoodieDataSourceInternalBatchWrite dataSourceInternalBatchWrite =
-          new HoodieDataSourceInternalBatchWrite(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, Collections.EMPTY_MAP, false);
+          new HoodieDataSourceInternalBatchWrite(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, Collections.EMPTY_MAP, populateMetaFields, false);
       List<HoodieWriterCommitMessage> commitMessages = new ArrayList<>();
       Dataset<Row> totalInputRows = null;
       DataWriter<InternalRow> writer = dataSourceInternalBatchWrite.createBatchWriterFactory(null).createWriter(partitionCounter++, RANDOM.nextLong());
@@ -176,18 +189,19 @@ public class TestHoodieDataSourceInternalBatchWrite extends
       dataSourceInternalBatchWrite.commit(commitMessages.toArray(new HoodieWriterCommitMessage[0]));
       metaClient.reloadActiveTimeline();
-      Dataset<Row> result = HoodieClientTestUtils.readCommit(basePath, sqlContext, metaClient.getCommitTimeline(), instantTime);
+      Dataset<Row> result = HoodieClientTestUtils.readCommit(basePath, sqlContext, metaClient.getCommitTimeline(), instantTime, populateMetaFields);
       // verify output
-      assertOutput(totalInputRows, result, instantTime, Option.empty());
+      assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
       assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());
     }
   }

-  @Test
-  public void testLargeWrites() throws Exception {
+  @ParameterizedTest
+  @MethodSource("bulkInsertTypeParams")
+  public void testLargeWrites(boolean populateMetaFields) throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
     HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
     int partitionCounter = 0;

@@ -196,7 +210,7 @@ public class TestHoodieDataSourceInternalBatchWrite extends
       String instantTime = "00" + i;
       // init writer
       HoodieDataSourceInternalBatchWrite dataSourceInternalBatchWrite =
-          new HoodieDataSourceInternalBatchWrite(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, Collections.EMPTY_MAP, false);
+          new HoodieDataSourceInternalBatchWrite(instantTime, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, Collections.EMPTY_MAP, populateMetaFields, false);
       List<HoodieWriterCommitMessage> commitMessages = new ArrayList<>();
       Dataset<Row> totalInputRows = null;
       DataWriter<InternalRow> writer = dataSourceInternalBatchWrite.createBatchWriterFactory(null).createWriter(partitionCounter++, RANDOM.nextLong());
@@ -220,10 +234,11 @@ public class TestHoodieDataSourceInternalBatchWrite extends
       dataSourceInternalBatchWrite.commit(commitMessages.toArray(new HoodieWriterCommitMessage[0]));
       metaClient.reloadActiveTimeline();
-      Dataset<Row> result = HoodieClientTestUtils.readCommit(basePath, sqlContext, metaClient.getCommitTimeline(), instantTime);
+      Dataset<Row> result = HoodieClientTestUtils.readCommit(basePath, sqlContext, metaClient.getCommitTimeline(), instantTime,
+          populateMetaFields);
       // verify output
-      assertOutput(totalInputRows, result, instantTime, Option.empty());
+      assertOutput(totalInputRows, result, instantTime, Option.empty(), populateMetaFields);
       assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());
     }
   }
@@ -234,15 +249,16 @@ public class TestHoodieDataSourceInternalBatchWrite extends
    * abort batch2
    * verify that only records from batch1 are available to read
    */
-  @Test
-  public void testAbort() throws Exception {
+  @ParameterizedTest
+  @MethodSource("bulkInsertTypeParams")
+  public void testAbort(boolean populateMetaFields) throws Exception {
     // init config and table
-    HoodieWriteConfig cfg = getConfigBuilder(basePath).build();
+    HoodieWriteConfig cfg = getWriteConfig(populateMetaFields);
     HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
     String instantTime0 = "00" + 0;
     // init writer
     HoodieDataSourceInternalBatchWrite dataSourceInternalBatchWrite =
-        new HoodieDataSourceInternalBatchWrite(instantTime0, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, Collections.EMPTY_MAP, false);
+        new HoodieDataSourceInternalBatchWrite(instantTime0, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, Collections.EMPTY_MAP, populateMetaFields, false);
     DataWriter<InternalRow> writer = dataSourceInternalBatchWrite.createBatchWriterFactory(null).createWriter(0, RANDOM.nextLong());

     List<String> partitionPaths = Arrays.asList(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS);
@@ -274,13 +290,13 @@ public class TestHoodieDataSourceInternalBatchWrite extends
     metaClient.reloadActiveTimeline();
     Dataset<Row> result = HoodieClientTestUtils.read(jsc, basePath, sqlContext, metaClient.getFs(), partitionPathsAbs.toArray(new String[0]));
     // verify rows
-    assertOutput(totalInputRows, result, instantTime0, Option.empty());
+    assertOutput(totalInputRows, result, instantTime0, Option.empty(), populateMetaFields);
     assertWriteStatuses(commitMessages.get(0).getWriteStatuses(), batches, size, Option.empty(), Option.empty());

     // 2nd batch. abort in the end
     String instantTime1 = "00" + 1;
     dataSourceInternalBatchWrite =
-        new HoodieDataSourceInternalBatchWrite(instantTime1, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, Collections.EMPTY_MAP, false);
+        new HoodieDataSourceInternalBatchWrite(instantTime1, cfg, STRUCT_TYPE, sqlContext.sparkSession(), hadoopConf, Collections.EMPTY_MAP, populateMetaFields, false);
     writer = dataSourceInternalBatchWrite.createBatchWriterFactory(null).createWriter(1, RANDOM.nextLong());

     for (int j = 0; j < batches; j++) {
@@ -298,7 +314,7 @@ public class TestHoodieDataSourceInternalBatchWrite extends
     result = HoodieClientTestUtils.read(jsc, basePath, sqlContext, metaClient.getFs(), partitionPathsAbs.toArray(new String[0]));
     // verify rows
     // only rows from the first batch should be present
-    assertOutput(totalInputRows, result, instantTime0, Option.empty());
+    assertOutput(totalInputRows, result, instantTime0, Option.empty(), populateMetaFields);
   }

   private void writeRows(Dataset<Row> inputRows, DataWriter<InternalRow> writer) throws Exception {