[HUDI-3993] Replacing UDF in Bulk Insert w/ RDD transformation (#5470)

Alexey Kudinkin
2022-07-21 06:20:47 -07:00
committed by GitHub
parent c7fe3fd01d
commit a33bdd32e3
41 changed files with 1180 additions and 870 deletions

View File

@@ -55,6 +55,10 @@ public class HoodieInternalWriteStatus implements Serializable {
this.random = new Random(RANDOM_SEED); this.random = new Random(RANDOM_SEED);
} }
public boolean isTrackingSuccessfulWrites() {
return trackSuccessRecords;
}
public void markSuccess(String recordKey) { public void markSuccess(String recordKey) {
if (trackSuccessRecords) { if (trackSuccessRecords) {
this.successRecordKeys.add(recordKey); this.successRecordKeys.add(recordKey);

View File

@@ -56,7 +56,7 @@ public class NonpartitionedAvroKeyGenerator extends BaseKeyGenerator {
// for backward compatibility, we need to use the right format according to the number of record key fields // for backward compatibility, we need to use the right format according to the number of record key fields
// 1. if there is only one record key field, the format of record key is just "<value>" // 1. if there is only one record key field, the format of record key is just "<value>"
// 2. if there are multiple record key fields, the format is "<field1>:<value1>,<field2>:<value2>,..." // 2. if there are multiple record key fields, the format is "<field1>:<value1>,<field2>:<value2>,..."
if (getRecordKeyFieldNames().size() == 1) { if (getRecordKeyFields().size() == 1) {
return KeyGenUtils.getRecordKey(record, getRecordKeyFields().get(0), isConsistentLogicalTimestampEnabled()); return KeyGenUtils.getRecordKey(record, getRecordKeyFields().get(0), isConsistentLogicalTimestampEnabled());
} }
return KeyGenUtils.getRecordKey(record, getRecordKeyFields(), isConsistentLogicalTimestampEnabled()); return KeyGenUtils.getRecordKey(record, getRecordKeyFields(), isConsistentLogicalTimestampEnabled());
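Purely for illustration (not part of this commit), the two record-key formats described in the comment above, for a hypothetical record with key fields "id" and "region":

```java
public class KeyFormatSketch {
  public static void main(String[] args) {
    // 1. single record key field -> the key is just the value
    String singleFieldKey = "id-001";
    // 2. multiple record key fields -> "<field1>:<value1>,<field2>:<value2>,..."
    String compositeKey = String.join(",", "id:id-001", "region:us-west");
    System.out.println(singleFieldKey);
    System.out.println(compositeKey);
  }
}
```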

View File

@@ -24,31 +24,66 @@ import org.apache.spark.sql.catalyst.util.ArrayData;
import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.catalyst.util.MapData;
import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.Decimal; import org.apache.spark.sql.types.Decimal;
import org.apache.spark.sql.types.StringType$;
import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.CalendarInterval;
import org.apache.spark.unsafe.types.UTF8String; import org.apache.spark.unsafe.types.UTF8String;
import java.util.Arrays;
/** /**
* Internal Row implementation for Hoodie Row. It wraps an {@link InternalRow} and keeps meta columns locally. But the {@link InternalRow} * Hudi internal implementation of the {@link InternalRow} allowing to extend arbitrary
* does include the meta columns as well just that {@link HoodieInternalRow} will intercept queries for meta columns and serve from its * {@link InternalRow} overlaying Hudi-internal meta-fields on top of it.
* copy rather than fetching from {@link InternalRow}. *
* Capable of overlaying meta-fields in both cases: whether original {@link #row} contains
* meta columns or not. This allows to handle following use-cases allowing to avoid any
* manipulation (reshuffling) of the source row, by simply creating new instance
* of {@link HoodieInternalRow} with all the meta-values provided
*
* <ul>
* <li>When meta-fields need to be prepended to the source {@link InternalRow}</li>
* <li>When meta-fields need to be updated w/in the source {@link InternalRow}
* ({@link org.apache.spark.sql.catalyst.expressions.UnsafeRow} currently does not
* allow in-place updates due to its memory layout)</li>
* </ul>
*/ */
public class HoodieInternalRow extends InternalRow { public class HoodieInternalRow extends InternalRow {
private String commitTime; /**
private String commitSeqNumber; * Collection of meta-fields as defined by {@link HoodieRecord#HOODIE_META_COLUMNS}
private String recordKey; */
private String partitionPath; private final UTF8String[] metaFields;
private String fileName; private final InternalRow row;
private InternalRow row;
/**
* Specifies whether source {@link #row} contains meta-fields
*/
private final boolean containsMetaFields;
public HoodieInternalRow(UTF8String commitTime,
UTF8String commitSeqNumber,
UTF8String recordKey,
UTF8String partitionPath,
UTF8String fileName,
InternalRow row,
boolean containsMetaFields) {
this.metaFields = new UTF8String[] {
commitTime,
commitSeqNumber,
recordKey,
partitionPath,
fileName
};
public HoodieInternalRow(String commitTime, String commitSeqNumber, String recordKey, String partitionPath,
String fileName, InternalRow row) {
this.commitTime = commitTime;
this.commitSeqNumber = commitSeqNumber;
this.recordKey = recordKey;
this.partitionPath = partitionPath;
this.fileName = fileName;
this.row = row; this.row = row;
this.containsMetaFields = containsMetaFields;
}
private HoodieInternalRow(UTF8String[] metaFields,
InternalRow row,
boolean containsMetaFields) {
this.metaFields = metaFields;
this.row = row;
this.containsMetaFields = containsMetaFields;
} }
@Override @Override
@@ -57,187 +92,153 @@ public class HoodieInternalRow extends InternalRow {
} }
@Override @Override
public void setNullAt(int i) { public void setNullAt(int ordinal) {
if (i < HoodieRecord.HOODIE_META_COLUMNS.size()) { if (ordinal < metaFields.length) {
switch (i) { metaFields[ordinal] = null;
case 0: {
this.commitTime = null;
break;
}
case 1: {
this.commitSeqNumber = null;
break;
}
case 2: {
this.recordKey = null;
break;
}
case 3: {
this.partitionPath = null;
break;
}
case 4: {
this.fileName = null;
break;
}
default: throw new IllegalArgumentException("Not expected");
}
} else { } else {
row.setNullAt(i); row.setNullAt(rebaseOrdinal(ordinal));
} }
} }
@Override @Override
public void update(int i, Object value) { public void update(int ordinal, Object value) {
if (i < HoodieRecord.HOODIE_META_COLUMNS.size()) { if (ordinal < metaFields.length) {
switch (i) { if (value instanceof UTF8String) {
case 0: { metaFields[ordinal] = (UTF8String) value;
this.commitTime = value.toString(); } else if (value instanceof String) {
break; metaFields[ordinal] = UTF8String.fromString((String) value);
} } else {
case 1: { throw new IllegalArgumentException(
this.commitSeqNumber = value.toString(); String.format("Could not update the row at (%d) with value of type (%s), either UTF8String or String are expected", ordinal, value.getClass().getSimpleName()));
break;
}
case 2: {
this.recordKey = value.toString();
break;
}
case 3: {
this.partitionPath = value.toString();
break;
}
case 4: {
this.fileName = value.toString();
break;
}
default: throw new IllegalArgumentException("Not expected");
} }
} else { } else {
row.update(i, value); row.update(rebaseOrdinal(ordinal), value);
}
}
private String getMetaColumnVal(int ordinal) {
switch (ordinal) {
case 0: {
return commitTime;
}
case 1: {
return commitSeqNumber;
}
case 2: {
return recordKey;
}
case 3: {
return partitionPath;
}
case 4: {
return fileName;
}
default: throw new IllegalArgumentException("Not expected");
} }
} }
@Override @Override
public boolean isNullAt(int ordinal) { public boolean isNullAt(int ordinal) {
if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) { if (ordinal < metaFields.length) {
return null == getMetaColumnVal(ordinal); return metaFields[ordinal] == null;
} }
return row.isNullAt(ordinal); return row.isNullAt(rebaseOrdinal(ordinal));
}
@Override
public boolean getBoolean(int ordinal) {
return row.getBoolean(ordinal);
}
@Override
public byte getByte(int ordinal) {
return row.getByte(ordinal);
}
@Override
public short getShort(int ordinal) {
return row.getShort(ordinal);
}
@Override
public int getInt(int ordinal) {
return row.getInt(ordinal);
}
@Override
public long getLong(int ordinal) {
return row.getLong(ordinal);
}
@Override
public float getFloat(int ordinal) {
return row.getFloat(ordinal);
}
@Override
public double getDouble(int ordinal) {
return row.getDouble(ordinal);
}
@Override
public Decimal getDecimal(int ordinal, int precision, int scale) {
return row.getDecimal(ordinal, precision, scale);
} }
@Override @Override
public UTF8String getUTF8String(int ordinal) { public UTF8String getUTF8String(int ordinal) {
if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) { if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) {
return UTF8String.fromBytes(getMetaColumnVal(ordinal).getBytes()); return metaFields[ordinal];
} }
return row.getUTF8String(ordinal); return row.getUTF8String(rebaseOrdinal(ordinal));
}
@Override
public String getString(int ordinal) {
if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) {
return new String(getMetaColumnVal(ordinal).getBytes());
}
return row.getString(ordinal);
}
@Override
public byte[] getBinary(int ordinal) {
return row.getBinary(ordinal);
}
@Override
public CalendarInterval getInterval(int ordinal) {
return row.getInterval(ordinal);
}
@Override
public InternalRow getStruct(int ordinal, int numFields) {
return row.getStruct(ordinal, numFields);
}
@Override
public ArrayData getArray(int ordinal) {
return row.getArray(ordinal);
}
@Override
public MapData getMap(int ordinal) {
return row.getMap(ordinal);
} }
@Override @Override
public Object get(int ordinal, DataType dataType) { public Object get(int ordinal, DataType dataType) {
if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) { if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) {
return UTF8String.fromBytes(getMetaColumnVal(ordinal).getBytes()); validateMetaFieldDataType(dataType);
return metaFields[ordinal];
} }
return row.get(ordinal, dataType); return row.get(rebaseOrdinal(ordinal), dataType);
}
@Override
public boolean getBoolean(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, Boolean.class);
return row.getBoolean(rebaseOrdinal(ordinal));
}
@Override
public byte getByte(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, Byte.class);
return row.getByte(rebaseOrdinal(ordinal));
}
@Override
public short getShort(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, Short.class);
return row.getShort(rebaseOrdinal(ordinal));
}
@Override
public int getInt(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, Integer.class);
return row.getInt(rebaseOrdinal(ordinal));
}
@Override
public long getLong(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, Long.class);
return row.getLong(rebaseOrdinal(ordinal));
}
@Override
public float getFloat(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, Float.class);
return row.getFloat(rebaseOrdinal(ordinal));
}
@Override
public double getDouble(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, Double.class);
return row.getDouble(rebaseOrdinal(ordinal));
}
@Override
public Decimal getDecimal(int ordinal, int precision, int scale) {
ruleOutMetaFieldsAccess(ordinal, Decimal.class);
return row.getDecimal(rebaseOrdinal(ordinal), precision, scale);
}
@Override
public byte[] getBinary(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, Byte[].class);
return row.getBinary(rebaseOrdinal(ordinal));
}
@Override
public CalendarInterval getInterval(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, CalendarInterval.class);
return row.getInterval(rebaseOrdinal(ordinal));
}
@Override
public InternalRow getStruct(int ordinal, int numFields) {
ruleOutMetaFieldsAccess(ordinal, InternalRow.class);
return row.getStruct(rebaseOrdinal(ordinal), numFields);
}
@Override
public ArrayData getArray(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, ArrayData.class);
return row.getArray(rebaseOrdinal(ordinal));
}
@Override
public MapData getMap(int ordinal) {
ruleOutMetaFieldsAccess(ordinal, MapData.class);
return row.getMap(rebaseOrdinal(ordinal));
} }
@Override @Override
public InternalRow copy() { public InternalRow copy() {
return new HoodieInternalRow(commitTime, commitSeqNumber, recordKey, partitionPath, fileName, row.copy()); return new HoodieInternalRow(Arrays.copyOf(metaFields, metaFields.length), row.copy(), containsMetaFields);
}
private int rebaseOrdinal(int ordinal) {
// NOTE: In cases when source row does not contain meta fields, we will have to
// rebase ordinal onto its indexes
return containsMetaFields ? ordinal : ordinal - metaFields.length;
}
private void validateMetaFieldDataType(DataType dataType) {
if (!dataType.sameType(StringType$.MODULE$)) {
throw new ClassCastException(String.format("Can not cast meta-field of type UTF8String to %s", dataType.simpleString()));
}
}
private void ruleOutMetaFieldsAccess(int ordinal, Class<?> expectedDataType) {
if (ordinal < metaFields.length) {
throw new ClassCastException(String.format("Can not cast meta-field of type UTF8String at (%d) as %s", ordinal, expectedDataType.getName()));
}
} }
} }
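Below is a minimal usage sketch (not part of the commit) of the reworked HoodieInternalRow: it overlays the five meta-fields on a source row that does not carry them itself (containsMetaFields = false), so meta ordinals 0-4 are served from the local UTF8String array while data ordinals are rebased onto the wrapped row. The package and sample values are assumptions taken from the test changes further down in this diff.

```java
import org.apache.hudi.client.model.HoodieInternalRow;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.unsafe.types.UTF8String;

public class HoodieInternalRowSketch {
  public static void main(String[] args) {
    // Source row holds only data columns (no meta-fields): a single int column here
    InternalRow dataRow = new GenericInternalRow(new Object[] {42});

    InternalRow overlaid = new HoodieInternalRow(
        UTF8String.fromString("20220721062047"),      // commit time
        UTF8String.fromString("20220721062047_1_1"),  // commit seq no (format illustrative)
        UTF8String.fromString("key-001"),             // record key
        UTF8String.fromString("2022/07/21"),          // partition path
        UTF8String.fromString("file-1.parquet"),      // file name
        dataRow,
        false);                                       // source row has no meta columns

    // Meta ordinals 0..4 are served from the locally kept meta-fields ...
    System.out.println(overlaid.getUTF8String(2));    // key-001
    // ... while ordinal 5 is rebased to ordinal 0 of the wrapped row
    System.out.println(overlaid.getInt(5));           // 42
  }
}
```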

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.io.storage.row; package org.apache.hudi.io.storage.row;
import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.unsafe.types.UTF8String;
import java.io.IOException; import java.io.IOException;
@@ -37,7 +38,7 @@ public interface HoodieInternalRowFileWriter {
* *
* @throws IOException on any exception while writing. * @throws IOException on any exception while writing.
*/ */
void writeRow(String key, InternalRow row) throws IOException; void writeRow(UTF8String key, InternalRow row) throws IOException;
/** /**
* Writes an {@link InternalRow} to the HoodieInternalRowFileWriter. * Writes an {@link InternalRow} to the HoodieInternalRowFileWriter.

View File

@@ -22,6 +22,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hudi.io.storage.HoodieParquetConfig; import org.apache.hudi.io.storage.HoodieParquetConfig;
import org.apache.hudi.io.storage.HoodieBaseParquetWriter; import org.apache.hudi.io.storage.HoodieBaseParquetWriter;
import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.unsafe.types.UTF8String;
import java.io.IOException; import java.io.IOException;
@@ -41,7 +42,7 @@ public class HoodieInternalRowParquetWriter extends HoodieBaseParquetWriter<Inte
} }
@Override @Override
public void writeRow(String key, InternalRow row) throws IOException { public void writeRow(UTF8String key, InternalRow row) throws IOException {
super.write(row); super.write(row);
writeSupport.add(key); writeSupport.add(key);
} }

View File

@@ -25,11 +25,11 @@ import org.apache.hudi.common.model.HoodiePartitionMetadata;
import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.model.IOType; import org.apache.hudi.common.model.IOType;
import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.HoodieTimer;
import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.exception.HoodieInsertException; import org.apache.hudi.exception.HoodieInsertException;
import org.apache.hudi.hadoop.CachingPath;
import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.marker.WriteMarkersFactory; import org.apache.hudi.table.marker.WriteMarkersFactory;
@@ -39,10 +39,12 @@ import org.apache.log4j.LogManager;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
/** /**
* Create handle with InternalRow for datasource implementation of bulk insert. * Create handle with InternalRow for datasource implementation of bulk insert.
@@ -50,38 +52,61 @@ import java.util.concurrent.atomic.AtomicLong;
public class HoodieRowCreateHandle implements Serializable { public class HoodieRowCreateHandle implements Serializable {
private static final long serialVersionUID = 1L; private static final long serialVersionUID = 1L;
private static final Logger LOG = LogManager.getLogger(HoodieRowCreateHandle.class);
private static final AtomicLong SEQGEN = new AtomicLong(1);
private final String instantTime; private static final Logger LOG = LogManager.getLogger(HoodieRowCreateHandle.class);
private final int taskPartitionId; private static final AtomicLong GLOBAL_SEQ_NO = new AtomicLong(1);
private final long taskId;
private final long taskEpochId; private static final Integer RECORD_KEY_META_FIELD_ORD =
HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.RECORD_KEY_METADATA_FIELD);
private static final Integer PARTITION_PATH_META_FIELD_ORD =
HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD);
private final HoodieTable table; private final HoodieTable table;
private final HoodieWriteConfig writeConfig; private final HoodieWriteConfig writeConfig;
protected final HoodieInternalRowFileWriter fileWriter;
private final String partitionPath; private final String partitionPath;
private final Path path; private final Path path;
private final String fileId; private final String fileId;
private final FileSystem fs;
protected final HoodieInternalWriteStatus writeStatus; private final boolean populateMetaFields;
private final UTF8String fileName;
private final UTF8String commitTime;
private final Function<Long, String> seqIdGenerator;
private final HoodieTimer currTimer; private final HoodieTimer currTimer;
public HoodieRowCreateHandle(HoodieTable table, HoodieWriteConfig writeConfig, String partitionPath, String fileId, protected final HoodieInternalRowFileWriter fileWriter;
String instantTime, int taskPartitionId, long taskId, long taskEpochId, protected final HoodieInternalWriteStatus writeStatus;
StructType structType) {
public HoodieRowCreateHandle(HoodieTable table,
HoodieWriteConfig writeConfig,
String partitionPath,
String fileId,
String instantTime,
int taskPartitionId,
long taskId,
long taskEpochId,
StructType structType,
boolean populateMetaFields) {
this.partitionPath = partitionPath; this.partitionPath = partitionPath;
this.table = table; this.table = table;
this.writeConfig = writeConfig; this.writeConfig = writeConfig;
this.instantTime = instantTime;
this.taskPartitionId = taskPartitionId;
this.taskId = taskId;
this.taskEpochId = taskEpochId;
this.fileId = fileId; this.fileId = fileId;
this.currTimer = new HoodieTimer();
this.currTimer.startTimer(); this.currTimer = new HoodieTimer(true);
this.fs = table.getMetaClient().getFs();
this.path = makeNewPath(partitionPath); FileSystem fs = table.getMetaClient().getFs();
String writeToken = getWriteToken(taskPartitionId, taskId, taskEpochId);
String fileName = FSUtils.makeBaseFileName(instantTime, writeToken, this.fileId, table.getBaseFileExtension());
this.path = makeNewPath(fs, partitionPath, fileName, writeConfig);
this.populateMetaFields = populateMetaFields;
this.fileName = UTF8String.fromString(path.getName());
this.commitTime = UTF8String.fromString(instantTime);
this.seqIdGenerator = (id) -> HoodieRecord.generateSequenceId(instantTime, taskPartitionId, id);
this.writeStatus = new HoodieInternalWriteStatus(!table.getIndex().isImplicitWithStorage(), this.writeStatus = new HoodieInternalWriteStatus(!table.getIndex().isImplicitWithStorage(),
writeConfig.getWriteStatusFailureFraction()); writeConfig.getWriteStatusFailureFraction());
writeStatus.setPartitionPath(partitionPath); writeStatus.setPartitionPath(partitionPath);
@@ -96,7 +121,7 @@ public class HoodieRowCreateHandle implements Serializable {
FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath), FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath),
table.getPartitionMetafileFormat()); table.getPartitionMetafileFormat());
partitionMetadata.trySave(taskPartitionId); partitionMetadata.trySave(taskPartitionId);
createMarkerFile(partitionPath, FSUtils.makeBaseFileName(this.instantTime, getWriteToken(), this.fileId, table.getBaseFileExtension())); createMarkerFile(partitionPath, fileName, instantTime, table, writeConfig);
this.fileWriter = createNewFileWriter(path, table, writeConfig, structType); this.fileWriter = createNewFileWriter(path, table, writeConfig, structType);
} catch (IOException e) { } catch (IOException e) {
throw new HoodieInsertException("Failed to initialize file writer for path " + path, e); throw new HoodieInsertException("Failed to initialize file writer for path " + path, e);
@@ -108,21 +133,42 @@ public class HoodieRowCreateHandle implements Serializable {
* Writes an {@link InternalRow} to the underlying HoodieInternalRowFileWriter. Before writing, value for meta columns are computed as required * Writes an {@link InternalRow} to the underlying HoodieInternalRowFileWriter. Before writing, value for meta columns are computed as required
* and wrapped in {@link HoodieInternalRow}. {@link HoodieInternalRow} is what gets written to HoodieInternalRowFileWriter. * and wrapped in {@link HoodieInternalRow}. {@link HoodieInternalRow} is what gets written to HoodieInternalRowFileWriter.
* *
* @param record instance of {@link InternalRow} that needs to be written to the fileWriter. * @param row instance of {@link InternalRow} that needs to be written to the fileWriter.
* @throws IOException * @throws IOException
*/ */
public void write(InternalRow record) throws IOException { public void write(InternalRow row) throws IOException {
try { try {
final String partitionPath = String.valueOf(record.getUTF8String(HoodieRecord.PARTITION_PATH_META_FIELD_POS)); // NOTE: PLEASE READ THIS CAREFULLY BEFORE MODIFYING
final String seqId = HoodieRecord.generateSequenceId(instantTime, taskPartitionId, SEQGEN.getAndIncrement()); // This code lays in the hot-path, and substantial caution should be
final String recordKey = String.valueOf(record.getUTF8String(HoodieRecord.RECORD_KEY_META_FIELD_POS)); // exercised making changes to it to minimize amount of excessive:
HoodieInternalRow internalRow = new HoodieInternalRow(instantTime, seqId, recordKey, partitionPath, path.getName(), // - Conversions b/w Spark internal (low-level) types and JVM native ones (like
record); // [[UTF8String]] and [[String]])
// - Repeated computations (for ex, converting file-path to [[UTF8String]] over and
// over again)
UTF8String recordKey = row.getUTF8String(RECORD_KEY_META_FIELD_ORD);
InternalRow updatedRow;
// In cases when no meta-fields need to be added we simply relay provided row to
// the writer as is
if (!populateMetaFields) {
updatedRow = row;
} else {
UTF8String partitionPath = row.getUTF8String(PARTITION_PATH_META_FIELD_ORD);
// This is the only meta-field that is generated dynamically, hence conversion b/w
// [[String]] and [[UTF8String]] is unavoidable
UTF8String seqId = UTF8String.fromString(seqIdGenerator.apply(GLOBAL_SEQ_NO.getAndIncrement()));
updatedRow = new HoodieInternalRow(commitTime, seqId, recordKey,
partitionPath, fileName, row, true);
}
try { try {
fileWriter.writeRow(recordKey, internalRow); fileWriter.writeRow(recordKey, updatedRow);
writeStatus.markSuccess(recordKey); // NOTE: To avoid conversion on the hot-path we only convert [[UTF8String]] into [[String]]
// in cases when successful records' writes are being tracked
writeStatus.markSuccess(writeStatus.isTrackingSuccessfulWrites() ? recordKey.toString() : null);
} catch (Throwable t) { } catch (Throwable t) {
writeStatus.markFailure(recordKey, t); writeStatus.markFailure(recordKey.toString(), t);
} }
} catch (Throwable ge) { } catch (Throwable ge) {
writeStatus.setGlobalError(ge); writeStatus.setGlobalError(ge);
@@ -168,7 +214,7 @@ public class HoodieRowCreateHandle implements Serializable {
return path.getName(); return path.getName();
} }
private Path makeNewPath(String partitionPath) { private static Path makeNewPath(FileSystem fs, String partitionPath, String fileName, HoodieWriteConfig writeConfig) {
Path path = FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath); Path path = FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath);
try { try {
if (!fs.exists(path)) { if (!fs.exists(path)) {
@@ -177,9 +223,7 @@ public class HoodieRowCreateHandle implements Serializable {
} catch (IOException e) { } catch (IOException e) {
throw new HoodieIOException("Failed to make dir " + path, e); throw new HoodieIOException("Failed to make dir " + path, e);
} }
HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig(); return new CachingPath(path.toString(), fileName);
return new Path(path.toString(), FSUtils.makeBaseFileName(instantTime, getWriteToken(), fileId,
tableConfig.getBaseFileFormat().getFileExtension()));
} }
/** /**
@@ -187,12 +231,17 @@ public class HoodieRowCreateHandle implements Serializable {
* *
* @param partitionPath Partition path * @param partitionPath Partition path
*/ */
private void createMarkerFile(String partitionPath, String dataFileName) { private static void createMarkerFile(String partitionPath,
String dataFileName,
String instantTime,
HoodieTable<?, ?, ?, ?> table,
HoodieWriteConfig writeConfig) {
WriteMarkersFactory.get(writeConfig.getMarkersType(), table, instantTime) WriteMarkersFactory.get(writeConfig.getMarkersType(), table, instantTime)
.create(partitionPath, dataFileName, IOType.CREATE); .create(partitionPath, dataFileName, IOType.CREATE);
} }
private String getWriteToken() { // TODO extract to utils
private static String getWriteToken(int taskPartitionId, long taskId, long taskEpochId) {
return taskPartitionId + "-" + taskId + "-" + taskEpochId; return taskPartitionId + "-" + taskId + "-" + taskEpochId;
} }

View File

@@ -1,64 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.io.storage.row;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.table.HoodieTable;
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.StructType;
import java.io.IOException;
/**
* RowCreateHandle to be used when meta fields are disabled.
*/
public class HoodieRowCreateHandleWithoutMetaFields extends HoodieRowCreateHandle {
public HoodieRowCreateHandleWithoutMetaFields(HoodieTable table, HoodieWriteConfig writeConfig, String partitionPath, String fileId, String instantTime,
int taskPartitionId, long taskId, long taskEpochId, StructType structType) {
super(table, writeConfig, partitionPath, fileId, instantTime, taskPartitionId, taskId, taskEpochId, structType);
}
/**
* Write the incoming InternalRow as is.
*
* @param record instance of {@link InternalRow} that needs to be written to the fileWriter.
* @throws IOException
*/
@Override
public void write(InternalRow record) throws IOException {
try {
fileWriter.writeRow(record);
writeStatus.markSuccess();
} catch (Throwable ge) {
writeStatus.setGlobalError(ge);
throw new HoodieException("Exception thrown while writing spark InternalRows to file ", ge);
}
}
protected HoodieInternalRowFileWriter createNewFileWriter(
Path path, HoodieTable hoodieTable, HoodieWriteConfig config, StructType schema)
throws IOException {
return HoodieInternalRowFileWriterFactory.getInternalRowFileWriterWithoutMetaFields(
path, hoodieTable, config, schema);
}
}

View File

@@ -25,6 +25,7 @@ import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.parquet.hadoop.api.WriteSupport; import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport; import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;
import java.util.HashMap; import java.util.HashMap;
@@ -38,10 +39,11 @@ import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_
*/ */
public class HoodieRowParquetWriteSupport extends ParquetWriteSupport { public class HoodieRowParquetWriteSupport extends ParquetWriteSupport {
private Configuration hadoopConf; private final Configuration hadoopConf;
private BloomFilter bloomFilter; private final BloomFilter bloomFilter;
private String minRecordKey;
private String maxRecordKey; private UTF8String minRecordKey;
private UTF8String maxRecordKey;
public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, BloomFilter bloomFilter, HoodieWriteConfig writeConfig) { public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, BloomFilter bloomFilter, HoodieWriteConfig writeConfig) {
super(); super();
@@ -63,8 +65,8 @@ public class HoodieRowParquetWriteSupport extends ParquetWriteSupport {
if (bloomFilter != null) { if (bloomFilter != null) {
extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString()); extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString());
if (minRecordKey != null && maxRecordKey != null) { if (minRecordKey != null && maxRecordKey != null) {
extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey); extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey.toString());
extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey); extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey.toString());
} }
if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) { if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) {
extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilter.getBloomFilterTypeCode().name()); extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilter.getBloomFilterTypeCode().name());
@@ -73,18 +75,18 @@ public class HoodieRowParquetWriteSupport extends ParquetWriteSupport {
return new WriteSupport.FinalizedWriteContext(extraMetaData); return new WriteSupport.FinalizedWriteContext(extraMetaData);
} }
public void add(String recordKey) { public void add(UTF8String recordKey) {
this.bloomFilter.add(recordKey); this.bloomFilter.add(recordKey.getBytes());
if (minRecordKey != null) {
minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey; if (minRecordKey == null || minRecordKey.compareTo(recordKey) < 0) {
} else { // NOTE: [[clone]] is performed here (rather than [[copy]]) to only copy underlying buffer in
minRecordKey = recordKey; // cases when [[UTF8String]] is pointing into a buffer storing the whole containing record,
// and simply do a pass over when it holds a (immutable) buffer holding just the string
minRecordKey = recordKey.clone();
} }
if (maxRecordKey != null) { if (maxRecordKey == null || maxRecordKey.compareTo(recordKey) > 0) {
maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? maxRecordKey : recordKey; maxRecordKey = recordKey.clone();
} else {
maxRecordKey = recordKey;
} }
} }
} }
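A compact sketch (assumptions, not commit code) of the retain-a-copy pattern used above: incoming keys arrive as UTF8String and are cloned when retained, so the stored copy does not pin the buffer backing the whole source row. The comparisons below are written so that min holds the smallest and max the largest key.

```java
import org.apache.spark.unsafe.types.UTF8String;

public class MinMaxKeySketch {
  private UTF8String minRecordKey;
  private UTF8String maxRecordKey;

  public void add(UTF8String recordKey) {
    if (minRecordKey == null || minRecordKey.compareTo(recordKey) > 0) {
      minRecordKey = recordKey.clone();   // copy only the key bytes, not the whole row buffer
    }
    if (maxRecordKey == null || maxRecordKey.compareTo(recordKey) < 0) {
      maxRecordKey = recordKey.clone();
    }
  }

  public static void main(String[] args) {
    MinMaxKeySketch sketch = new MinMaxKeySketch();
    sketch.add(UTF8String.fromString("key-002"));
    sketch.add(UTF8String.fromString("key-001"));
    System.out.println(sketch.minRecordKey + " .. " + sketch.maxRecordKey);  // key-001 .. key-002
  }
}
```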

View File

@@ -18,26 +18,24 @@
package org.apache.hudi.keygen; package org.apache.hudi.keygen;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.ApiMaturityLevel; import org.apache.hudi.ApiMaturityLevel;
import org.apache.hudi.AvroConversionUtils; import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.PublicAPIMethod; import org.apache.hudi.PublicAPIMethod;
import org.apache.hudi.common.config.TypedProperties; import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieException;
import org.apache.avro.generic.GenericRecord;
import org.apache.spark.sql.Row; import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
import scala.Function1;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import scala.Function1;
/** /**
* Base class for the built-in key generators. Contains methods structured for * Base class for the built-in key generators. Contains methods structured for
* code reuse amongst them. * code reuse amongst them.
@@ -66,18 +64,32 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
@Override @Override
@PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
public String getRecordKey(Row row) { public String getRecordKey(Row row) {
// TODO avoid conversion to avro
// since converterFn is transient this will be repeatedly initialized over and over again
if (null == converterFn) { if (null == converterFn) {
converterFn = AvroConversionUtils.createConverterToAvro(row.schema(), STRUCT_NAME, NAMESPACE); converterFn = AvroConversionUtils.createConverterToAvro(row.schema(), STRUCT_NAME, NAMESPACE);
} }
return getKey(converterFn.apply(row)).getRecordKey(); return getKey(converterFn.apply(row)).getRecordKey();
} }
@Override
@PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
public String getRecordKey(InternalRow internalRow, StructType schema) {
try {
// TODO fix
buildFieldSchemaInfoIfNeeded(schema);
return RowKeyGeneratorHelper.getRecordKeyFromInternalRow(internalRow, getRecordKeyFields(), recordKeySchemaInfo, false);
} catch (Exception e) {
throw new HoodieException("Conversion of InternalRow to Row failed with exception", e);
}
}
/** /**
* Fetch partition path from {@link Row}. * Fetch partition path from {@link Row}.
* *
* @param row instance of {@link Row} from which partition path is requested * @param row instance of {@link Row} from which partition path is requested
* @return the partition path of interest from {@link Row}. * @return the partition path of interest from {@link Row}.
*/ */
@Override @Override
@PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING) @PublicAPIMethod(maturity = ApiMaturityLevel.EVOLVING)
public String getPartitionPath(Row row) { public String getPartitionPath(Row row) {
@@ -102,12 +114,13 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
return RowKeyGeneratorHelper.getPartitionPathFromInternalRow(internalRow, getPartitionPathFields(), return RowKeyGeneratorHelper.getPartitionPathFromInternalRow(internalRow, getPartitionPathFields(),
hiveStylePartitioning, partitionPathSchemaInfo); hiveStylePartitioning, partitionPathSchemaInfo);
} catch (Exception e) { } catch (Exception e) {
throw new HoodieIOException("Conversion of InternalRow to Row failed with exception " + e); throw new HoodieException("Conversion of InternalRow to Row failed with exception", e);
} }
} }
void buildFieldSchemaInfoIfNeeded(StructType structType) { void buildFieldSchemaInfoIfNeeded(StructType structType) {
if (this.structType == null) { if (this.structType == null) {
this.structType = structType;
getRecordKeyFields() getRecordKeyFields()
.stream().filter(f -> !f.isEmpty()) .stream().filter(f -> !f.isEmpty())
.forEach(f -> recordKeySchemaInfo.put(f, RowKeyGeneratorHelper.getFieldSchemaInfo(structType, f, true))); .forEach(f -> recordKeySchemaInfo.put(f, RowKeyGeneratorHelper.getFieldSchemaInfo(structType, f, true)));
@@ -115,7 +128,6 @@ public abstract class BuiltinKeyGenerator extends BaseKeyGenerator implements Sp
getPartitionPathFields().stream().filter(f -> !f.isEmpty()) getPartitionPathFields().stream().filter(f -> !f.isEmpty())
.forEach(f -> partitionPathSchemaInfo.put(f, RowKeyGeneratorHelper.getFieldSchemaInfo(structType, f, false))); .forEach(f -> partitionPathSchemaInfo.put(f, RowKeyGeneratorHelper.getFieldSchemaInfo(structType, f, false)));
} }
this.structType = structType;
} }
} }

View File

@@ -64,6 +64,12 @@ public class ComplexKeyGenerator extends BuiltinKeyGenerator {
return RowKeyGeneratorHelper.getRecordKeyFromRow(row, getRecordKeyFields(), recordKeySchemaInfo, true); return RowKeyGeneratorHelper.getRecordKeyFromRow(row, getRecordKeyFields(), recordKeySchemaInfo, true);
} }
@Override
public String getRecordKey(InternalRow internalRow, StructType schema) {
buildFieldSchemaInfoIfNeeded(schema);
return RowKeyGeneratorHelper.getRecordKeyFromInternalRow(internalRow, getRecordKeyFields(), recordKeySchemaInfo, true);
}
@Override @Override
public String getPartitionPath(Row row) { public String getPartitionPath(Row row) {
buildFieldSchemaInfoIfNeeded(row.schema()); buildFieldSchemaInfoIfNeeded(row.schema());

View File

@@ -64,6 +64,12 @@ public class GlobalDeleteKeyGenerator extends BuiltinKeyGenerator {
return RowKeyGeneratorHelper.getRecordKeyFromRow(row, getRecordKeyFields(), recordKeySchemaInfo, true); return RowKeyGeneratorHelper.getRecordKeyFromRow(row, getRecordKeyFields(), recordKeySchemaInfo, true);
} }
@Override
public String getRecordKey(InternalRow internalRow, StructType schema) {
buildFieldSchemaInfoIfNeeded(schema);
return RowKeyGeneratorHelper.getRecordKeyFromInternalRow(internalRow, getRecordKeyFields(), recordKeySchemaInfo, true);
}
@Override @Override
public String getPartitionPath(Row row) { public String getPartitionPath(Row row) {
return globalAvroDeleteKeyGenerator.getEmptyPartition(); return globalAvroDeleteKeyGenerator.getEmptyPartition();

View File

@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.keygen;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DateType;
import org.apache.spark.sql.types.TimestampType;
import java.sql.Timestamp;
import java.time.Instant;
import java.time.LocalDate;
public class RowKeyGenUtils {
/**
* Converts provided (raw) value extracted from the {@link InternalRow} object into a deserialized,
* JVM native format (for ex, converting {@code Long} into {@link Instant},
* {@code Integer} to {@link LocalDate}, etc)
*
* This method allows to avoid costly full-row deserialization sequence. Note, that this method
* should be maintained in sync w/
*
* <ol>
* <li>{@code RowEncoder#deserializerFor}, as well as</li>
* <li>{@code HoodieAvroUtils#convertValueForAvroLogicalTypes}</li>
* </ol>
*
* @param dataType target data-type of the given value
* @param value target value to be converted
*/
public static Object convertToLogicalDataType(DataType dataType, Object value) {
if (dataType instanceof TimestampType) {
// Provided value have to be [[Long]] in this case, representing micros since epoch
return new Timestamp((Long) value / 1000);
} else if (dataType instanceof DateType) {
// Provided value have to be [[Int]] in this case
return LocalDate.ofEpochDay((Integer) value);
}
return value;
}
}
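A quick usage sketch of convertToLogicalDataType as defined above: Spark keeps timestamps as micros-since-epoch (Long) and dates as days-since-epoch (Integer), and key generation wants the JVM-native values without decoding the whole row. The literal values are illustrative only.

```java
import org.apache.hudi.keygen.RowKeyGenUtils;
import org.apache.spark.sql.types.DataTypes;

public class LogicalTypeSketch {
  public static void main(String[] args) {
    // TimestampType: Long micros since epoch -> java.sql.Timestamp (millis = micros / 1000)
    Object ts = RowKeyGenUtils.convertToLogicalDataType(DataTypes.TimestampType, 1_658_384_447_000_000L);
    System.out.println(ts);

    // DateType: Integer days since epoch -> java.time.LocalDate (19194 -> 2022-07-21)
    Object date = RowKeyGenUtils.convertToLogicalDataType(DataTypes.DateType, 19194);
    System.out.println(date);
  }
}
```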

View File

@@ -39,18 +39,56 @@ import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.IntStream; import java.util.stream.IntStream;
import org.apache.spark.sql.types.StructType$;
import scala.Option; import scala.Option;
import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR; import static org.apache.hudi.keygen.KeyGenUtils.DEFAULT_PARTITION_PATH_SEPARATOR;
import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER; import static org.apache.hudi.keygen.KeyGenUtils.EMPTY_RECORDKEY_PLACEHOLDER;
import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH; import static org.apache.hudi.keygen.KeyGenUtils.HUDI_DEFAULT_PARTITION_PATH;
import static org.apache.hudi.keygen.KeyGenUtils.NULL_RECORDKEY_PLACEHOLDER; import static org.apache.hudi.keygen.KeyGenUtils.NULL_RECORDKEY_PLACEHOLDER;
import static org.apache.hudi.keygen.RowKeyGenUtils.convertToLogicalDataType;
/** /**
* Helper class to fetch fields from Row. * Helper class to fetch fields from Row.
*
* TODO cleanup
*/ */
@Deprecated
public class RowKeyGeneratorHelper { public class RowKeyGeneratorHelper {
public static String getRecordKeyFromInternalRow(InternalRow internalRow, List<String> recordKeyFields,
Map<String, Pair<List<Integer>, DataType>> recordKeyPositions, boolean prefixFieldName) {
AtomicBoolean keyIsNullOrEmpty = new AtomicBoolean(true);
String toReturn = recordKeyFields.stream().map(field -> {
String val = null;
List<Integer> fieldPositions = recordKeyPositions.get(field).getKey();
if (fieldPositions.size() == 1) { // simple field
Integer fieldPos = fieldPositions.get(0);
if (internalRow.isNullAt(fieldPos)) {
val = NULL_RECORDKEY_PLACEHOLDER;
} else {
DataType dataType = recordKeyPositions.get(field).getValue();
val = convertToLogicalDataType(dataType, internalRow.get(fieldPos, dataType)).toString();
if (val.isEmpty()) {
val = EMPTY_RECORDKEY_PLACEHOLDER;
} else {
keyIsNullOrEmpty.set(false);
}
}
} else { // nested fields
val = getNestedFieldVal(internalRow, recordKeyPositions.get(field)).toString();
if (!val.contains(NULL_RECORDKEY_PLACEHOLDER) && !val.contains(EMPTY_RECORDKEY_PLACEHOLDER)) {
keyIsNullOrEmpty.set(false);
}
}
return prefixFieldName ? (field + ":" + val) : val;
}).collect(Collectors.joining(","));
if (keyIsNullOrEmpty.get()) {
throw new HoodieKeyException("recordKey value: \"" + toReturn + "\" for fields: \"" + Arrays.toString(recordKeyFields.toArray()) + "\" cannot be null or empty.");
}
return toReturn;
}
/** /**
* Generates record key for the corresponding {@link Row}. * Generates record key for the corresponding {@link Row}.
* *
@@ -146,7 +184,7 @@ public class RowKeyGeneratorHelper {
if (fieldPos == -1 || internalRow.isNullAt(fieldPos)) { if (fieldPos == -1 || internalRow.isNullAt(fieldPos)) {
val = HUDI_DEFAULT_PARTITION_PATH; val = HUDI_DEFAULT_PARTITION_PATH;
} else { } else {
Object value = internalRow.get(fieldPos, dataType); Object value = convertToLogicalDataType(dataType, internalRow.get(fieldPos, dataType));
if (value == null || value.toString().isEmpty()) { if (value == null || value.toString().isEmpty()) {
val = HUDI_DEFAULT_PARTITION_PATH; val = HUDI_DEFAULT_PARTITION_PATH;
} else { } else {
@@ -231,6 +269,35 @@ public class RowKeyGeneratorHelper {
return toReturn; return toReturn;
} }
public static Object getNestedFieldVal(InternalRow internalRow, Pair<List<Integer>, DataType> positionsAndType) {
if (positionsAndType.getKey().size() == 1 && positionsAndType.getKey().get(0) == -1) {
return HUDI_DEFAULT_PARTITION_PATH;
}
int index = 0;
int totalCount = positionsAndType.getKey().size();
InternalRow valueToProcess = internalRow;
Object toReturn = null;
while (index < totalCount) {
if (valueToProcess.isNullAt(positionsAndType.getKey().get(index))) {
toReturn = NULL_RECORDKEY_PLACEHOLDER;
break;
}
if (index < totalCount - 1) {
valueToProcess = (InternalRow) valueToProcess.get(positionsAndType.getKey().get(index), StructType$.MODULE$.defaultConcreteType());
} else { // last index
if (valueToProcess.get(positionsAndType.getKey().get(index), positionsAndType.getValue()).toString().isEmpty()) {
toReturn = EMPTY_RECORDKEY_PLACEHOLDER;
break;
}
toReturn = valueToProcess.get(positionsAndType.getKey().get(index), positionsAndType.getValue());
}
index++;
}
return toReturn;
}
/** /**
* Generate the tree style positions for the field requested for as per the defined struct type. * Generate the tree style positions for the field requested for as per the defined struct type.
* *

View File

@@ -29,6 +29,8 @@ public interface SparkKeyGeneratorInterface extends KeyGeneratorInterface {
String getRecordKey(Row row); String getRecordKey(Row row);
String getRecordKey(InternalRow row, StructType schema);
String getPartitionPath(Row row); String getPartitionPath(Row row);
String getPartitionPath(InternalRow internalRow, StructType structType); String getPartitionPath(InternalRow internalRow, StructType structType);

View File

@@ -19,112 +19,17 @@
package org.apache.hudi.util; package org.apache.hudi.util;
import org.apache.spark.sql.types.ArrayType; import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.ByteType$;
import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.Decimal; import org.apache.spark.sql.types.Decimal;
import org.apache.spark.sql.types.DecimalType; import org.apache.spark.sql.types.DecimalType;
import org.apache.spark.sql.types.DoubleType$;
import org.apache.spark.sql.types.FloatType$;
import org.apache.spark.sql.types.IntegerType$;
import org.apache.spark.sql.types.LongType$;
import org.apache.spark.sql.types.MapType; import org.apache.spark.sql.types.MapType;
import org.apache.spark.sql.types.ShortType$;
import org.apache.spark.sql.types.StringType$;
import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.types.VarcharType$;
import javax.annotation.Nonnull;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
public class DataTypeUtils { public class DataTypeUtils {
private static Map<Class<?>, Set<Class<?>>> sparkPrimitiveTypesCompatibilityMap =
new HashMap<Class<?>, Set<Class<?>>>() {{
// Integral types
put(ShortType$.class,
newHashSet(ByteType$.class, ShortType$.class));
put(IntegerType$.class,
newHashSet(ByteType$.class, ShortType$.class, IntegerType$.class));
put(LongType$.class,
newHashSet(ByteType$.class, ShortType$.class, IntegerType$.class, LongType$.class));
// Float types
put(DoubleType$.class,
newHashSet(FloatType$.class, DoubleType$.class));
// String types
put(StringType$.class,
newHashSet(VarcharType$.class, StringType$.class));
}
};
/**
* Validates whether one {@link StructType} is compatible w/ the other one.
* Compatibility rules are defined like following: types A and B are considered
* compatible iff
*
* <ol>
* <li>A and B are identical</li>
* <li>All values comprising A domain are contained w/in B domain (for ex, {@code ShortType}
* in this sense is compatible w/ {@code IntegerType})</li>
* </ol>
*
* @param left operand
* @param right operand
* @return true if {@code left} instance of {@link StructType} is compatible w/ the {@code right}
*/
public static boolean areCompatible(@Nonnull DataType left, @Nonnull DataType right) {
// First, check if types are equal
if (Objects.equals(left, right)) {
return true;
}
// If not, check whether both are instances of {@code StructType} that
// should be matched structurally
if (left instanceof StructType && right instanceof StructType) {
return areCompatible((StructType) left, (StructType) right);
}
// If not, simply check if those data-types constitute compatibility
// relationship outlined above; otherwise return false
return sparkPrimitiveTypesCompatibilityMap.getOrDefault(left.getClass(), Collections.emptySet())
.contains(right.getClass());
}
private static boolean areCompatible(@Nonnull StructType left, @Nonnull StructType right) {
StructField[] oneSchemaFields = left.fields();
StructField[] anotherSchemaFields = right.fields();
if (oneSchemaFields.length != anotherSchemaFields.length) {
return false;
}
for (int i = 0; i < oneSchemaFields.length; ++i) {
StructField oneField = oneSchemaFields[i];
StructField anotherField = anotherSchemaFields[i];
// NOTE: Metadata is deliberately omitted from comparison
if (!Objects.equals(oneField.name(), anotherField.name())
|| !areCompatible(oneField.dataType(), anotherField.dataType())
|| oneField.nullable() != anotherField.nullable()) {
return false;
}
}
return true;
}
private static <T> HashSet<T> newHashSet(T... ts) {
return new HashSet<>(Arrays.asList(ts));
}
/** /**
* Checks whether provided {@link DataType} contains {@link DecimalType} whose scale is less than * Checks whether provided {@link DataType} contains {@link DecimalType} whose scale is less than
* {@link Decimal#MAX_LONG_DIGITS()} * {@link Decimal#MAX_LONG_DIGITS()}

View File

@@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.{StructField, StructType}
import scala.collection.mutable.ArrayBuffer
object HoodieUnsafeRowUtils {
/**
* Fetches (nested) value w/in provided [[Row]] uniquely identified by the provided nested-field path
* previously composed by [[composeNestedFieldPath]]
*/
def getNestedRowValue(row: Row, nestedFieldPath: Array[(Int, StructField)]): Any = {
var curRow = row
for (idx <- nestedFieldPath.indices) {
val (ord, f) = nestedFieldPath(idx)
if (curRow.isNullAt(ord)) {
// scalastyle:off return
if (f.nullable) return null
else throw new IllegalArgumentException(s"Found null value for the field that is declared as non-nullable: $f")
// scalastyle:on return
} else if (idx == nestedFieldPath.length - 1) {
// scalastyle:off return
return curRow.get(ord)
// scalastyle:on return
} else {
curRow = f.dataType match {
case _: StructType =>
curRow.getStruct(ord)
case dt@_ =>
throw new IllegalArgumentException(s"Invalid nested-field path: expected StructType, but was $dt")
}
}
}
}
/**
* Fetches (nested) value w/in provided [[InternalRow]] uniquely identified by the provided nested-field path
* previously composed by [[composeNestedFieldPath]]
*/
def getNestedInternalRowValue(row: InternalRow, nestedFieldPath: Array[(Int, StructField)]): Any = {
if (nestedFieldPath.length == 0) {
throw new IllegalArgumentException("Nested field-path could not be empty")
}
var curRow = row
var idx = 0
while (idx < nestedFieldPath.length) {
val (ord, f) = nestedFieldPath(idx)
if (curRow.isNullAt(ord)) {
// scalastyle:off return
if (f.nullable) return null
else throw new IllegalArgumentException(s"Found null value for the field that is declared as non-nullable: $f")
// scalastyle:on return
} else if (idx == nestedFieldPath.length - 1) {
// scalastyle:off return
return curRow.get(ord, f.dataType)
// scalastyle:on return
} else {
curRow = f.dataType match {
case st: StructType =>
curRow.getStruct(ord, st.fields.length)
case dt@_ =>
throw new IllegalArgumentException(s"Invalid nested-field path: expected StructType, but was $dt")
}
}
idx += 1
}
}
/**
* For the provided [[nestedFieldRef]] (of the form "a.b.c") and [[schema]], produces nested-field path comprised
* of (ordinal, data-type) tuples of the respective fields w/in the provided schema.
*
* This method produces nested-field path, that is subsequently used by [[getNestedInternalRowValue]], [[getNestedRowValue]]
*/
def composeNestedFieldPath(schema: StructType, nestedFieldRef: String): Array[(Int, StructField)] = {
val fieldRefParts = nestedFieldRef.split('.')
val ordSeq = ArrayBuffer[(Int, StructField)]()
var curSchema = schema
var idx = 0
while (idx < fieldRefParts.length) {
val fieldRefPart = fieldRefParts(idx)
val ord = curSchema.fieldIndex(fieldRefPart)
val field = curSchema(ord)
// Append current field's (ordinal, data-type)
ordSeq.append((ord, field))
// Update current schema, unless terminal field-ref part
if (idx < fieldRefParts.length - 1) {
curSchema = field.dataType match {
case st: StructType => st
case dt@_ =>
throw new IllegalArgumentException(s"Invalid nested field reference ${fieldRefParts.drop(idx).mkString(".")} into $dt")
}
}
idx += 1
}
ordSeq.toArray
}
}
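A small usage sketch (assumptions, not commit code) of the new helpers: the nested-field path for "a.b.c" is composed once per schema and then resolved against any InternalRow without a full deserialization. The example is written in Java against the Scala object's generated static forwarders; in Scala the object methods would be called directly.

```java
import org.apache.spark.sql.HoodieUnsafeRowUtils;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;

public class NestedPathSketch {
  public static void main(String[] args) {
    StructType schema = new StructType()
        .add("a", new StructType()
            .add("b", new StructType()
                .add("c", DataTypes.IntegerType, false), true), true);

    // Compose the (ordinal, field) path once per schema ...
    Tuple2<Object, StructField>[] path =
        HoodieUnsafeRowUtils.composeNestedFieldPath(schema, "a.b.c");

    // ... then resolve it against rows on the hot path
    InternalRow row = new GenericInternalRow(new Object[] {
        new GenericInternalRow(new Object[] {
            new GenericInternalRow(new Object[] {42})})});

    System.out.println(HoodieUnsafeRowUtils.getNestedInternalRowValue(row, path));  // 42
  }
}
```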

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.client.model;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.unsafe.types.UTF8String;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
@@ -64,7 +65,13 @@ public class TestHoodieInternalRow {
Object[] values = getRandomValue(true);
InternalRow row = new GenericInternalRow(values);
HoodieInternalRow hoodieInternalRow = new HoodieInternalRow("commitTime", "commitSeqNo", "recordKey", "partitionPath", "fileName", row);
HoodieInternalRow hoodieInternalRow = new HoodieInternalRow(UTF8String.fromString("commitTime"),
UTF8String.fromString("commitSeqNo"),
UTF8String.fromString("recordKey"),
UTF8String.fromString("partitionPath"),
UTF8String.fromString("fileName"),
row,
true);
assertValues(hoodieInternalRow, "commitTime", "commitSeqNo", "recordKey", "partitionPath",
"fileName", values, nullIndices);
@@ -74,7 +81,13 @@ public class TestHoodieInternalRow {
public void testUpdate() {
Object[] values = getRandomValue(true);
InternalRow row = new GenericInternalRow(values);
HoodieInternalRow hoodieInternalRow = new HoodieInternalRow("commitTime", "commitSeqNo", "recordKey", "partitionPath", "fileName", row);
HoodieInternalRow hoodieInternalRow = new HoodieInternalRow(UTF8String.fromString("commitTime"),
UTF8String.fromString("commitSeqNo"),
UTF8String.fromString("recordKey"),
UTF8String.fromString("partitionPath"),
UTF8String.fromString("fileName"),
row,
true);
hoodieInternalRow.update(0, "commitTime_updated");
hoodieInternalRow.update(1, "commitSeqNo_updated");
@@ -106,7 +119,13 @@ public class TestHoodieInternalRow {
Object[] values = getRandomValue(true);
InternalRow row = new GenericInternalRow(values);
HoodieInternalRow hoodieInternalRow = new HoodieInternalRow("commitTime", "commitSeqNo", "recordKey", "partitionPath", "fileName", row);
HoodieInternalRow hoodieInternalRow = new HoodieInternalRow(UTF8String.fromString("commitTime"),
UTF8String.fromString("commitSeqNo"),
UTF8String.fromString("recordKey"),
UTF8String.fromString("partitionPath"),
UTF8String.fromString("fileName"),
row,
true);
hoodieInternalRow.setNullAt(i);
nullIndices.clear();
@@ -129,7 +148,13 @@ public class TestHoodieInternalRow {
Object[] values = getRandomValue(true);
InternalRow row = new GenericInternalRow(values);
HoodieInternalRow hoodieInternalRow = new HoodieInternalRow("commitTime", "commitSeqNo", "recordKey", "partitionPath", "fileName", row);
HoodieInternalRow hoodieInternalRow = new HoodieInternalRow(UTF8String.fromString("commitTime"),
UTF8String.fromString("commitSeqNo"),
UTF8String.fromString("recordKey"),
UTF8String.fromString("partitionPath"),
UTF8String.fromString("fileName"),
row,
true);
nullIndices.clear();
@@ -173,7 +198,7 @@ public class TestHoodieInternalRow {
}
private void assertValues(HoodieInternalRow hoodieInternalRow, String commitTime, String commitSeqNo, String recordKey, String partitionPath, String filename, Object[] values,
List<Integer> nullIndexes) {
for (Integer index : nullIndexes) {
assertTrue(hoodieInternalRow.isNullAt(index));
}

View File

@@ -23,6 +23,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieInsertException;
import org.apache.hudi.exception.TableNotFoundException;
@@ -75,8 +76,9 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
cleanupResources();
}
@Test
public void testRowCreateHandle() throws Exception {
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testRowCreateHandle(boolean populateMetaFields) throws Exception {
// init config and table
HoodieWriteConfig cfg =
SparkDatasetTestUtils.getConfigBuilder(basePath, timelineServicePort).build();
@@ -93,7 +95,8 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
String fileId = UUID.randomUUID().toString();
String instantTime = "000";
HoodieRowCreateHandle handle = new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE);
HoodieRowCreateHandle handle = new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime,
RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE, populateMetaFields);
int size = 10 + RANDOM.nextInt(1000);
// Generate inputs
Dataset<Row> inputRows = SparkDatasetTestUtils.getRandomRows(sqlContext, size, partitionPath, false);
@@ -109,7 +112,7 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
fileAbsPaths.add(basePath + "/" + writeStatus.getStat().getPath());
fileNames.add(handle.getFileName());
// verify output
assertOutput(writeStatus, size, fileId, partitionPath, instantTime, totalInputRows, fileNames, fileAbsPaths);
assertOutput(writeStatus, size, fileId, partitionPath, instantTime, totalInputRows, fileNames, fileAbsPaths, populateMetaFields);
}
}
@@ -130,7 +133,7 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
String instantTime = "000";
HoodieRowCreateHandle handle =
new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE);
new HoodieRowCreateHandle(table, cfg, partitionPath, fileId, instantTime, RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE, true);
int size = 10 + RANDOM.nextInt(1000);
int totalFailures = 5;
// Generate first batch of valid rows
@@ -169,7 +172,7 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
// verify rows
Dataset<Row> result = sqlContext.read().parquet(basePath + "/" + partitionPath);
// passing only first batch of inputRows since after first batch global error would have been thrown
assertRows(inputRows, result, instantTime, fileNames);
assertRows(inputRows, result, instantTime, fileNames, true);
}
@ParameterizedTest
@@ -183,7 +186,7 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
try {
HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
new HoodieRowCreateHandle(table, cfg, " def", UUID.randomUUID().toString(), "001", RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE);
new HoodieRowCreateHandle(table, cfg, " def", UUID.randomUUID().toString(), "001", RANDOM.nextInt(100000), RANDOM.nextLong(), RANDOM.nextLong(), SparkDatasetTestUtils.STRUCT_TYPE, true);
fail("Should have thrown exception");
} catch (HoodieInsertException ioe) {
// expected without metadata table
@@ -209,8 +212,8 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
return handle.close();
}
private void assertOutput(HoodieInternalWriteStatus writeStatus, int size, String fileId, String partitionPath, String instantTime, Dataset<Row> inputRows, List<String> filenames,
List<String> fileAbsPaths) {
private void assertOutput(HoodieInternalWriteStatus writeStatus, int size, String fileId, String partitionPath,
String instantTime, Dataset<Row> inputRows, List<String> filenames, List<String> fileAbsPaths, boolean populateMetaFields) {
assertEquals(writeStatus.getPartitionPath(), partitionPath);
assertEquals(writeStatus.getTotalRecords(), size);
assertEquals(writeStatus.getFailedRowsSize(), 0);
@@ -229,15 +232,25 @@ public class TestHoodieRowCreateHandle extends HoodieClientTestHarness {
// verify rows
Dataset<Row> result = sqlContext.read().parquet(fileAbsPaths.toArray(new String[0]));
assertRows(inputRows, result, instantTime, filenames);
assertRows(inputRows, result, instantTime, filenames, populateMetaFields);
}
private void assertRows(Dataset<Row> expectedRows, Dataset<Row> actualRows, String instantTime, List<String> filenames) {
private void assertRows(Dataset<Row> expectedRows, Dataset<Row> actualRows, String instantTime, List<String> filenames, boolean populateMetaFields) {
// verify 3 meta fields that are filled in within create handle
actualRows.collectAsList().forEach(entry -> {
assertEquals(entry.get(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD)).toString(), instantTime);
assertTrue(filenames.contains(entry.get(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.FILENAME_METADATA_FIELD)).toString()));
assertFalse(entry.isNullAt(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD)));
String commitTime = entry.getString(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD));
String fileName = entry.getString(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.FILENAME_METADATA_FIELD));
String seqId = entry.getString(HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD));
if (populateMetaFields) {
assertEquals(instantTime, commitTime);
assertFalse(StringUtils.isNullOrEmpty(seqId));
assertTrue(filenames.contains(fileName));
} else {
assertEquals("", commitTime);
assertEquals("", seqId);
assertEquals("", fileName);
}
});
// after trimming 2 of the meta fields, rest of the fields should match

View File

@@ -0,0 +1,166 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql
import org.apache.spark.sql.HoodieUnsafeRowUtils.{composeNestedFieldPath, getNestedInternalRowValue, getNestedRowValue}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types._
import org.junit.jupiter.api.Assertions.{assertEquals, fail}
import org.junit.jupiter.api.Test
class TestHoodieUnsafeRowUtils {
@Test
def testComposeNestedFieldPath(): Unit = {
val schema = StructType(Seq(
StructField("foo", StringType),
StructField(
name = "bar",
dataType = StructType(Seq(
StructField("baz", DateType),
StructField("bor", LongType)
))
)
))
assertEquals(
Seq((1, schema(1)), (0, schema(1).dataType.asInstanceOf[StructType](0))),
composeNestedFieldPath(schema, "bar.baz").toSeq)
assertThrows(classOf[IllegalArgumentException]) { () =>
composeNestedFieldPath(schema, "foo.baz")
}
}
@Test
def testGetNestedInternalRowValue(): Unit = {
val schema = StructType(Seq(
StructField("foo", StringType, nullable = false),
StructField(
name = "bar",
dataType = StructType(Seq(
StructField("baz", DateType),
StructField("bor", LongType)
))
)
))
val row = InternalRow("str", InternalRow(123, 456L))
assertEquals(
123,
getNestedInternalRowValue(row, composeNestedFieldPath(schema, "bar.baz"))
)
assertEquals(
456L,
getNestedInternalRowValue(row, composeNestedFieldPath(schema, "bar.bor"))
)
assertEquals(
"str",
getNestedInternalRowValue(row, composeNestedFieldPath(schema, "foo"))
)
assertEquals(
row.getStruct(1, 2),
getNestedInternalRowValue(row, composeNestedFieldPath(schema, "bar"))
)
val rowProperNullable = InternalRow("str", null)
assertEquals(
null,
getNestedInternalRowValue(rowProperNullable, composeNestedFieldPath(schema, "bar.baz"))
)
assertEquals(
null,
getNestedInternalRowValue(rowProperNullable, composeNestedFieldPath(schema, "bar"))
)
val rowInvalidNullable = InternalRow(null, InternalRow(123, 456L))
assertThrows(classOf[IllegalArgumentException]) { () =>
getNestedInternalRowValue(rowInvalidNullable, composeNestedFieldPath(schema, "foo"))
}
}
@Test
def testGetNestedRowValue(): Unit = {
val schema = StructType(Seq(
StructField("foo", StringType, nullable = false),
StructField(
name = "bar",
dataType = StructType(Seq(
StructField("baz", DateType),
StructField("bor", LongType)
))
)
))
val row = Row("str", Row(123, 456L))
assertEquals(
123,
getNestedRowValue(row, composeNestedFieldPath(schema, "bar.baz"))
)
assertEquals(
456L,
getNestedRowValue(row, composeNestedFieldPath(schema, "bar.bor"))
)
assertEquals(
"str",
getNestedRowValue(row, composeNestedFieldPath(schema, "foo"))
)
assertEquals(
row.getStruct(1),
getNestedRowValue(row, composeNestedFieldPath(schema, "bar"))
)
val rowProperNullable = Row("str", null)
assertEquals(
null,
getNestedRowValue(rowProperNullable, composeNestedFieldPath(schema, "bar.baz"))
)
assertEquals(
null,
getNestedRowValue(rowProperNullable, composeNestedFieldPath(schema, "bar"))
)
val rowInvalidNullable = Row(null, Row(123, 456L))
assertThrows(classOf[IllegalArgumentException]) { () =>
getNestedRowValue(rowInvalidNullable, composeNestedFieldPath(schema, "foo"))
}
}
private def assertThrows[T <: Throwable](expectedExceptionClass: Class[T])(f: () => Unit): T = {
try {
f.apply()
} catch {
case t: Throwable if expectedExceptionClass.isAssignableFrom(t.getClass) =>
// scalastyle:off return
return t.asInstanceOf[T]
// scalastyle:on return
case ot @ _ =>
fail(s"Expected exception of class $expectedExceptionClass, but ${ot.getClass} has been thrown")
}
fail(s"Expected exception of class $expectedExceptionClass, but nothing has been thrown")
}
}

View File

@@ -1,35 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi;
public class TypeUtils {
/**
* This utility abstracts unsafe type-casting in a way that allows to
* <ul>
* <li>Search for such type-casts more easily (just searching for usages of this method)</li>
* <li>Avoid type-cast warnings from the compiler</li>
* </ul>
*/
@SuppressWarnings("unchecked")
public static <T> T unsafeCast(Object o) {
return (T) o;
}
}

View File

@@ -24,12 +24,19 @@ package org.apache.hudi.common.bloom;
public interface BloomFilter {
/**
* Add a key to the {@link BloomFilter}.
* Add a key represented by a {@link String} to the {@link BloomFilter}.
*
* @param key the key to be added to the {@link BloomFilter}
*/
void add(String key);
/**
* Add a key's bytes, representing a UTF-8 encoded string, to the {@link BloomFilter}.
*
* @param key the key bytes to be added to the {@link BloomFilter}
*/
void add(byte[] key);
/**
* Tests for key membership.
*

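A small hedged sketch of the new byte-array overload (the filter instance and key value below are assumed, not part of this patch): callers that already hold a key's UTF-8 bytes can add them directly instead of round-tripping through a String.

  import java.nio.charset.StandardCharsets

  // `filter: BloomFilter` is assumed to have been built elsewhere (e.g. via a factory)
  val keyBytes = "record-key-1".getBytes(StandardCharsets.UTF_8)
  filter.add(keyBytes)          // new overload, no extra String -> byte[] re-encoding
  filter.add("record-key-1")    // existing overload, same effect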
View File

@@ -78,7 +78,12 @@ public class HoodieDynamicBoundedBloomFilter implements BloomFilter {
@Override
public void add(String key) {
internalDynamicBloomFilter.add(new Key(key.getBytes(StandardCharsets.UTF_8)));
add(key.getBytes(StandardCharsets.UTF_8));
}
@Override
public void add(byte[] keyBytes) {
internalDynamicBloomFilter.add(new Key(keyBytes));
}
@Override

View File

@@ -77,10 +77,15 @@ public class SimpleBloomFilter implements BloomFilter {
@Override
public void add(String key) {
if (key == null) {
add(key.getBytes(StandardCharsets.UTF_8));
}
@Override
public void add(byte[] keyBytes) {
if (keyBytes == null) {
throw new NullPointerException("Key cannot be null");
}
filter.add(new Key(key.getBytes(StandardCharsets.UTF_8)));
filter.add(new Key(keyBytes));
}
@Override

View File

@@ -20,7 +20,7 @@ package org.apache.hudi.common.util;
import javax.annotation.Nonnull;
import static org.apache.hudi.TypeUtils.unsafeCast;
import static org.apache.hudi.common.util.TypeUtils.unsafeCast;
/**
* Utility that could hold exclusively only either of (hence the name):

View File

@@ -30,7 +30,17 @@ import java.util.Deque;
public class HoodieTimer {
// Ordered stack of TimeInfo's to make sure stopping the timer returns the correct elapsed time
Deque<TimeInfo> timeInfoDeque = new ArrayDeque<>();
private final Deque<TimeInfo> timeInfoDeque = new ArrayDeque<>();
public HoodieTimer() {
this(false);
}
public HoodieTimer(boolean shouldStart) {
if (shouldStart) {
startTimer();
}
}
static class TimeInfo {

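As a hedged illustration of the new constructor (assuming HoodieTimer is imported and its usual start/stop methods are used), the boolean flag lets a timer start at construction instead of the previous two-step new HoodieTimer().startTimer() pattern:

  val timer = new HoodieTimer(true)   // starts timing immediately
  // ... timed work ...
  val elapsedMs = timer.endTimer()    // elapsed millis for the most recently started timer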
View File

@@ -39,4 +39,16 @@ public final class TypeUtils {
.collect(Collectors.toMap(valueMapper, Function.identity()));
}
/**
* This utility abstracts unsafe type-casting in a way that allows to
* <ul>
* <li>Search for such type-casts more easily (just searching for usages of this method)</li>
* <li>Avoid type-cast warnings from the compiler</li>
* </ul>
*/
@SuppressWarnings("unchecked")
public static <T> T unsafeCast(Object o) {
return (T) o;
}
}

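A brief sketch of how the relocated unsafeCast is meant to be used (the value and target type below are illustrative only): it keeps the unchecked cast in one searchable, warning-suppressed place instead of scattering raw casts across call sites.

  import org.apache.hudi.common.util.TypeUtils

  val raw: AnyRef = java.util.Collections.singletonList("x")
  // single, findable place for the unchecked cast
  val typed: java.util.List[String] = TypeUtils.unsafeCast(raw)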
View File

@@ -68,6 +68,7 @@ public abstract class BaseKeyGenerator extends KeyGenerator {
@Override
public final List<String> getRecordKeyFieldNames() {
// For nested columns, pick top level column name
// TODO materialize
return getRecordKeyFields().stream().map(k -> {
int idx = k.indexOf('.');
return idx > 0 ? k.substring(0, idx) : k;

View File

@@ -75,9 +75,9 @@ import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.apache.hudi.TypeUtils.unsafeCast;
import static org.apache.hudi.common.util.DateTimeUtils.instantToMicros;
import static org.apache.hudi.common.util.DateTimeUtils.microsToInstant;
import static org.apache.hudi.common.util.TypeUtils.unsafeCast;
import static org.apache.hudi.common.util.ValidationUtils.checkArgument;
import static org.apache.hudi.common.util.ValidationUtils.checkState;
import static org.apache.hudi.metadata.HoodieTableMetadata.RECORDKEY_PARTITION_LIST;

View File

@@ -898,7 +898,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
return anchorTs + r.nextLong() % 259200000L;
}
private static UUID genPseudoRandomUUID(Random r) {
public static UUID genPseudoRandomUUID(Random r) {
byte[] bytes = new byte[16];
r.nextBytes(bytes);

View File

@@ -31,7 +31,7 @@ import org.apache.hudi.hadoop.realtime.RealtimeSplit;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import static org.apache.hudi.TypeUtils.unsafeCast;
import static org.apache.hudi.common.util.TypeUtils.unsafeCast;
public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {

View File

@@ -18,6 +18,9 @@
package org.apache.hudi;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.HoodieWriteResult;
import org.apache.hudi.client.SparkRDDWriteClient;
@@ -41,10 +44,6 @@ import org.apache.hudi.exception.HoodieNotSupportedException;
import org.apache.hudi.exception.TableNotFoundException;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.util.DataTypeUtils;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
@@ -293,7 +292,7 @@ public class DataSourceUtils {
// - {@code HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED} has not been explicitly
// set by the writer
//
// If both of these conditions are true, than we override the default value of {@code
// If both of these conditions are true, then we override the default value of {@code
// HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED} and set it to "true"
LOG.warn("Small Decimal Type found in the persisted schema, reverting default value of 'hoodie.parquet.writelegacyformat.enabled' to true");
properties.put(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED.key(), "true");

View File

@@ -1,189 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.keygen.BuiltinKeyGenerator;
import org.apache.hudi.keygen.ComplexKeyGenerator;
import org.apache.hudi.keygen.NonpartitionedKeyGenerator;
import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import scala.collection.JavaConverters;
import static org.apache.spark.sql.functions.callUDF;
/**
* Helper class to assist in preparing {@link Dataset<Row>}s for bulk insert with datasource implementation.
*/
public class HoodieDatasetBulkInsertHelper {
private static final Logger LOG = LogManager.getLogger(HoodieDatasetBulkInsertHelper.class);
private static final String RECORD_KEY_UDF_FN = "hudi_recordkey_gen_function_";
private static final String PARTITION_PATH_UDF_FN = "hudi_partition_gen_function_";
/**
* Prepares input hoodie spark dataset for bulk insert. It does the following steps.
* 1. Uses KeyGenerator to generate hoodie record keys and partition path.
* 2. Add hoodie columns to input spark dataset.
* 3. Reorders input dataset columns so that hoodie columns appear in the beginning.
* 4. Sorts input dataset by hoodie partition path and record key
*
* @param sqlContext SQL Context
* @param config Hoodie Write Config
* @param rows Spark Input dataset
* @return hoodie dataset which is ready for bulk insert.
*/
public static Dataset<Row> prepareHoodieDatasetForBulkInsert(SQLContext sqlContext,
HoodieWriteConfig config, Dataset<Row> rows, String structName, String recordNamespace,
BulkInsertPartitioner<Dataset<Row>> bulkInsertPartitionerRows,
boolean isGlobalIndex, boolean dropPartitionColumns) {
List<Column> originalFields =
Arrays.stream(rows.schema().fields()).map(f -> new Column(f.name())).collect(Collectors.toList());
TypedProperties properties = new TypedProperties();
properties.putAll(config.getProps());
String keyGeneratorClass = properties.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key());
String recordKeyFields = properties.getString(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key());
String partitionPathFields = properties.containsKey(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key())
? properties.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key()) : "";
BuiltinKeyGenerator keyGenerator = (BuiltinKeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, properties);
Dataset<Row> rowDatasetWithRecordKeysAndPartitionPath;
if (keyGeneratorClass.equals(NonpartitionedKeyGenerator.class.getName())) {
// for non partitioned, set partition path to empty.
rowDatasetWithRecordKeysAndPartitionPath = rows.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD, functions.col(recordKeyFields))
.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType));
} else if (keyGeneratorClass.equals(SimpleKeyGenerator.class.getName())
|| (keyGeneratorClass.equals(ComplexKeyGenerator.class.getName()) && !recordKeyFields.contains(",") && !partitionPathFields.contains(",")
&& (!partitionPathFields.contains("timestamp")))) { // incase of ComplexKeyGen, check partition path type.
// simple fields for both record key and partition path: can directly use withColumn
String partitionPathField = keyGeneratorClass.equals(SimpleKeyGenerator.class.getName()) ? partitionPathFields :
partitionPathFields.substring(partitionPathFields.indexOf(":") + 1);
rowDatasetWithRecordKeysAndPartitionPath = rows.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD, functions.col(recordKeyFields).cast(DataTypes.StringType))
.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD, functions.col(partitionPathField).cast(DataTypes.StringType));
} else {
// use udf
String tableName = properties.getString(HoodieWriteConfig.TBL_NAME.key());
String recordKeyUdfFn = RECORD_KEY_UDF_FN + tableName;
String partitionPathUdfFn = PARTITION_PATH_UDF_FN + tableName;
sqlContext.udf().register(recordKeyUdfFn, (UDF1<Row, String>) keyGenerator::getRecordKey, DataTypes.StringType);
sqlContext.udf().register(partitionPathUdfFn, (UDF1<Row, String>) keyGenerator::getPartitionPath, DataTypes.StringType);
final Dataset<Row> rowDatasetWithRecordKeys = rows.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD,
callUDF(recordKeyUdfFn, org.apache.spark.sql.functions.struct(
JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq())));
rowDatasetWithRecordKeysAndPartitionPath =
rowDatasetWithRecordKeys.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD,
callUDF(partitionPathUdfFn,
org.apache.spark.sql.functions.struct(
JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq())));
}
// Add other empty hoodie fields which will be populated before writing to parquet.
Dataset<Row> rowDatasetWithHoodieColumns =
rowDatasetWithRecordKeysAndPartitionPath.withColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD,
functions.lit("").cast(DataTypes.StringType))
.withColumn(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD,
functions.lit("").cast(DataTypes.StringType))
.withColumn(HoodieRecord.FILENAME_METADATA_FIELD,
functions.lit("").cast(DataTypes.StringType));
Dataset<Row> processedDf = rowDatasetWithHoodieColumns;
if (dropPartitionColumns) {
String partitionColumns = String.join(",", keyGenerator.getPartitionPathFields());
for (String partitionField : keyGenerator.getPartitionPathFields()) {
originalFields.remove(new Column(partitionField));
}
processedDf = rowDatasetWithHoodieColumns.drop(partitionColumns);
}
Dataset<Row> dedupedDf = processedDf;
if (config.shouldCombineBeforeInsert()) {
dedupedDf = SparkRowWriteHelper.newInstance().deduplicateRows(processedDf, config.getPreCombineField(), isGlobalIndex);
}
List<Column> orderedFields = Stream.concat(HoodieRecord.HOODIE_META_COLUMNS.stream().map(Column::new),
originalFields.stream()).collect(Collectors.toList());
Dataset<Row> colOrderedDataset = dedupedDf.select(
JavaConverters.collectionAsScalaIterableConverter(orderedFields).asScala().toSeq());
return bulkInsertPartitionerRows.repartitionRecords(colOrderedDataset, config.getBulkInsertShuffleParallelism());
}
/**
* Add empty meta fields and reorder such that meta fields are at the beginning.
*
* @param rows
* @return
*/
public static Dataset<Row> prepareHoodieDatasetForBulkInsertWithoutMetaFields(Dataset<Row> rows) {
// add empty meta cols.
Dataset<Row> rowsWithMetaCols = rows
.withColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD,
functions.lit("").cast(DataTypes.StringType))
.withColumn(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD,
functions.lit("").cast(DataTypes.StringType))
.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD,
functions.lit("").cast(DataTypes.StringType))
.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD,
functions.lit("").cast(DataTypes.StringType))
.withColumn(HoodieRecord.FILENAME_METADATA_FIELD,
functions.lit("").cast(DataTypes.StringType));
List<Column> originalFields =
Arrays.stream(rowsWithMetaCols.schema().fields())
.filter(field -> !HoodieRecord.HOODIE_META_COLUMNS_WITH_OPERATION.contains(field.name()))
.map(f -> new Column(f.name())).collect(Collectors.toList());
List<Column> metaFields =
Arrays.stream(rowsWithMetaCols.schema().fields())
.filter(field -> HoodieRecord.HOODIE_META_COLUMNS_WITH_OPERATION.contains(field.name()))
.map(f -> new Column(f.name())).collect(Collectors.toList());
// reorder such that all meta columns are at the beginning followed by original columns
List<Column> allCols = new ArrayList<>();
allCols.addAll(metaFields);
allCols.addAll(originalFields);
return rowsWithMetaCols.select(
JavaConverters.collectionAsScalaIterableConverter(allCols).asScala().toSeq());
}
}

View File

@@ -1,72 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer$;
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.catalyst.expressions.Attribute;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import scala.collection.JavaConversions;
import scala.collection.JavaConverters;
import java.util.List;
import java.util.stream.Collectors;
/**
* Helper class to assist in deduplicating Rows for BulkInsert with Rows.
*/
public class SparkRowWriteHelper {
private SparkRowWriteHelper() {
}
private static class WriteHelperHolder {
private static final SparkRowWriteHelper SPARK_WRITE_HELPER = new SparkRowWriteHelper();
}
public static SparkRowWriteHelper newInstance() {
return SparkRowWriteHelper.WriteHelperHolder.SPARK_WRITE_HELPER;
}
public Dataset<Row> deduplicateRows(Dataset<Row> inputDf, String preCombineField, boolean isGlobalIndex) {
return inputDf.groupByKey((MapFunction<Row, String>) value ->
isGlobalIndex
? (value.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD))
: (value.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD) + "+" + value.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD)), Encoders.STRING())
.reduceGroups((ReduceFunction<Row>) (v1, v2) ->
((Comparable) v1.getAs(preCombineField)).compareTo(v2.getAs(preCombineField)) >= 0 ? v1 : v2)
.map((MapFunction<Tuple2<String, Row>, Row>) value -> value._2, getEncoder(inputDf.schema()));
}
private ExpressionEncoder getEncoder(StructType schema) {
List<Attribute> attributes = JavaConversions.asJavaCollection(schema.toAttributes()).stream()
.map(Attribute::toAttribute).collect(Collectors.toList());
return RowEncoder.apply(schema)
.resolveAndBind(JavaConverters.asScalaBufferConverter(attributes).asScala().toSeq(),
SimpleAnalyzer$.MODULE$);
}
}

View File

@@ -27,18 +27,17 @@ import org.apache.hudi.common.util.PartitionPathEncodeUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.io.storage.row.HoodieRowCreateHandle;
import org.apache.hudi.io.storage.row.HoodieRowCreateHandleWithoutMetaFields;
import org.apache.hudi.keygen.BuiltinKeyGenerator;
import org.apache.hudi.keygen.NonpartitionedKeyGenerator;
import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory;
import org.apache.hudi.table.HoodieTable;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;
import java.io.IOException;
import java.util.ArrayList;
@@ -64,16 +63,20 @@ public class BulkInsertDataInternalWriterHelper {
private final StructType structType; private final StructType structType;
private final Boolean arePartitionRecordsSorted; private final Boolean arePartitionRecordsSorted;
private final List<HoodieInternalWriteStatus> writeStatusList = new ArrayList<>(); private final List<HoodieInternalWriteStatus> writeStatusList = new ArrayList<>();
private HoodieRowCreateHandle handle; private final String fileIdPrefix;
private String lastKnownPartitionPath = null; private final Map<String, HoodieRowCreateHandle> handles = new HashMap<>();
private String fileIdPrefix;
private int numFilesWritten = 0;
private Map<String, HoodieRowCreateHandle> handles = new HashMap<>();
private final boolean populateMetaFields; private final boolean populateMetaFields;
private Option<BuiltinKeyGenerator> keyGeneratorOpt = null; private final Option<BuiltinKeyGenerator> keyGeneratorOpt;
private boolean simpleKeyGen = false; private final boolean simpleKeyGen;
private int simplePartitionFieldIndex = -1; private final int simplePartitionFieldIndex;
private DataType simplePartitionFieldDataType; private final DataType simplePartitionFieldDataType;
/**
* NOTE: This is stored as Catalyst's internal {@link UTF8String} to avoid
* conversion (deserialization) b/w {@link UTF8String} and {@link String}
*/
private String lastKnownPartitionPath = null;
private HoodieRowCreateHandle handle;
private int numFilesWritten = 0;
public BulkInsertDataInternalWriterHelper(HoodieTable hoodieTable, HoodieWriteConfig writeConfig, public BulkInsertDataInternalWriterHelper(HoodieTable hoodieTable, HoodieWriteConfig writeConfig,
String instantTime, int taskPartitionId, long taskId, long taskEpochId, StructType structType, String instantTime, int taskPartitionId, long taskId, long taskEpochId, StructType structType,
@@ -88,13 +91,21 @@ public class BulkInsertDataInternalWriterHelper {
this.populateMetaFields = populateMetaFields; this.populateMetaFields = populateMetaFields;
this.arePartitionRecordsSorted = arePartitionRecordsSorted; this.arePartitionRecordsSorted = arePartitionRecordsSorted;
this.fileIdPrefix = UUID.randomUUID().toString(); this.fileIdPrefix = UUID.randomUUID().toString();
if (!populateMetaFields) { if (!populateMetaFields) {
this.keyGeneratorOpt = getKeyGenerator(writeConfig.getProps()); this.keyGeneratorOpt = getKeyGenerator(writeConfig.getProps());
if (keyGeneratorOpt.isPresent() && keyGeneratorOpt.get() instanceof SimpleKeyGenerator) { } else {
simpleKeyGen = true; this.keyGeneratorOpt = Option.empty();
simplePartitionFieldIndex = (Integer) structType.getFieldIndex((keyGeneratorOpt.get()).getPartitionPathFields().get(0)).get(); }
simplePartitionFieldDataType = structType.fields()[simplePartitionFieldIndex].dataType();
} if (keyGeneratorOpt.isPresent() && keyGeneratorOpt.get() instanceof SimpleKeyGenerator) {
this.simpleKeyGen = true;
this.simplePartitionFieldIndex = (Integer) structType.getFieldIndex(keyGeneratorOpt.get().getPartitionPathFields().get(0)).get();
this.simplePartitionFieldDataType = structType.fields()[simplePartitionFieldIndex].dataType();
} else {
this.simpleKeyGen = false;
this.simplePartitionFieldIndex = -1;
this.simplePartitionFieldDataType = null;
} }
} }
@@ -120,32 +131,16 @@ public class BulkInsertDataInternalWriterHelper {
} }
} }
public void write(InternalRow record) throws IOException { public void write(InternalRow row) throws IOException {
try { try {
String partitionPath = null; String partitionPath = extractPartitionPath(row);
if (populateMetaFields) { // usual path where meta fields are pre populated in prep step. if (lastKnownPartitionPath == null || !lastKnownPartitionPath.equals(partitionPath) || !handle.canWrite()) {
partitionPath = String.valueOf(record.getUTF8String(HoodieRecord.PARTITION_PATH_META_FIELD_POS));
} else { // if meta columns are disabled.
if (!keyGeneratorOpt.isPresent()) { // NoPartitionerKeyGen
partitionPath = "";
} else if (simpleKeyGen) { // SimpleKeyGen
Object parititionPathValue = record.get(simplePartitionFieldIndex, simplePartitionFieldDataType);
partitionPath = parititionPathValue != null ? parititionPathValue.toString() : PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH;
if (writeConfig.isHiveStylePartitioningEnabled()) {
partitionPath = (keyGeneratorOpt.get()).getPartitionPathFields().get(0) + "=" + partitionPath;
}
} else {
// only BuiltIn key generators are supported if meta fields are disabled.
partitionPath = keyGeneratorOpt.get().getPartitionPath(record, structType);
}
}
if ((lastKnownPartitionPath == null) || !lastKnownPartitionPath.equals(partitionPath) || !handle.canWrite()) {
LOG.info("Creating new file for partition path " + partitionPath); LOG.info("Creating new file for partition path " + partitionPath);
handle = getRowCreateHandle(partitionPath); handle = getRowCreateHandle(partitionPath);
lastKnownPartitionPath = partitionPath; lastKnownPartitionPath = partitionPath;
} }
handle.write(record);
handle.write(row);
} catch (Throwable t) { } catch (Throwable t) {
LOG.error("Global error thrown while trying to write records in HoodieRowCreateHandle ", t); LOG.error("Global error thrown while trying to write records in HoodieRowCreateHandle ", t);
throw t; throw t;
@@ -157,30 +152,7 @@ public class BulkInsertDataInternalWriterHelper {
return writeStatusList; return writeStatusList;
} }
public void abort() { public void abort() {}
}
private HoodieRowCreateHandle getRowCreateHandle(String partitionPath) throws IOException {
if (!handles.containsKey(partitionPath)) { // if there is no handle corresponding to the partition path
// if records are sorted, we can close all existing handles
if (arePartitionRecordsSorted) {
close();
}
HoodieRowCreateHandle rowCreateHandle = populateMetaFields ? new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
instantTime, taskPartitionId, taskId, taskEpochId, structType) : new HoodieRowCreateHandleWithoutMetaFields(hoodieTable, writeConfig, partitionPath, getNextFileId(),
instantTime, taskPartitionId, taskId, taskEpochId, structType);
handles.put(partitionPath, rowCreateHandle);
} else if (!handles.get(partitionPath).canWrite()) {
// even if there is a handle to the partition path, it could have reached its max size threshold. So, we close the handle here and
// create a new one.
writeStatusList.add(handles.remove(partitionPath).close());
HoodieRowCreateHandle rowCreateHandle = populateMetaFields ? new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
instantTime, taskPartitionId, taskId, taskEpochId, structType) : new HoodieRowCreateHandleWithoutMetaFields(hoodieTable, writeConfig, partitionPath, getNextFileId(),
instantTime, taskPartitionId, taskId, taskEpochId, structType);
handles.put(partitionPath, rowCreateHandle);
}
return handles.get(partitionPath);
}
public void close() throws IOException { public void close() throws IOException {
for (HoodieRowCreateHandle rowCreateHandle : handles.values()) { for (HoodieRowCreateHandle rowCreateHandle : handles.values()) {
@@ -190,6 +162,56 @@ public class BulkInsertDataInternalWriterHelper {
handle = null; handle = null;
} }
private String extractPartitionPath(InternalRow row) {
String partitionPath;
if (populateMetaFields) {
// In case meta-fields are materialized w/in the table itself, we can just simply extract
// partition path from there
//
// NOTE: Helper keeps track of [[lastKnownPartitionPath]] as [[UTF8String]] to avoid
// conversion from Catalyst internal representation into a [[String]]
partitionPath = row.getString(HoodieRecord.PARTITION_PATH_META_FIELD_POS);
} else if (keyGeneratorOpt.isPresent()) {
// TODO(HUDI-4039) this should be handled by the SimpleKeyGenerator itself
if (simpleKeyGen) {
Object partitionPathValue = row.get(simplePartitionFieldIndex, simplePartitionFieldDataType);
partitionPath = partitionPathValue != null ? partitionPathValue.toString() : PartitionPathEncodeUtils.DEFAULT_PARTITION_PATH;
if (writeConfig.isHiveStylePartitioningEnabled()) {
partitionPath = (keyGeneratorOpt.get()).getPartitionPathFields().get(0) + "=" + partitionPath;
}
} else {
// only BuiltIn key generators are supported if meta fields are disabled.
partitionPath = keyGeneratorOpt.get().getPartitionPath(row, structType);
}
} else {
partitionPath = "";
}
return partitionPath;
}
private HoodieRowCreateHandle getRowCreateHandle(String partitionPath) throws IOException {
if (!handles.containsKey(partitionPath)) { // if there is no handle corresponding to the partition path
// if records are sorted, we can close all existing handles
if (arePartitionRecordsSorted) {
close();
}
HoodieRowCreateHandle rowCreateHandle = createHandle(partitionPath);
handles.put(partitionPath, rowCreateHandle);
} else if (!handles.get(partitionPath).canWrite()) {
// even if there is a handle to the partition path, it could have reached its max size threshold. So, we close the handle here and
// create a new one.
writeStatusList.add(handles.remove(partitionPath).close());
HoodieRowCreateHandle rowCreateHandle = createHandle(partitionPath);
handles.put(partitionPath, rowCreateHandle);
}
return handles.get(partitionPath);
}
private HoodieRowCreateHandle createHandle(String partitionPath) {
return new HoodieRowCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
instantTime, taskPartitionId, taskId, taskEpochId, structType, populateMetaFields);
}
private String getNextFileId() { private String getNextFileId() {
return String.format("%s-%d", fileIdPrefix, numFilesWritten++); return String.format("%s-%d", fileIdPrefix, numFilesWritten++);
} }

View File

@@ -0,0 +1,158 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi
import org.apache.hudi.client.model.HoodieInternalRow
import org.apache.hudi.common.config.TypedProperties
import org.apache.hudi.common.model.HoodieRecord
import org.apache.hudi.common.util.ReflectionUtils
import org.apache.hudi.config.HoodieWriteConfig
import org.apache.hudi.index.SparkHoodieIndexFactory
import org.apache.hudi.keygen.BuiltinKeyGenerator
import org.apache.hudi.table.BulkInsertPartitioner
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.HoodieUnsafeRDDUtils.createDataFrame
import org.apache.spark.sql.HoodieUnsafeRowUtils.{composeNestedFieldPath, getNestedInternalRowValue}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, HoodieUnsafeRDDUtils, Row}
import org.apache.spark.unsafe.types.UTF8String
import scala.collection.JavaConverters.asScalaBufferConverter
import scala.collection.mutable
object HoodieDatasetBulkInsertHelper extends Logging {
/**
* Prepares [[DataFrame]] for bulk-insert into Hudi table, taking the following steps:
*
* <ol>
* <li>Invoking configured [[KeyGenerator]] to produce record key, as well as partition-path value</li>
* <li>Prepends Hudi meta-fields to every row in the dataset</li>
* <li>Dedupes rows (if necessary)</li>
* <li>Partitions dataset using provided [[partitioner]]</li>
* </ol>
*/
def prepareForBulkInsert(df: DataFrame,
config: HoodieWriteConfig,
partitioner: BulkInsertPartitioner[Dataset[Row]],
shouldDropPartitionColumns: Boolean): Dataset[Row] = {
val populateMetaFields = config.populateMetaFields()
val schema = df.schema
val keyGeneratorClassName = config.getStringOrThrow(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME,
"Key-generator class name is required")
val prependedRdd: RDD[InternalRow] =
df.queryExecution.toRdd.mapPartitions { iter =>
val keyGenerator =
ReflectionUtils.loadClass(keyGeneratorClassName, new TypedProperties(config.getProps))
.asInstanceOf[BuiltinKeyGenerator]
iter.map { row =>
val (recordKey, partitionPath) =
if (populateMetaFields) {
(UTF8String.fromString(keyGenerator.getRecordKey(row, schema)),
UTF8String.fromString(keyGenerator.getPartitionPath(row, schema)))
} else {
(UTF8String.EMPTY_UTF8, UTF8String.EMPTY_UTF8)
}
val commitTimestamp = UTF8String.EMPTY_UTF8
val commitSeqNo = UTF8String.EMPTY_UTF8
val filename = UTF8String.EMPTY_UTF8
// TODO use mutable row, avoid re-allocating
new HoodieInternalRow(commitTimestamp, commitSeqNo, recordKey, partitionPath, filename, row, false)
}
}
val metaFields = Seq(
StructField(HoodieRecord.COMMIT_TIME_METADATA_FIELD, StringType),
StructField(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, StringType),
StructField(HoodieRecord.RECORD_KEY_METADATA_FIELD, StringType),
StructField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, StringType),
StructField(HoodieRecord.FILENAME_METADATA_FIELD, StringType))
val updatedSchema = StructType(metaFields ++ schema.fields)
val updatedDF = if (populateMetaFields && config.shouldCombineBeforeInsert) {
val dedupedRdd = dedupeRows(prependedRdd, updatedSchema, config.getPreCombineField, SparkHoodieIndexFactory.isGlobalIndex(config))
HoodieUnsafeRDDUtils.createDataFrame(df.sparkSession, dedupedRdd, updatedSchema)
} else {
HoodieUnsafeRDDUtils.createDataFrame(df.sparkSession, prependedRdd, updatedSchema)
}
val trimmedDF = if (shouldDropPartitionColumns) {
dropPartitionColumns(updatedDF, config)
} else {
updatedDF
}
partitioner.repartitionRecords(trimmedDF, config.getBulkInsertShuffleParallelism)
}
private def dedupeRows(rdd: RDD[InternalRow], schema: StructType, preCombineFieldRef: String, isGlobalIndex: Boolean): RDD[InternalRow] = {
val recordKeyMetaFieldOrd = schema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD)
val partitionPathMetaFieldOrd = schema.fieldIndex(HoodieRecord.PARTITION_PATH_METADATA_FIELD)
// NOTE: Pre-combine field could be a nested field
val preCombineFieldPath = composeNestedFieldPath(schema, preCombineFieldRef)
rdd.map { row =>
val rowKey = if (isGlobalIndex) {
row.getString(recordKeyMetaFieldOrd)
} else {
val partitionPath = row.getString(partitionPathMetaFieldOrd)
val recordKey = row.getString(recordKeyMetaFieldOrd)
s"$partitionPath:$recordKey"
}
// NOTE: It's critical whenever we keep the reference to the row, to make a copy
// since Spark might be providing us with a mutable copy (updated during the iteration)
(rowKey, row.copy())
}
.reduceByKey {
(oneRow, otherRow) =>
val onePreCombineVal = getNestedInternalRowValue(oneRow, preCombineFieldPath).asInstanceOf[Comparable[AnyRef]]
val otherPreCombineVal = getNestedInternalRowValue(otherRow, preCombineFieldPath).asInstanceOf[Comparable[AnyRef]]
if (onePreCombineVal.compareTo(otherPreCombineVal.asInstanceOf[AnyRef]) >= 0) {
oneRow
} else {
otherRow
}
}
.values
}
private def dropPartitionColumns(df: DataFrame, config: HoodieWriteConfig): DataFrame = {
val partitionPathFields = getPartitionPathFields(config).toSet
val nestedPartitionPathFields = partitionPathFields.filter(f => f.contains('.'))
if (nestedPartitionPathFields.nonEmpty) {
logWarning(s"Can not drop nested partition path fields: $nestedPartitionPathFields")
}
val partitionPathCols = (partitionPathFields -- nestedPartitionPathFields).toSeq
df.drop(partitionPathCols: _*)
}
private def getPartitionPathFields(config: HoodieWriteConfig): Seq[String] = {
val keyGeneratorClassName = config.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME)
val keyGenerator = ReflectionUtils.loadClass(keyGeneratorClassName, new TypedProperties(config.getProps)).asInstanceOf[BuiltinKeyGenerator]
keyGenerator.getPartitionPathFields.asScala
}
}
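
A minimal call-site sketch for the new entry point (assuming a source DataFrame and a HoodieWriteConfig carrying the key-generator class and key/partition-path field options are already in scope; the partitioner choice here is illustrative):

  import org.apache.hudi.HoodieDatasetBulkInsertHelper
  import org.apache.hudi.execution.bulkinsert.NonSortPartitionerWithRows

  val prepared = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(
    sourceDf,                           // DataFrame to bulk-insert
    writeConfig,                        // HoodieWriteConfig with key-generator props
    new NonSortPartitionerWithRows(),   // BulkInsertPartitioner[Dataset[Row]]
    false)                              // shouldDropPartitionColumns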

View File

@@ -515,8 +515,8 @@ object HoodieSparkSqlWriter {
instantTime: String,
partitionColumns: String): (Boolean, common.util.Option[String]) = {
val sparkContext = sqlContext.sparkContext
val populateMetaFields = java.lang.Boolean.parseBoolean((parameters.getOrElse(HoodieTableConfig.POPULATE_META_FIELDS.key(),
String.valueOf(HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))))
val populateMetaFields = java.lang.Boolean.parseBoolean(parameters.getOrElse(HoodieTableConfig.POPULATE_META_FIELDS.key(),
String.valueOf(HoodieTableConfig.POPULATE_META_FIELDS.defaultValue())))
val dropPartitionColumns = parameters.get(DataSourceWriteOptions.DROP_PARTITION_COLUMNS.key()).map(_.toBoolean)
.getOrElse(DataSourceWriteOptions.DROP_PARTITION_COLUMNS.defaultValue())
// register classes & schemas
@@ -556,12 +556,9 @@ object HoodieSparkSqlWriter {
} else {
false
}
val hoodieDF = if (populateMetaFields) {
HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, writeConfig, df, structName, nameSpace,
bulkInsertPartitionerRows, isGlobalIndex, dropPartitionColumns)
} else {
HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsertWithoutMetaFields(df)
}
val hoodieDF = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(df, writeConfig, bulkInsertPartitionerRows, dropPartitionColumns)
if (HoodieSparkUtils.isSpark2) {
hoodieDF.write.format("org.apache.hudi.internal")
.option(DataSourceInternalWriterHelper.INSTANT_TIME_OPT_KEY, instantTime)

View File

@@ -17,6 +17,7 @@
package org.apache.hudi.functional;
import org.apache.avro.Schema;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.HoodieDatasetBulkInsertHelper;
@@ -27,10 +28,9 @@ import org.apache.hudi.execution.bulkinsert.NonSortPartitionerWithRows;
import org.apache.hudi.keygen.ComplexKeyGenerator;
import org.apache.hudi.keygen.NonpartitionedKeyGenerator;
import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.apache.hudi.metadata.HoodieTableMetadata;
import org.apache.hudi.testutils.DataSourceTestUtils;
import org.apache.hudi.testutils.HoodieClientTestBase;
import org.apache.avro.Schema;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Dataset;
@@ -46,6 +46,9 @@ import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import scala.Tuple2;
import scala.collection.JavaConversions;
import scala.collection.JavaConverters;
import java.io.IOException;
import java.util.ArrayList;
@@ -56,10 +59,6 @@ import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import scala.Tuple2;
import scala.collection.JavaConversions;
import scala.collection.JavaConverters;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
@@ -117,36 +116,42 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
    testBulkInsertHelperFor(keyGenClass, "_row_key");
  }
- private void testBulkInsertHelperFor(String keyGenClass, String recordKey) {
+ private void testBulkInsertHelperFor(String keyGenClass, String recordKeyField) {
    Map<String, String> props = null;
    if (keyGenClass.equals(SimpleKeyGenerator.class.getName())) {
-     props = getPropsAllSet(recordKey);
+     props = getPropsAllSet(recordKeyField);
    } else if (keyGenClass.equals(ComplexKeyGenerator.class.getName())) {
-     props = getPropsForComplexKeyGen(recordKey);
+     props = getPropsForComplexKeyGen(recordKeyField);
    } else { // NonPartitioned key gen
-     props = getPropsForNonPartitionedKeyGen(recordKey);
+     props = getPropsForNonPartitionedKeyGen(recordKeyField);
    }
    HoodieWriteConfig config = getConfigBuilder(schemaStr).withProps(props).combineInput(false, false).build();
    List<Row> rows = DataSourceTestUtils.generateRandomRows(10);
    Dataset<Row> dataset = sqlContext.createDataFrame(rows, structType);
-   Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
-       "testNamespace", new NonSortPartitionerWithRows(), false, false);
+   Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(dataset, config,
+       new NonSortPartitionerWithRows(), false);
    StructType resultSchema = result.schema();
    assertEquals(result.count(), 10);
    assertEquals(resultSchema.fieldNames().length, structType.fieldNames().length + HoodieRecord.HOODIE_META_COLUMNS.size());
    for (Map.Entry<String, Integer> entry : HoodieRecord.HOODIE_META_COLUMNS_NAME_TO_POS.entrySet()) {
-     assertTrue(resultSchema.fieldIndex(entry.getKey()) == entry.getValue());
+     assertEquals(entry.getValue(), resultSchema.fieldIndex(entry.getKey()));
    }
-   boolean isNonPartitioned = keyGenClass.equals(NonpartitionedKeyGenerator.class.getName());
+   boolean isNonPartitionedKeyGen = keyGenClass.equals(NonpartitionedKeyGenerator.class.getName());
+   boolean isComplexKeyGen = keyGenClass.equals(ComplexKeyGenerator.class.getName());
    result.toJavaRDD().foreach(entry -> {
-     assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD)).equals(entry.getAs(recordKey).toString()));
-     assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).equals(isNonPartitioned ? "" : entry.getAs("partition")));
-     assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD)).equals(""));
-     assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.COMMIT_TIME_METADATA_FIELD)).equals(""));
-     assertTrue(entry.get(resultSchema.fieldIndex(HoodieRecord.FILENAME_METADATA_FIELD)).equals(""));
+     String recordKey = isComplexKeyGen ? String.format("%s:%s", recordKeyField, entry.getAs(recordKeyField)) : entry.getAs(recordKeyField).toString();
+     assertEquals(recordKey, entry.get(resultSchema.fieldIndex(HoodieRecord.RECORD_KEY_METADATA_FIELD)));
+     String partitionPath = isNonPartitionedKeyGen ? HoodieTableMetadata.EMPTY_PARTITION_NAME : entry.getAs("partition").toString();
+     assertEquals(partitionPath, entry.get(resultSchema.fieldIndex(HoodieRecord.PARTITION_PATH_METADATA_FIELD)));
+     assertEquals("", entry.get(resultSchema.fieldIndex(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD)));
+     assertEquals("", entry.get(resultSchema.fieldIndex(HoodieRecord.COMMIT_TIME_METADATA_FIELD)));
+     assertEquals("", entry.get(resultSchema.fieldIndex(HoodieRecord.FILENAME_METADATA_FIELD)));
    });
    Dataset<Row> trimmedOutput = result.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
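The updated assertions above encode the expected meta-field values per key generator: ComplexKeyGenerator prefixes the record key with the field name ("<field>:<value>"), NonpartitionedKeyGenerator maps the partition path to HoodieTableMetadata.EMPTY_PARTITION_NAME, and the commit time, sequence number, and file name meta fields stay empty until write time. A small Java sketch of the record-key expectation, lifted from the assertion logic (the helper class and method names are illustrative):

    import org.apache.hudi.keygen.ComplexKeyGenerator;

    final class ExpectedKeys {
      // Expected _hoodie_record_key as asserted above: ComplexKeyGenerator encodes the
      // field name into the key, the other generators use the bare value.
      static String expectedRecordKey(String keyGenClass, String recordKeyField, Object value) {
        boolean isComplex = keyGenClass.equals(ComplexKeyGenerator.class.getName());
        return isComplex ? String.format("%s:%s", recordKeyField, value) : String.valueOf(value);
      }
    }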
@@ -157,8 +162,13 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
  @Test
  public void testBulkInsertHelperNoMetaFields() {
    List<Row> rows = DataSourceTestUtils.generateRandomRows(10);
+   HoodieWriteConfig config = getConfigBuilder(schemaStr)
+       .withProps(getPropsAllSet("_row_key"))
+       .withPopulateMetaFields(false)
+       .build();
    Dataset<Row> dataset = sqlContext.createDataFrame(rows, structType);
-   Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsertWithoutMetaFields(dataset);
+   Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(dataset, config,
+       new NonSortPartitionerWithRows(), false);
    StructType resultSchema = result.schema();
    assertEquals(result.count(), 10);
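With the dedicated prepareHoodieDatasetForBulkInsertWithoutMetaFields entry point gone, the no-meta-fields behavior is selected purely through the write config, as the test above does via getConfigBuilder(...).withPopulateMetaFields(false). A minimal Java sketch of the same idea using the standard HoodieWriteConfig builder (newBuilder/withPath/withSchema are assumed from the public config API, not shown in this diff):

    import org.apache.hudi.config.HoodieWriteConfig;

    final class NoMetaFieldsConfigExample {
      // Sketch: with populateMetaFields disabled, the same prepareForBulkInsert(...) call
      // follows the no-meta-fields path instead of a separate helper method.
      static HoodieWriteConfig build(String basePath, String schemaStr) {
        return HoodieWriteConfig.newBuilder()
            .withPath(basePath)
            .withSchema(schemaStr)
            .withPopulateMetaFields(false)
            .build();
      }
    }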
@@ -194,8 +204,8 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
    rows.addAll(inserts);
    rows.addAll(updates);
    Dataset<Row> dataset = sqlContext.createDataFrame(rows, structType);
-   Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
-       "testNamespace", new NonSortPartitionerWithRows(), false, false);
+   Dataset<Row> result = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(dataset, config,
+       new NonSortPartitionerWithRows(), false);
    StructType resultSchema = result.schema();
    assertEquals(result.count(), enablePreCombine ? 10 : 15);
@@ -211,13 +221,15 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
    int metadataCommitSeqNoIndex = resultSchema.fieldIndex(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD);
    int metadataFilenameIndex = resultSchema.fieldIndex(HoodieRecord.FILENAME_METADATA_FIELD);
-   result.toJavaRDD().foreach(entry -> {
-     assertTrue(entry.get(metadataRecordKeyIndex).equals(entry.getAs("_row_key")));
-     assertTrue(entry.get(metadataPartitionPathIndex).equals(entry.getAs("partition")));
-     assertTrue(entry.get(metadataCommitSeqNoIndex).equals(""));
-     assertTrue(entry.get(metadataCommitTimeIndex).equals(""));
-     assertTrue(entry.get(metadataFilenameIndex).equals(""));
-   });
+   result.toJavaRDD()
+       .collect()
+       .forEach(entry -> {
+         assertTrue(entry.get(metadataRecordKeyIndex).equals(entry.getAs("_row_key")));
+         assertTrue(entry.get(metadataPartitionPathIndex).equals(entry.getAs("partition")));
+         assertTrue(entry.get(metadataCommitSeqNoIndex).equals(""));
+         assertTrue(entry.get(metadataCommitTimeIndex).equals(""));
+         assertTrue(entry.get(metadataFilenameIndex).equals(""));
+       });
    Dataset<Row> trimmedOutput = result.drop(HoodieRecord.PARTITION_PATH_METADATA_FIELD).drop(HoodieRecord.RECORD_KEY_METADATA_FIELD)
        .drop(HoodieRecord.FILENAME_METADATA_FIELD).drop(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD).drop(HoodieRecord.COMMIT_TIME_METADATA_FIELD);
@@ -226,7 +238,7 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
    ExpressionEncoder encoder = getEncoder(dataset.schema());
    if (enablePreCombine) {
      Dataset<Row> inputSnapshotDf = dataset.groupByKey(
-         (MapFunction<Row, String>) value -> value.getAs("partition") + "+" + value.getAs("_row_key"), Encoders.STRING())
+         (MapFunction<Row, String>) value -> value.getAs("partition") + ":" + value.getAs("_row_key"), Encoders.STRING())
          .reduceGroups((ReduceFunction<Row>) (v1, v2) -> {
            long ts1 = v1.getAs("ts");
            long ts2 = v2.getAs("ts");
@@ -238,9 +250,9 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
          })
          .map((MapFunction<Tuple2<String, Row>, Row>) value -> value._2, encoder);
-     assertTrue(inputSnapshotDf.except(trimmedOutput).count() == 0);
+     assertEquals(0, inputSnapshotDf.except(trimmedOutput).count());
    } else {
-     assertTrue(dataset.except(trimmedOutput).count() == 0);
+     assertEquals(0, dataset.except(trimmedOutput).count());
    }
  }
@@ -277,7 +289,7 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
    Map<String, String> props = new HashMap<>();
    props.put(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), ComplexKeyGenerator.class.getName());
    props.put(DataSourceWriteOptions.RECORDKEY_FIELD().key(), recordKey);
-   props.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "simple:partition");
+   props.put(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition");
    props.put(HoodieWriteConfig.TBL_NAME.key(), recordKey + "_table");
    return props;
  }
@@ -296,8 +308,9 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
    List<Row> rows = DataSourceTestUtils.generateRandomRows(10);
    Dataset<Row> dataset = sqlContext.createDataFrame(rows, structType);
    try {
-     HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
-         "testNamespace", new NonSortPartitionerWithRows(), false, false);
+     Dataset<Row> preparedDF = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(dataset, config,
+         new NonSortPartitionerWithRows(), false);
+     preparedDF.count();
      fail("Should have thrown exception");
    } catch (Exception e) {
      // ignore
@@ -307,8 +320,9 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
    rows = DataSourceTestUtils.generateRandomRows(10);
    dataset = sqlContext.createDataFrame(rows, structType);
    try {
-     HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
-         "testNamespace", new NonSortPartitionerWithRows(), false, false);
+     Dataset<Row> preparedDF = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(dataset, config,
+         new NonSortPartitionerWithRows(), false);
+     preparedDF.count();
      fail("Should have thrown exception");
    } catch (Exception e) {
      // ignore
@@ -318,8 +332,9 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
    rows = DataSourceTestUtils.generateRandomRows(10);
    dataset = sqlContext.createDataFrame(rows, structType);
    try {
-     HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
-         "testNamespace", new NonSortPartitionerWithRows(), false, false);
+     Dataset<Row> preparedDF = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(dataset, config,
+         new NonSortPartitionerWithRows(), false);
+     preparedDF.count();
      fail("Should have thrown exception");
    } catch (Exception e) {
      // ignore
@@ -329,8 +344,9 @@ public class TestHoodieDatasetBulkInsertHelper extends HoodieClientTestBase {
    rows = DataSourceTestUtils.generateRandomRows(10);
    dataset = sqlContext.createDataFrame(rows, structType);
    try {
-     HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(sqlContext, config, dataset, "testStructName",
-         "testNamespace", new NonSortPartitionerWithRows(), false, false);
+     Dataset<Row> preparedDF = HoodieDatasetBulkInsertHelper.prepareForBulkInsert(dataset, config,
+         new NonSortPartitionerWithRows(), false);
+     preparedDF.count();
      fail("Should have thrown exception");
    } catch (Exception e) {
      // ignore

View File

@@ -24,6 +24,7 @@ import org.apache.hadoop.fs.LocatedFileStatus;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.fs.RemoteIterator;
  import org.apache.hudi.common.table.HoodieTableMetaClient;
+ import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
  import org.apache.hudi.common.util.FileIOUtils;
  import org.apache.avro.Schema;
@@ -48,6 +49,8 @@ import static org.apache.hudi.common.testutils.HoodieTestDataGenerator.DEFAULT_T
   */
  public class DataSourceTestUtils {
+ private static final Random RANDOM = new Random(0xDAADDEED);
  public static Schema getStructTypeExampleSchema() throws IOException {
    return new Schema.Parser().parse(FileIOUtils.readAsUTFString(DataSourceTestUtils.class.getResourceAsStream("/exampleSchema.txt")));
  }
@@ -57,13 +60,12 @@
  }
  public static List<Row> generateRandomRows(int count) {
-   Random random = new Random();
    List<Row> toReturn = new ArrayList<>();
    List<String> partitions = Arrays.asList(new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH});
    for (int i = 0; i < count; i++) {
      Object[] values = new Object[3];
-     values[0] = UUID.randomUUID().toString();
-     values[1] = partitions.get(random.nextInt(3));
+     values[0] = HoodieTestDataGenerator.genPseudoRandomUUID(RANDOM).toString();
+     values[1] = partitions.get(RANDOM.nextInt(3));
      values[2] = new Date().getTime();
      toReturn.add(RowFactory.create(values));
    }
@@ -97,13 +99,12 @@
  }
  public static List<Row> generateRandomRowsEvolvedSchema(int count) {
-   Random random = new Random();
    List<Row> toReturn = new ArrayList<>();
    List<String> partitions = Arrays.asList(new String[] {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH});
    for (int i = 0; i < count; i++) {
      Object[] values = new Object[4];
      values[0] = UUID.randomUUID().toString();
-     values[1] = partitions.get(random.nextInt(3));
+     values[1] = partitions.get(RANDOM.nextInt(3));
      values[2] = new Date().getTime();
      values[3] = UUID.randomUUID().toString();
      toReturn.add(RowFactory.create(values));
@@ -112,14 +113,13 @@
  }
  public static List<Row> updateRowsWithHigherTs(Dataset<Row> inputDf) {
-   Random random = new Random();
    List<Row> input = inputDf.collectAsList();
    List<Row> rows = new ArrayList<>();
    for (Row row : input) {
      Object[] values = new Object[3];
      values[0] = row.getAs("_row_key");
      values[1] = row.getAs("partition");
-     values[2] = ((Long) row.getAs("ts")) + random.nextInt(1000);
+     values[2] = ((Long) row.getAs("ts")) + RANDOM.nextInt(1000);
      rows.add(RowFactory.create(values));
    }
    return rows;
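The change above replaces per-call new Random() and UUID.randomUUID() with a single seeded Random and HoodieTestDataGenerator.genPseudoRandomUUID(RANDOM), so the generated test rows are reproducible across runs. A minimal Java sketch of what a seeded, reproducible UUID generator can look like (illustrative only; the actual HoodieTestDataGenerator implementation is not shown in this diff and may differ):

    import java.util.Random;
    import java.util.UUID;

    final class PseudoRandomUuidExample {
      // Derive a UUID from a seeded Random so repeated runs yield the same key sequence.
      static UUID genPseudoRandomUUID(Random random) {
        byte[] bytes = new byte[16];
        random.nextBytes(bytes);
        // Set version (4) and variant bits so the result is a well-formed random UUID.
        bytes[6] = (byte) ((bytes[6] & 0x0f) | 0x40);
        bytes[8] = (byte) ((bytes[8] & 0x3f) | 0x80);
        long msb = 0;
        long lsb = 0;
        for (int i = 0; i < 8; i++) {
          msb = (msb << 8) | (bytes[i] & 0xff);
        }
        for (int i = 8; i < 16; i++) {
          lsb = (lsb << 8) | (bytes[i] & 0xff);
        }
        return new UUID(msb, lsb);
      }
    }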

View File

@@ -256,6 +256,8 @@ class TestDataSourceDefaults {
    getKey(genericRecord).getRecordKey
  }
+ override def getRecordKey(row: InternalRow, schema: StructType): String = null
  override def getPartitionPath(row: Row): String = {
    if (null == converterFn) converterFn = AvroConversionUtils.createConverterToAvro(row.schema, STRUCT_NAME, NAMESPACE)
    val genericRecord = converterFn.apply(row).asInstanceOf[GenericRecord]

View File

@@ -20,6 +20,7 @@ package org.apache.hudi
  import org.apache.hudi.config.HoodieWriteConfig
  import org.apache.hudi.testutils.HoodieClientTestBase
  import org.apache.spark.sql.{DataFrame, SparkSession}
+ import org.junit.jupiter.api.Assertions.{assertArrayEquals, assertEquals}
  import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
  import java.sql.{Date, Timestamp}
@@ -113,6 +114,6 @@ class TestGenericRecordAndRowConsistency extends HoodieClientTestBase {
      .select("_hoodie_record_key")
      .map(_.toString()).collect().sorted
-   assert(data1 sameElements data2)
+   assertEquals(data1.toSeq, data2.toSeq)
  }
} }

View File

@@ -228,24 +228,12 @@
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-spark-client</artifactId>
      <version>${project.version}</version>
-     <exclusions>
-       <exclusion>
-         <groupId>org.apache.spark</groupId>
-         <artifactId>*</artifactId>
-       </exclusion>
-     </exclusions>
    </dependency>
    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
-     <exclusions>
-       <exclusion>
-         <groupId>org.apache.spark</groupId>
-         <artifactId>*</artifactId>
-       </exclusion>
-     </exclusions>
    </dependency>
    <dependency>
@@ -264,14 +252,10 @@
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-spark3-common</artifactId>
      <version>${project.version}</version>
-     <exclusions>
-       <exclusion>
-         <groupId>org.apache.spark</groupId>
-         <artifactId>*</artifactId>
-       </exclusion>
-     </exclusions>
    </dependency>
    <!-- Hoodie - Test -->
    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-client-common</artifactId>
@@ -288,12 +272,6 @@
      <classifier>tests</classifier>
      <type>test-jar</type>
      <scope>test</scope>
-     <exclusions>
-       <exclusion>
-         <groupId>org.apache.spark</groupId>
-         <artifactId>*</artifactId>
-       </exclusion>
-     </exclusions>
    </dependency>
    <dependency>
@@ -312,18 +290,13 @@
      <classifier>tests</classifier>
      <type>test-jar</type>
      <scope>test</scope>
-     <exclusions>
-       <exclusion>
-         <groupId>org.apache.spark</groupId>
-         <artifactId>*</artifactId>
-       </exclusion>
-     </exclusions>
    </dependency>
    <dependency>
      <groupId>org.junit.jupiter</groupId>
      <artifactId>junit-jupiter-api</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
@@ -331,6 +304,29 @@
      <artifactId>junit-jupiter-params</artifactId>
      <scope>test</scope>
    </dependency>
+   <dependency>
+     <groupId>org.apache.hadoop</groupId>
+     <artifactId>hadoop-hdfs</artifactId>
+     <classifier>tests</classifier>
+     <scope>test</scope>
+     <!-- Need these exclusions to make sure JavaSparkContext can be setup. https://issues.apache.org/jira/browse/SPARK-1693 -->
+     <exclusions>
+       <exclusion>
+         <groupId>org.mortbay.jetty</groupId>
+         <artifactId>*</artifactId>
+       </exclusion>
+       <exclusion>
+         <groupId>javax.servlet.jsp</groupId>
+         <artifactId>*</artifactId>
+       </exclusion>
+       <exclusion>
+         <groupId>javax.servlet</groupId>
+         <artifactId>*</artifactId>
+       </exclusion>
+     </exclusions>
+   </dependency>
  </dependencies>
</project>