[HUDI-1771] Propagate CDC format for hoodie (#3285)
This commit is contained in:
@@ -18,7 +18,7 @@
|
||||
|
||||
package org.apache.hudi.client.model;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieOperation;
|
||||
|
||||
import org.apache.flink.table.data.ArrayData;
|
||||
import org.apache.flink.table.data.DecimalData;
|
||||
@@ -35,12 +35,7 @@ import org.apache.flink.types.RowKind;
|
||||
* copy rather than fetching from {@link RowData}.
|
||||
*/
|
||||
public class HoodieRowData implements RowData {
|
||||
|
||||
private final String commitTime;
|
||||
private final String commitSeqNumber;
|
||||
private final String recordKey;
|
||||
private final String partitionPath;
|
||||
private final String fileName;
|
||||
private final String[] metaColumns;
|
||||
private final RowData row;
|
||||
private final int metaColumnsNum;
|
||||
|
||||
@@ -49,14 +44,19 @@ public class HoodieRowData implements RowData {
|
||||
String recordKey,
|
||||
String partitionPath,
|
||||
String fileName,
|
||||
RowData row) {
|
||||
this.commitTime = commitTime;
|
||||
this.commitSeqNumber = commitSeqNumber;
|
||||
this.recordKey = recordKey;
|
||||
this.partitionPath = partitionPath;
|
||||
this.fileName = fileName;
|
||||
RowData row,
|
||||
boolean withOperation) {
|
||||
this.metaColumnsNum = withOperation ? 6 : 5;
|
||||
this.metaColumns = new String[metaColumnsNum];
|
||||
metaColumns[0] = commitTime;
|
||||
metaColumns[1] = commitSeqNumber;
|
||||
metaColumns[2] = recordKey;
|
||||
metaColumns[3] = partitionPath;
|
||||
metaColumns[4] = fileName;
|
||||
if (withOperation) {
|
||||
metaColumns[5] = HoodieOperation.fromValue(row.getRowKind().toByteValue()).getName();
|
||||
}
|
||||
this.row = row;
|
||||
this.metaColumnsNum = HoodieRecord.HOODIE_META_COLUMNS.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -74,28 +74,6 @@ public class HoodieRowData implements RowData {
|
||||
this.row.setRowKind(kind);
|
||||
}
|
||||
|
||||
/**
 * Returns the meta column value for the given ordinal by dispatching to the dedicated field:
 * 0 = commitTime, 1 = commitSeqNumber, 2 = recordKey, 3 = partitionPath, 4 = fileName.
 *
 * @param ordinal position of the meta column, expected to be in the range 0..4
 * @return the string value held for that meta column
 * @throws IllegalArgumentException if the ordinal is not one of 0..4
 */
private String getMetaColumnVal(int ordinal) {
|
||||
switch (ordinal) {
|
||||
case 0: {
|
||||
return commitTime;
|
||||
}
|
||||
case 1: {
|
||||
return commitSeqNumber;
|
||||
}
|
||||
case 2: {
|
||||
return recordKey;
|
||||
}
|
||||
case 3: {
|
||||
return partitionPath;
|
||||
}
|
||||
case 4: {
|
||||
return fileName;
|
||||
}
|
||||
// any ordinal outside the five known meta columns is rejected
default:
|
||||
throw new IllegalArgumentException("Not expected");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNullAt(int ordinal) {
|
||||
if (ordinal < metaColumnsNum) {
|
||||
@@ -181,4 +159,8 @@ public class HoodieRowData implements RowData {
|
||||
public MapData getMap(int ordinal) {
|
||||
return row.getMap(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
/**
 * Returns the meta column value stored at the given position of the {@code metaColumns} array.
 *
 * <p>No range check is done here; an ordinal outside the array surfaces as the
 * {@link ArrayIndexOutOfBoundsException} raised by the array access itself.
 *
 * @param ordinal index into {@code metaColumns}
 * @return the string held at that index
 */
private String getMetaColumnVal(int ordinal) {
|
||||
return this.metaColumns[ordinal];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,6 +79,9 @@ public class FlinkAppendHandle<T extends HoodieRecordPayload, I, K, O>
|
||||
|
||||
/**
 * Tells whether the given record is treated as an update.
 *
 * <p>The decision is based only on the record's current location: it must be non-null and
 * its instant time must equal {@code "U"}.
 *
 * @param hoodieRecord the record to classify
 * @return {@code true} if the record has a current location whose instant time is {@code "U"}
 */
@Override
|
||||
protected boolean isUpdateRecord(HoodieRecord<T> hoodieRecord) {
|
||||
// Do not use the HoodieRecord operation here, because the hoodie writer has its own
|
||||
// INSERT/MERGE bucket for 'UPSERT' semantics. For example, a hoodie record with a fresh new key
|
||||
// and operation HoodieOperation.DELETE would be put into either an INSERT bucket or an UPDATE bucket.
|
||||
// NOTE(review): "U" is presumably the marker instant time assigned to UPDATE buckets upstream — confirm.
return hoodieRecord.getCurrentLocation() != null
|
||||
&& hoodieRecord.getCurrentLocation().getInstantTime().equals("U");
|
||||
}
|
||||
|
||||
@@ -117,7 +117,7 @@ public class HoodieRowDataCreateHandle implements Serializable {
|
||||
try {
|
||||
String seqId = HoodieRecord.generateSequenceId(instantTime, taskPartitionId, SEQGEN.getAndIncrement());
|
||||
HoodieRowData rowData = new HoodieRowData(instantTime, seqId, recordKey, partitionPath, path.getName(),
|
||||
record);
|
||||
record, writeConfig.allowOperationMetadataField());
|
||||
try {
|
||||
fileWriter.writeRow(recordKey, rowData);
|
||||
writeStatus.markSuccess(recordKey);
|
||||
@@ -131,7 +131,7 @@ public class HoodieRowDataCreateHandle implements Serializable {
|
||||
}
|
||||
|
||||
/**
|
||||
* @returns {@code true} if this handle can take in more writes. else {@code false}.
|
||||
* Returns {@code true} if this handle can take in more writes, else {@code false}.
|
||||
*/
|
||||
public boolean canWrite() {
|
||||
return fileWriter.canWrite();
|
||||
|
||||
@@ -354,7 +354,7 @@ public class HoodieFlinkCopyOnWriteTable<T extends HoodieRecordPayload> extends
|
||||
dataFileToBeMerged, taskContextSupplier, Option.empty());
|
||||
} else {
|
||||
return new HoodieMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
|
||||
dataFileToBeMerged,taskContextSupplier, Option.empty());
|
||||
dataFileToBeMerged, taskContextSupplier, Option.empty());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ package org.apache.hudi.table.action.commit;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieOperation;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
@@ -45,7 +46,7 @@ import java.util.stream.Collectors;
|
||||
* <p>Computing the records batch locations all at a time is a pressure to the engine,
|
||||
* we should avoid that in streaming system.
|
||||
*/
|
||||
public class FlinkWriteHelper<T extends HoodieRecordPayload,R> extends AbstractWriteHelper<T, List<HoodieRecord<T>>,
|
||||
public class FlinkWriteHelper<T extends HoodieRecordPayload, R> extends AbstractWriteHelper<T, List<HoodieRecord<T>>,
|
||||
List<HoodieKey>, List<WriteStatus>, R> {
|
||||
|
||||
private FlinkWriteHelper() {
|
||||
@@ -80,8 +81,8 @@ public class FlinkWriteHelper<T extends HoodieRecordPayload,R> extends AbstractW
|
||||
|
||||
@Override
|
||||
public List<HoodieRecord<T>> deduplicateRecords(List<HoodieRecord<T>> records,
|
||||
HoodieIndex<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> index,
|
||||
int parallelism) {
|
||||
HoodieIndex<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> index,
|
||||
int parallelism) {
|
||||
Map<Object, List<Pair<Object, HoodieRecord<T>>>> keyedRecords = records.stream().map(record -> {
|
||||
// If index used is global, then records are expected to differ in their partitionPath
|
||||
final Object key = record.getKey().getRecordKey();
|
||||
@@ -89,13 +90,17 @@ public class FlinkWriteHelper<T extends HoodieRecordPayload,R> extends AbstractW
|
||||
}).collect(Collectors.groupingBy(Pair::getLeft));
|
||||
|
||||
return keyedRecords.values().stream().map(x -> x.stream().map(Pair::getRight).reduce((rec1, rec2) -> {
|
||||
@SuppressWarnings("unchecked")
|
||||
T reducedData = (T) rec1.getData().preCombine(rec2.getData());
|
||||
final T data1 = rec1.getData();
|
||||
final T data2 = rec2.getData();
|
||||
|
||||
@SuppressWarnings("unchecked") final T reducedData = (T) data2.preCombine(data1);
|
||||
// we cannot allow the user to change the key or partitionPath, since that will affect
|
||||
// everything
|
||||
// so pick it from one of the records.
|
||||
HoodieKey reducedKey = rec1.getData().equals(reducedData) ? rec1.getKey() : rec2.getKey();
|
||||
HoodieRecord<T> hoodieRecord = new HoodieRecord<>(reducedKey, reducedData);
|
||||
boolean choosePrev = data1.equals(reducedData);
|
||||
HoodieKey reducedKey = choosePrev ? rec1.getKey() : rec2.getKey();
|
||||
HoodieOperation operation = choosePrev ? rec1.getOperation() : rec2.getOperation();
|
||||
HoodieRecord<T> hoodieRecord = new HoodieRecord<>(reducedKey, reducedData, operation);
|
||||
// reuse the location from the first record.
|
||||
hoodieRecord.setCurrentLocation(rec1.getCurrentLocation());
|
||||
return hoodieRecord;
|
||||
|
||||
@@ -86,7 +86,7 @@ public class HoodieFlinkMergeOnReadTableCompactor<T extends HoodieRecordPayload>
|
||||
|
||||
@Override
|
||||
public List<WriteStatus> compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan,
|
||||
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
|
||||
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
|
||||
throw new UnsupportedOperationException("HoodieFlinkMergeOnReadTableCompactor does not support compact directly, "
|
||||
+ "the function works as a separate pipeline");
|
||||
}
|
||||
@@ -98,7 +98,7 @@ public class HoodieFlinkMergeOnReadTableCompactor<T extends HoodieRecordPayload>
|
||||
String instantTime) throws IOException {
|
||||
FileSystem fs = metaClient.getFs();
|
||||
|
||||
Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
|
||||
Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField());
|
||||
LOG.info("Compacting base " + operation.getDataFileName() + " with delta files " + operation.getDeltaFileNames()
|
||||
+ " for commit " + instantTime);
|
||||
// TODO - FIX THIS
|
||||
|
||||
Reference in New Issue
Block a user