1
0

[HUDI-1771] Propagate CDC format for hoodie (#3285)

This commit is contained in:
swuferhong
2021-08-10 20:23:23 +08:00
committed by GitHub
parent b4441abcf7
commit 21db6d7a84
50 changed files with 1081 additions and 199 deletions

View File

@@ -18,7 +18,7 @@
package org.apache.hudi.client.model;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieOperation;
import org.apache.flink.table.data.ArrayData;
import org.apache.flink.table.data.DecimalData;
@@ -35,12 +35,7 @@ import org.apache.flink.types.RowKind;
* copy rather than fetching from {@link RowData}.
*/
public class HoodieRowData implements RowData {
private final String commitTime;
private final String commitSeqNumber;
private final String recordKey;
private final String partitionPath;
private final String fileName;
private final String[] metaColumns;
private final RowData row;
private final int metaColumnsNum;
@@ -49,14 +44,19 @@ public class HoodieRowData implements RowData {
String recordKey,
String partitionPath,
String fileName,
RowData row) {
this.commitTime = commitTime;
this.commitSeqNumber = commitSeqNumber;
this.recordKey = recordKey;
this.partitionPath = partitionPath;
this.fileName = fileName;
RowData row,
boolean withOperation) {
this.metaColumnsNum = withOperation ? 6 : 5;
this.metaColumns = new String[metaColumnsNum];
metaColumns[0] = commitTime;
metaColumns[1] = commitSeqNumber;
metaColumns[2] = recordKey;
metaColumns[3] = partitionPath;
metaColumns[4] = fileName;
if (withOperation) {
metaColumns[5] = HoodieOperation.fromValue(row.getRowKind().toByteValue()).getName();
}
this.row = row;
this.metaColumnsNum = HoodieRecord.HOODIE_META_COLUMNS.size();
}
@Override
@@ -74,28 +74,6 @@ public class HoodieRowData implements RowData {
this.row.setRowKind(kind);
}
/**
 * Resolves the value of the meta column at the given position.
 *
 * @param ordinal position of the meta column; positions 0 through 4 are handled
 * @return the commit time, commit sequence number, record key, partition path
 *         or file name for positions 0, 1, 2, 3 and 4 respectively
 * @throws IllegalArgumentException if {@code ordinal} is not one of the handled positions
 */
private String getMetaColumnVal(int ordinal) {
  if (ordinal == 0) {
    return commitTime;
  }
  if (ordinal == 1) {
    return commitSeqNumber;
  }
  if (ordinal == 2) {
    return recordKey;
  }
  if (ordinal == 3) {
    return partitionPath;
  }
  if (ordinal == 4) {
    return fileName;
  }
  // Any other position is not a known meta column.
  throw new IllegalArgumentException("Not expected");
}
@Override
public boolean isNullAt(int ordinal) {
if (ordinal < metaColumnsNum) {
@@ -181,4 +159,8 @@ public class HoodieRowData implements RowData {
public MapData getMap(int ordinal) {
return row.getMap(ordinal - metaColumnsNum);
}
/**
 * Returns the meta column value stored at the given position.
 *
 * @param ordinal index into the meta columns array
 * @return the string held at that index
 */
private String getMetaColumnVal(int ordinal) {
  final String metaColumnVal = metaColumns[ordinal];
  return metaColumnVal;
}
}

View File

@@ -79,6 +79,9 @@ public class FlinkAppendHandle<T extends HoodieRecordPayload, I, K, O>
@Override
protected boolean isUpdateRecord(HoodieRecord<T> hoodieRecord) {
  // The operation carried by the HoodieRecord is deliberately not consulted here:
  // the hoodie writer has its own INSERT/MERGE bucket for 'UPSERT' semantics.
  // For example, a record with a fresh new key and a DELETE operation would be put
  // into either an INSERT bucket or an UPDATE bucket.
  if (hoodieRecord.getCurrentLocation() == null) {
    // No current location assigned, so the record is not treated as an update.
    return false;
  }
  // The record counts as an update only when its location's instant time is "U".
  return hoodieRecord.getCurrentLocation().getInstantTime().equals("U");
}

View File

@@ -117,7 +117,7 @@ public class HoodieRowDataCreateHandle implements Serializable {
try {
String seqId = HoodieRecord.generateSequenceId(instantTime, taskPartitionId, SEQGEN.getAndIncrement());
HoodieRowData rowData = new HoodieRowData(instantTime, seqId, recordKey, partitionPath, path.getName(),
record);
record, writeConfig.allowOperationMetadataField());
try {
fileWriter.writeRow(recordKey, rowData);
writeStatus.markSuccess(recordKey);
@@ -131,7 +131,7 @@ public class HoodieRowDataCreateHandle implements Serializable {
}
/**
* @returns {@code true} if this handle can take in more writes. else {@code false}.
* Returns {@code true} if this handle can take in more writes, {@code false} otherwise.
*/
public boolean canWrite() {
return fileWriter.canWrite();

View File

@@ -354,7 +354,7 @@ public class HoodieFlinkCopyOnWriteTable<T extends HoodieRecordPayload> extends
dataFileToBeMerged, taskContextSupplier, Option.empty());
} else {
return new HoodieMergeHandle<>(config, instantTime, this, keyToNewRecords, partitionPath, fileId,
dataFileToBeMerged,taskContextSupplier, Option.empty());
dataFileToBeMerged, taskContextSupplier, Option.empty());
}
}

View File

@@ -21,6 +21,7 @@ package org.apache.hudi.table.action.commit;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieOperation;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.util.collection.Pair;
@@ -45,7 +46,7 @@ import java.util.stream.Collectors;
* <p>Computing the locations of all the records in a batch at once puts pressure on the engine,
* so we should avoid that in a streaming system.
*/
public class FlinkWriteHelper<T extends HoodieRecordPayload,R> extends AbstractWriteHelper<T, List<HoodieRecord<T>>,
public class FlinkWriteHelper<T extends HoodieRecordPayload, R> extends AbstractWriteHelper<T, List<HoodieRecord<T>>,
List<HoodieKey>, List<WriteStatus>, R> {
private FlinkWriteHelper() {
@@ -80,8 +81,8 @@ public class FlinkWriteHelper<T extends HoodieRecordPayload,R> extends AbstractW
@Override
public List<HoodieRecord<T>> deduplicateRecords(List<HoodieRecord<T>> records,
HoodieIndex<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> index,
int parallelism) {
HoodieIndex<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> index,
int parallelism) {
Map<Object, List<Pair<Object, HoodieRecord<T>>>> keyedRecords = records.stream().map(record -> {
// If index used is global, then records are expected to differ in their partitionPath
final Object key = record.getKey().getRecordKey();
@@ -89,13 +90,17 @@ public class FlinkWriteHelper<T extends HoodieRecordPayload,R> extends AbstractW
}).collect(Collectors.groupingBy(Pair::getLeft));
return keyedRecords.values().stream().map(x -> x.stream().map(Pair::getRight).reduce((rec1, rec2) -> {
@SuppressWarnings("unchecked")
T reducedData = (T) rec1.getData().preCombine(rec2.getData());
final T data1 = rec1.getData();
final T data2 = rec2.getData();
@SuppressWarnings("unchecked") final T reducedData = (T) data2.preCombine(data1);
// We cannot allow the user to change the key or partitionPath, since that would affect
// everything, so pick the key from one of the two records being merged.
HoodieKey reducedKey = rec1.getData().equals(reducedData) ? rec1.getKey() : rec2.getKey();
HoodieRecord<T> hoodieRecord = new HoodieRecord<>(reducedKey, reducedData);
boolean choosePrev = data1.equals(reducedData);
HoodieKey reducedKey = choosePrev ? rec1.getKey() : rec2.getKey();
HoodieOperation operation = choosePrev ? rec1.getOperation() : rec2.getOperation();
HoodieRecord<T> hoodieRecord = new HoodieRecord<>(reducedKey, reducedData, operation);
// reuse the location from the first record.
hoodieRecord.setCurrentLocation(rec1.getCurrentLocation());
return hoodieRecord;

View File

@@ -86,7 +86,7 @@ public class HoodieFlinkMergeOnReadTableCompactor<T extends HoodieRecordPayload>
@Override
public List<WriteStatus> compact(HoodieEngineContext context, HoodieCompactionPlan compactionPlan,
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
HoodieTable hoodieTable, HoodieWriteConfig config, String compactionInstantTime) throws IOException {
throw new UnsupportedOperationException("HoodieFlinkMergeOnReadTableCompactor does not support compact directly, "
+ "the function works as a separate pipeline");
}
@@ -98,7 +98,7 @@ public class HoodieFlinkMergeOnReadTableCompactor<T extends HoodieRecordPayload>
String instantTime) throws IOException {
FileSystem fs = metaClient.getFs();
Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField());
LOG.info("Compacting base " + operation.getDataFileName() + " with delta files " + operation.getDeltaFileNames()
+ " for commit " + instantTime);
// TODO - FIX THIS