[HUDI-2087] Support Append only in Flink stream (#3390)
Co-authored-by: 喻兆靖 <yuzhaojing@bilibili.com>
This commit is contained in:
@@ -437,6 +437,12 @@ public class HoodieFlinkWriteClient<T extends HoodieRecordPayload> extends
|
|||||||
final HoodieRecordLocation loc = record.getCurrentLocation();
|
final HoodieRecordLocation loc = record.getCurrentLocation();
|
||||||
final String fileID = loc.getFileId();
|
final String fileID = loc.getFileId();
|
||||||
final String partitionPath = record.getPartitionPath();
|
final String partitionPath = record.getPartitionPath();
|
||||||
|
// Always use FlinkCreateHandle when insert duplication turns on
|
||||||
|
if (config.allowDuplicateInserts()) {
|
||||||
|
return new FlinkCreateHandle<>(config, instantTime, table, partitionPath,
|
||||||
|
fileID, table.getTaskContextSupplier());
|
||||||
|
}
|
||||||
|
|
||||||
if (bucketToHandles.containsKey(fileID)) {
|
if (bucketToHandles.containsKey(fileID)) {
|
||||||
MiniBatchHandle lastHandle = (MiniBatchHandle) bucketToHandles.get(fileID);
|
MiniBatchHandle lastHandle = (MiniBatchHandle) bucketToHandles.get(fileID);
|
||||||
if (lastHandle.shouldReplace()) {
|
if (lastHandle.shouldReplace()) {
|
||||||
|
|||||||
@@ -209,6 +209,12 @@ public class FlinkOptions extends HoodieConfig {
|
|||||||
.defaultValue(TABLE_TYPE_COPY_ON_WRITE)
|
.defaultValue(TABLE_TYPE_COPY_ON_WRITE)
|
||||||
.withDescription("Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ");
|
.withDescription("Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ");
|
||||||
|
|
||||||
|
public static final ConfigOption<Boolean> INSERT_ALLOW_DUP = ConfigOptions
|
||||||
|
.key("write.insert.allow_dup")
|
||||||
|
.booleanType()
|
||||||
|
.defaultValue(true)
|
||||||
|
.withDescription("Whether to allow data duplication for INSERT operation, if enabled, writes the base files directly, default true");
|
||||||
|
|
||||||
public static final ConfigOption<String> OPERATION = ConfigOptions
|
public static final ConfigOption<String> OPERATION = ConfigOptions
|
||||||
.key("write.operation")
|
.key("write.operation")
|
||||||
.stringType()
|
.stringType()
|
||||||
|
|||||||
@@ -91,8 +91,6 @@ public class BucketAssignFunction<K, I, O extends HoodieRecord<?>>
|
|||||||
|
|
||||||
private final Configuration conf;
|
private final Configuration conf;
|
||||||
|
|
||||||
private transient org.apache.hadoop.conf.Configuration hadoopConf;
|
|
||||||
|
|
||||||
private final boolean isChangingRecords;
|
private final boolean isChangingRecords;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -117,21 +115,25 @@ public class BucketAssignFunction<K, I, O extends HoodieRecord<?>>
|
|||||||
public void open(Configuration parameters) throws Exception {
|
public void open(Configuration parameters) throws Exception {
|
||||||
super.open(parameters);
|
super.open(parameters);
|
||||||
HoodieWriteConfig writeConfig = StreamerUtil.getHoodieClientConfig(this.conf);
|
HoodieWriteConfig writeConfig = StreamerUtil.getHoodieClientConfig(this.conf);
|
||||||
this.hadoopConf = StreamerUtil.getHadoopConf();
|
|
||||||
HoodieFlinkEngineContext context = new HoodieFlinkEngineContext(
|
HoodieFlinkEngineContext context = new HoodieFlinkEngineContext(
|
||||||
new SerializableConfiguration(this.hadoopConf),
|
new SerializableConfiguration(StreamerUtil.getHadoopConf()),
|
||||||
new FlinkTaskContextSupplier(getRuntimeContext()));
|
new FlinkTaskContextSupplier(getRuntimeContext()));
|
||||||
this.bucketAssigner = BucketAssigners.create(
|
this.bucketAssigner = BucketAssigners.create(
|
||||||
getRuntimeContext().getIndexOfThisSubtask(),
|
getRuntimeContext().getIndexOfThisSubtask(),
|
||||||
getRuntimeContext().getMaxNumberOfParallelSubtasks(),
|
getRuntimeContext().getMaxNumberOfParallelSubtasks(),
|
||||||
getRuntimeContext().getNumberOfParallelSubtasks(),
|
getRuntimeContext().getNumberOfParallelSubtasks(),
|
||||||
WriteOperationType.isOverwrite(WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION))),
|
ignoreSmallFiles(writeConfig),
|
||||||
HoodieTableType.valueOf(conf.getString(FlinkOptions.TABLE_TYPE)),
|
HoodieTableType.valueOf(conf.getString(FlinkOptions.TABLE_TYPE)),
|
||||||
context,
|
context,
|
||||||
writeConfig);
|
writeConfig);
|
||||||
this.payloadCreation = PayloadCreation.instance(this.conf);
|
this.payloadCreation = PayloadCreation.instance(this.conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean ignoreSmallFiles(HoodieWriteConfig writeConfig) {
|
||||||
|
WriteOperationType operationType = WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION));
|
||||||
|
return WriteOperationType.isOverwrite(operationType) || writeConfig.allowDuplicateInserts();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void snapshotState(FunctionSnapshotContext context) {
|
public void snapshotState(FunctionSnapshotContext context) {
|
||||||
this.bucketAssigner.reset();
|
this.bucketAssigner.reset();
|
||||||
|
|||||||
@@ -35,25 +35,25 @@ public abstract class BucketAssigners {
|
|||||||
/**
|
/**
|
||||||
* Creates a {@code BucketAssigner}.
|
* Creates a {@code BucketAssigner}.
|
||||||
*
|
*
|
||||||
* @param taskID The task ID
|
* @param taskID The task ID
|
||||||
* @param maxParallelism The max parallelism
|
* @param maxParallelism The max parallelism
|
||||||
* @param numTasks The number of tasks
|
* @param numTasks The number of tasks
|
||||||
* @param overwrite Whether the write operation is OVERWRITE
|
* @param ignoreSmallFiles Whether to ignore the small files
|
||||||
* @param tableType The table type
|
* @param tableType The table type
|
||||||
* @param context The engine context
|
* @param context The engine context
|
||||||
* @param config The configuration
|
* @param config The configuration
|
||||||
* @return the bucket assigner instance
|
* @return the bucket assigner instance
|
||||||
*/
|
*/
|
||||||
public static BucketAssigner create(
|
public static BucketAssigner create(
|
||||||
int taskID,
|
int taskID,
|
||||||
int maxParallelism,
|
int maxParallelism,
|
||||||
int numTasks,
|
int numTasks,
|
||||||
boolean overwrite,
|
boolean ignoreSmallFiles,
|
||||||
HoodieTableType tableType,
|
HoodieTableType tableType,
|
||||||
HoodieFlinkEngineContext context,
|
HoodieFlinkEngineContext context,
|
||||||
HoodieWriteConfig config) {
|
HoodieWriteConfig config) {
|
||||||
boolean delta = tableType.equals(HoodieTableType.MERGE_ON_READ);
|
boolean delta = tableType.equals(HoodieTableType.MERGE_ON_READ);
|
||||||
WriteProfile writeProfile = WriteProfiles.singleton(overwrite, delta, config, context);
|
WriteProfile writeProfile = WriteProfiles.singleton(ignoreSmallFiles, delta, config, context);
|
||||||
return new BucketAssigner(taskID, maxParallelism, numTasks, writeProfile, config);
|
return new BucketAssigner(taskID, maxParallelism, numTasks, writeProfile, config);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -26,13 +26,18 @@ import java.util.Collections;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* WriteProfile for INSERT OVERWRITE and INSERT OVERWRITE TABLE operations,
|
* WriteProfile that always returns empty small files.
|
||||||
* this WriteProfile always skip the existing small files because of the 'OVERWRITE' semantics.
|
*
|
||||||
|
* <p>This write profile is used for cases:
|
||||||
|
* i). INSERT OVERWRITE and INSERT OVERWRITE TABLE operations,
|
||||||
|
* the existing small files are ignored because of the 'OVERWRITE' semantics;
|
||||||
|
* ii). INSERT operation when data file merge is disabled.
|
||||||
|
*
|
||||||
*
|
*
|
||||||
* <p>Note: assumes the index can always index log files for Flink write.
|
* <p>Note: assumes the index can always index log files for Flink write.
|
||||||
*/
|
*/
|
||||||
public class OverwriteWriteProfile extends WriteProfile {
|
public class EmptyWriteProfile extends WriteProfile {
|
||||||
public OverwriteWriteProfile(HoodieWriteConfig config, HoodieFlinkEngineContext context) {
|
public EmptyWriteProfile(HoodieWriteConfig config, HoodieFlinkEngineContext context) {
|
||||||
super(config, context);
|
super(config, context);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -53,22 +53,22 @@ public class WriteProfiles {
|
|||||||
|
|
||||||
private WriteProfiles() {}
|
private WriteProfiles() {}
|
||||||
|
|
||||||
public static synchronized WriteProfile singleton(
|
public static synchronized WriteProfile singleton(
|
||||||
boolean overwrite,
|
boolean ignoreSmallFiles,
|
||||||
boolean delta,
|
boolean delta,
|
||||||
HoodieWriteConfig config,
|
HoodieWriteConfig config,
|
||||||
HoodieFlinkEngineContext context) {
|
HoodieFlinkEngineContext context) {
|
||||||
return PROFILES.computeIfAbsent(config.getBasePath(),
|
return PROFILES.computeIfAbsent(config.getBasePath(),
|
||||||
k -> getWriteProfile(overwrite, delta, config, context));
|
k -> getWriteProfile(ignoreSmallFiles, delta, config, context));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static WriteProfile getWriteProfile(
|
private static WriteProfile getWriteProfile(
|
||||||
boolean overwrite,
|
boolean ignoreSmallFiles,
|
||||||
boolean delta,
|
boolean delta,
|
||||||
HoodieWriteConfig config,
|
HoodieWriteConfig config,
|
||||||
HoodieFlinkEngineContext context) {
|
HoodieFlinkEngineContext context) {
|
||||||
if (overwrite) {
|
if (ignoreSmallFiles) {
|
||||||
return new OverwriteWriteProfile(config, context);
|
return new EmptyWriteProfile(config, context);
|
||||||
} else if (delta) {
|
} else if (delta) {
|
||||||
return new DeltaWriteProfile(config, context);
|
return new DeltaWriteProfile(config, context);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -69,6 +69,9 @@ public class FlinkStreamerConfig extends Configuration {
|
|||||||
@Parameter(names = {"--table-type"}, description = "Type of table. COPY_ON_WRITE (or) MERGE_ON_READ.", required = true)
|
@Parameter(names = {"--table-type"}, description = "Type of table. COPY_ON_WRITE (or) MERGE_ON_READ.", required = true)
|
||||||
public String tableType;
|
public String tableType;
|
||||||
|
|
||||||
|
@Parameter(names = {"--insert-allow-dup"}, description = "Whether to allow data duplication for INSERT operation, if enabled, writes the base files directly.", required = true)
|
||||||
|
public Boolean insertAllowDup = true;
|
||||||
|
|
||||||
@Parameter(names = {"--props"}, description = "Path to properties file on localfs or dfs, with configurations for "
|
@Parameter(names = {"--props"}, description = "Path to properties file on localfs or dfs, with configurations for "
|
||||||
+ "hoodie client, schema provider, key generator and data source. For hoodie client props, sane defaults are "
|
+ "hoodie client, schema provider, key generator and data source. For hoodie client props, sane defaults are "
|
||||||
+ "used, but recommend use to provide basic things like metrics endpoints, hive configs etc. For sources, refer"
|
+ "used, but recommend use to provide basic things like metrics endpoints, hive configs etc. For sources, refer"
|
||||||
@@ -305,6 +308,7 @@ public class FlinkStreamerConfig extends Configuration {
|
|||||||
conf.setString(FlinkOptions.TABLE_NAME, config.targetTableName);
|
conf.setString(FlinkOptions.TABLE_NAME, config.targetTableName);
|
||||||
// copy_on_write works same as COPY_ON_WRITE
|
// copy_on_write works same as COPY_ON_WRITE
|
||||||
conf.setString(FlinkOptions.TABLE_TYPE, config.tableType.toUpperCase());
|
conf.setString(FlinkOptions.TABLE_TYPE, config.tableType.toUpperCase());
|
||||||
|
conf.setBoolean(FlinkOptions.INSERT_ALLOW_DUP, config.insertAllowDup);
|
||||||
conf.setString(FlinkOptions.OPERATION, config.operation.value());
|
conf.setString(FlinkOptions.OPERATION, config.operation.value());
|
||||||
conf.setString(FlinkOptions.PRECOMBINE_FIELD, config.sourceOrderingField);
|
conf.setString(FlinkOptions.PRECOMBINE_FIELD, config.sourceOrderingField);
|
||||||
conf.setString(FlinkOptions.PAYLOAD_CLASS, config.payloadClassName);
|
conf.setString(FlinkOptions.PAYLOAD_CLASS, config.payloadClassName);
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTab
|
|||||||
|
|
||||||
Configuration conf = (Configuration) helper.getOptions();
|
Configuration conf = (Configuration) helper.getOptions();
|
||||||
TableSchema schema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema());
|
TableSchema schema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema());
|
||||||
validateRequiredFields(conf, schema);
|
sanityCheck(conf, schema);
|
||||||
setupConfOptions(conf, context.getObjectIdentifier().getObjectName(), context.getCatalogTable(), schema);
|
setupConfOptions(conf, context.getObjectIdentifier().getObjectName(), context.getCatalogTable(), schema);
|
||||||
|
|
||||||
Path path = new Path(conf.getOptional(FlinkOptions.PATH).orElseThrow(() ->
|
Path path = new Path(conf.getOptional(FlinkOptions.PATH).orElseThrow(() ->
|
||||||
@@ -79,7 +79,7 @@ public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTab
|
|||||||
public DynamicTableSink createDynamicTableSink(Context context) {
|
public DynamicTableSink createDynamicTableSink(Context context) {
|
||||||
Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions());
|
Configuration conf = FlinkOptions.fromMap(context.getCatalogTable().getOptions());
|
||||||
TableSchema schema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema());
|
TableSchema schema = TableSchemaUtils.getPhysicalSchema(context.getCatalogTable().getSchema());
|
||||||
validateRequiredFields(conf, schema);
|
sanityCheck(conf, schema);
|
||||||
setupConfOptions(conf, context.getObjectIdentifier().getObjectName(), context.getCatalogTable(), schema);
|
setupConfOptions(conf, context.getObjectIdentifier().getObjectName(), context.getCatalogTable(), schema);
|
||||||
return new HoodieTableSink(conf, schema);
|
return new HoodieTableSink(conf, schema);
|
||||||
}
|
}
|
||||||
@@ -103,12 +103,13 @@ public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTab
|
|||||||
// Utilities
|
// Utilities
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|
||||||
/** Validate required options. For e.g, record key and pre_combine key.
|
/**
|
||||||
|
* The sanity check.
|
||||||
*
|
*
|
||||||
* @param conf The table options
|
* @param conf The table options
|
||||||
* @param schema The table schema
|
* @param schema The table schema
|
||||||
*/
|
*/
|
||||||
private void validateRequiredFields(Configuration conf, TableSchema schema) {
|
private void sanityCheck(Configuration conf, TableSchema schema) {
|
||||||
List<String> fields = Arrays.stream(schema.getFieldNames()).collect(Collectors.toList());
|
List<String> fields = Arrays.stream(schema.getFieldNames()).collect(Collectors.toList());
|
||||||
|
|
||||||
// validate record key in pk absence.
|
// validate record key in pk absence.
|
||||||
@@ -128,6 +129,11 @@ public class HoodieTableFactory implements DynamicTableSourceFactory, DynamicTab
|
|||||||
throw new ValidationException("Field " + preCombineField + " does not exist in the table schema."
|
throw new ValidationException("Field " + preCombineField + " does not exist in the table schema."
|
||||||
+ "Please check 'write.precombine.field' option.");
|
+ "Please check 'write.precombine.field' option.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (conf.getString(FlinkOptions.TABLE_TYPE).toUpperCase().equals(FlinkOptions.TABLE_TYPE_MERGE_ON_READ)
|
||||||
|
&& conf.getBoolean(FlinkOptions.INSERT_ALLOW_DUP)) {
|
||||||
|
throw new ValidationException("Option 'write.insert.allow_dup' is only allowed for COPY_ON_WRITE table.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ import org.apache.hudi.common.config.SerializableConfiguration;
|
|||||||
import org.apache.hudi.common.config.TypedProperties;
|
import org.apache.hudi.common.config.TypedProperties;
|
||||||
import org.apache.hudi.common.engine.EngineType;
|
import org.apache.hudi.common.engine.EngineType;
|
||||||
import org.apache.hudi.common.fs.FSUtils;
|
import org.apache.hudi.common.fs.FSUtils;
|
||||||
|
import org.apache.hudi.common.model.WriteOperationType;
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
@@ -145,6 +146,7 @@ public class StreamerUtil {
|
|||||||
.withEngineType(EngineType.FLINK)
|
.withEngineType(EngineType.FLINK)
|
||||||
.withPath(conf.getString(FlinkOptions.PATH))
|
.withPath(conf.getString(FlinkOptions.PATH))
|
||||||
.combineInput(conf.getBoolean(FlinkOptions.INSERT_DROP_DUPS), true)
|
.combineInput(conf.getBoolean(FlinkOptions.INSERT_DROP_DUPS), true)
|
||||||
|
.withMergeAllowDuplicateOnInserts(allowDuplicateInserts(conf))
|
||||||
.withCompactionConfig(
|
.withCompactionConfig(
|
||||||
HoodieCompactionConfig.newBuilder()
|
HoodieCompactionConfig.newBuilder()
|
||||||
.withPayloadClass(conf.getString(FlinkOptions.PAYLOAD_CLASS))
|
.withPayloadClass(conf.getString(FlinkOptions.PAYLOAD_CLASS))
|
||||||
@@ -345,4 +347,9 @@ public class StreamerUtil {
|
|||||||
throw new IOException("Could not load transformer class(es) " + classNames, e);
|
throw new IOException("Could not load transformer class(es) " + classNames, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static boolean allowDuplicateInserts(Configuration conf) {
|
||||||
|
WriteOperationType operationType = WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION));
|
||||||
|
return operationType == WriteOperationType.INSERT && conf.getBoolean(FlinkOptions.INSERT_ALLOW_DUP);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ import org.apache.hudi.client.WriteStatus;
|
|||||||
import org.apache.hudi.common.model.HoodieKey;
|
import org.apache.hudi.common.model.HoodieKey;
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
import org.apache.hudi.common.model.HoodieTableType;
|
import org.apache.hudi.common.model.HoodieTableType;
|
||||||
|
import org.apache.hudi.common.model.WriteOperationType;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||||
import org.apache.hudi.common.table.view.FileSystemViewStorageType;
|
import org.apache.hudi.common.table.view.FileSystemViewStorageType;
|
||||||
@@ -532,6 +533,81 @@ public class TestWriteCopyOnWrite {
|
|||||||
checkWrittenData(tempFile, expected, 1);
|
checkWrittenData(tempFile, expected, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testInsertAllowsDuplication() throws Exception {
|
||||||
|
// reset the config option
|
||||||
|
conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 0.0006); // 630 bytes batch size
|
||||||
|
conf.setString(FlinkOptions.OPERATION, WriteOperationType.INSERT.value());
|
||||||
|
funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf);
|
||||||
|
|
||||||
|
// open the function and ingest data
|
||||||
|
funcWrapper.openFunction();
|
||||||
|
// Each record is 208 bytes, so 4 records are expected to trigger a mini-batch write
|
||||||
|
for (RowData rowData : TestData.DATA_SET_INSERT_SAME_KEY) {
|
||||||
|
funcWrapper.invoke(rowData);
|
||||||
|
}
|
||||||
|
|
||||||
|
// this triggers the data write and event send
|
||||||
|
funcWrapper.checkpointFunction(1);
|
||||||
|
Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
|
||||||
|
assertThat("All data should be flushed out", dataBuffer.size(), is(0));
|
||||||
|
|
||||||
|
final OperatorEvent event1 = funcWrapper.getNextEvent(); // remove the first event first
|
||||||
|
final OperatorEvent event2 = funcWrapper.getNextEvent();
|
||||||
|
assertThat("The operator expect to send an event", event2, instanceOf(WriteMetadataEvent.class));
|
||||||
|
|
||||||
|
funcWrapper.getCoordinator().handleEventFromOperator(0, event1);
|
||||||
|
funcWrapper.getCoordinator().handleEventFromOperator(0, event2);
|
||||||
|
assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event");
|
||||||
|
|
||||||
|
String instant = funcWrapper.getWriteClient()
|
||||||
|
.getLastPendingInstant(getTableType());
|
||||||
|
|
||||||
|
funcWrapper.checkpointComplete(1);
|
||||||
|
|
||||||
|
Map<String, String> expected = new HashMap<>();
|
||||||
|
|
||||||
|
expected.put("par1", "["
|
||||||
|
+ "id1,par1,id1,Danny,23,0,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,1,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,2,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,3,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,4,par1]");
|
||||||
|
|
||||||
|
TestData.checkWrittenAllData(tempFile, expected, 1);
|
||||||
|
|
||||||
|
// started a new instant already
|
||||||
|
checkInflightInstant(funcWrapper.getWriteClient());
|
||||||
|
checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant);
|
||||||
|
|
||||||
|
// insert duplicates again
|
||||||
|
for (RowData rowData : TestData.DATA_SET_INSERT_SAME_KEY) {
|
||||||
|
funcWrapper.invoke(rowData);
|
||||||
|
}
|
||||||
|
|
||||||
|
funcWrapper.checkpointFunction(2);
|
||||||
|
|
||||||
|
final OperatorEvent event3 = funcWrapper.getNextEvent(); // remove the first event first
|
||||||
|
final OperatorEvent event4 = funcWrapper.getNextEvent();
|
||||||
|
funcWrapper.getCoordinator().handleEventFromOperator(0, event3);
|
||||||
|
funcWrapper.getCoordinator().handleEventFromOperator(0, event4);
|
||||||
|
funcWrapper.checkpointComplete(2);
|
||||||
|
|
||||||
|
// duplicates are kept: every record of the first batch is now expected twice
|
||||||
|
expected.put("par1", "["
|
||||||
|
+ "id1,par1,id1,Danny,23,0,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,0,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,1,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,1,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,2,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,2,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,3,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,3,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,4,par1, "
|
||||||
|
+ "id1,par1,id1,Danny,23,4,par1]");
|
||||||
|
TestData.checkWrittenAllData(tempFile, expected, 1);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testInsertWithSmallBufferSize() throws Exception {
|
public void testInsertWithSmallBufferSize() throws Exception {
|
||||||
// reset the config option
|
// reset the config option
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ import org.apache.avro.Schema;
|
|||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
@@ -67,6 +68,11 @@ public class TestWriteMergeOnRead extends TestWriteCopyOnWrite {
|
|||||||
conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, false);
|
conf.setBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testInsertAllowsDuplication() {
|
||||||
|
// ignore the test because only COW table supports INSERT duplication
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void checkWrittenData(File baseFile, Map<String, String> expected, int partitions) throws Exception {
|
protected void checkWrittenData(File baseFile, Map<String, String> expected, int partitions) throws Exception {
|
||||||
HoodieTableMetaClient metaClient = HoodieFlinkTable.create(writeConfig, context).getMetaClient();
|
HoodieTableMetaClient metaClient = HoodieFlinkTable.create(writeConfig, context).getMetaClient();
|
||||||
|
|||||||
@@ -22,7 +22,6 @@ import org.apache.hudi.common.model.HoodieTableType;
|
|||||||
import org.apache.hudi.configuration.FlinkOptions;
|
import org.apache.hudi.configuration.FlinkOptions;
|
||||||
|
|
||||||
import org.apache.flink.configuration.Configuration;
|
import org.apache.flink.configuration.Configuration;
|
||||||
import org.junit.jupiter.api.Disabled;
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@@ -39,10 +38,14 @@ public class TestWriteMergeOnReadWithCompact extends TestWriteCopyOnWrite {
|
|||||||
conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1);
|
conf.setInteger(FlinkOptions.COMPACTION_DELTA_COMMITS, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Disabled
|
|
||||||
@Test
|
@Test
|
||||||
public void testIndexStateBootstrap() {
|
public void testInsertAllowsDuplication() {
|
||||||
// Ignore the index bootstrap because we only support parquet load now.
|
// ignore the test because only COW table supports INSERT duplication
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Map<String, String> getExpectedBeforeCheckpointComplete() {
|
||||||
|
return EXPECTED1;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Map<String, String> getMiniBatchExpected() {
|
protected Map<String, String> getMiniBatchExpected() {
|
||||||
|
|||||||
@@ -340,6 +340,24 @@ public class TestHoodieTableFactory {
|
|||||||
assertThat(conf2.getInteger(FlinkOptions.ARCHIVE_MAX_COMMITS), is(45));
|
assertThat(conf2.getInteger(FlinkOptions.ARCHIVE_MAX_COMMITS), is(45));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testMorTableInsertAllowDuplication() {
|
||||||
|
TableSchema schema = TableSchema.builder()
|
||||||
|
.field("f0", DataTypes.INT().notNull())
|
||||||
|
.field("f1", DataTypes.VARCHAR(20))
|
||||||
|
.field("f2", DataTypes.TIMESTAMP(3))
|
||||||
|
.field("ts", DataTypes.TIMESTAMP(3))
|
||||||
|
.primaryKey("f0")
|
||||||
|
.build();
|
||||||
|
// overwrite the operation
|
||||||
|
this.conf.setString(FlinkOptions.OPERATION.key(), "insert");
|
||||||
|
this.conf.setString(FlinkOptions.TABLE_TYPE.key(), FlinkOptions.TABLE_TYPE_MERGE_ON_READ);
|
||||||
|
|
||||||
|
final MockContext sinkContext = MockContext.getInstance(this.conf, schema, "f2");
|
||||||
|
assertThrows(ValidationException.class, () -> new HoodieTableFactory().createDynamicTableSink(sinkContext),
|
||||||
|
"Option 'write.insert.allow_dup' is only allowed for COPY_ON_WRITE table.");
|
||||||
|
}
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
// Inner Class
|
// Inner Class
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -394,6 +394,48 @@ public class TestData {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks the source data set are written as expected.
|
||||||
|
* Different with {@link #checkWrittenData}, it reads all the data files.
|
||||||
|
*
|
||||||
|
* <p>Note: Replace it with the Flink reader when it is supported.
|
||||||
|
*
|
||||||
|
* @param baseFile The file base to check, should be a directory
|
||||||
|
* @param expected The expected results mapping, the key should be the partition path
|
||||||
|
* and value should be values list with the key partition
|
||||||
|
* @param partitions The expected partition number
|
||||||
|
*/
|
||||||
|
public static void checkWrittenAllData(
|
||||||
|
File baseFile,
|
||||||
|
Map<String, String> expected,
|
||||||
|
int partitions) throws IOException {
|
||||||
|
assert baseFile.isDirectory();
|
||||||
|
FileFilter filter = file -> !file.getName().startsWith(".");
|
||||||
|
File[] partitionDirs = baseFile.listFiles(filter);
|
||||||
|
|
||||||
|
assertNotNull(partitionDirs);
|
||||||
|
assertThat(partitionDirs.length, is(partitions));
|
||||||
|
|
||||||
|
for (File partitionDir : partitionDirs) {
|
||||||
|
File[] dataFiles = partitionDir.listFiles(filter);
|
||||||
|
assertNotNull(dataFiles);
|
||||||
|
|
||||||
|
List<String> readBuffer = new ArrayList<>();
|
||||||
|
for (File dataFile : dataFiles) {
|
||||||
|
ParquetReader<GenericRecord> reader = AvroParquetReader
|
||||||
|
.<GenericRecord>builder(new Path(dataFile.getAbsolutePath())).build();
|
||||||
|
GenericRecord nextRecord = reader.read();
|
||||||
|
while (nextRecord != null) {
|
||||||
|
readBuffer.add(filterOutVariables(nextRecord));
|
||||||
|
nextRecord = reader.read();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
readBuffer.sort(Comparator.naturalOrder());
|
||||||
|
assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName())));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks the source data are written as expected.
|
* Checks the source data are written as expected.
|
||||||
*
|
*
|
||||||
|
|||||||
Reference in New Issue
Block a user