[HUDI-1878] Add max memory option for flink writer task (#2920)
Also removes the rate limiter, because the max memory option provides similar OOM protection, and modifies the create and merge handles so that they clean up retry files automatically.
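For context, this is roughly how the new memory options are meant to be set on the writer pipeline; a minimal sketch that only uses the option constants shown in the FlinkOptions hunk below (the actual pipeline wiring is not part of this diff):

    import org.apache.flink.configuration.Configuration;
    import org.apache.hudi.configuration.FlinkOptions;

    public class WriterMemoryConfigSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Total memory budget in MB for one write task; when the buffered records
        // across all data buckets exceed it, the largest bucket is flushed (default 1GB).
        conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 512D);
        // Per-bucket batch size in MB; a single bucket flushes once it exceeds this (default 64MB).
        conf.setDouble(FlinkOptions.WRITE_BATCH_SIZE, 64D);
        // Note: FlinkOptions.WRITE_RATE_LIMIT no longer exists after this commit;
        // the memory-based flush replaces the rate limiter as OOM protection.
      }
    }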
@@ -132,13 +132,6 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
     return writerSchema;
   }
 
-  /**
-   * Returns the data file name.
-   */
-  protected String generatesDataFileName() {
-    return FSUtils.makeDataFileName(instantTime, writeToken, fileId, hoodieTable.getBaseFileExtension());
-  }
-
   /**
    * Extract old file path, initialize StorageWriter and WriteStatus.
    */
@@ -155,11 +148,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
           new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
       partitionMetadata.trySave(getPartitionId());
 
-      oldFilePath = new Path(config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
-      String newFileName = generatesDataFileName();
-      String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
-          + newFileName).toString();
-      newFilePath = new Path(config.getBasePath(), relativePath);
+      String newFileName = FSUtils.makeDataFileName(instantTime, writeToken, fileId, hoodieTable.getBaseFileExtension());
+      makeOldAndNewFilePaths(partitionPath, latestValidFilePath, newFileName);
 
       LOG.info(String.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
           newFilePath.toString()));
@@ -183,6 +173,11 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
     }
   }
 
+  protected void makeOldAndNewFilePaths(String partitionPath, String oldFileName, String newFileName) {
+    oldFilePath = makeNewFilePath(partitionPath, oldFileName);
+    newFilePath = makeNewFilePath(partitionPath, newFileName);
+  }
+
   /**
    * Initialize a spillable map for incoming records.
    */
@@ -119,6 +119,15 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload, I, K, O>
         hoodieTable.getMetaClient().getTableConfig().getBaseFileFormat().getFileExtension()));
   }
 
+  /**
+   * Make new file path with given file name.
+   */
+  protected Path makeNewFilePath(String partitionPath, String fileName) {
+    String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
+        + fileName).toString();
+    return new Path(config.getBasePath(), relativePath);
+  }
+
   /**
    * Creates an empty marker file corresponding to storage writer path.
    *
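To make the new helper concrete, a quick illustration with hypothetical values (the base path and file name here are invented for the example):

    // Assume config.getBasePath() returns "/tmp/hoodie_table".
    // Partitioned write:
    //   makeNewFilePath("par1", "f1_1-0-1_001.parquet")
    //     -> Path("/tmp/hoodie_table/par1/f1_1-0-1_001.parquet")
    // Non-partitioned write (empty partition path):
    //   makeNewFilePath("", "f1_1-0-1_001.parquet")
    //     -> Path("/tmp/hoodie_table/f1_1-0-1_001.parquet")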
@@ -20,6 +20,7 @@ package org.apache.hudi.io;
 
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.common.engine.TaskContextSupplier;
+import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.util.HoodieTimer;
 import org.apache.hudi.common.util.collection.Pair;
@@ -27,9 +28,9 @@ import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.HoodieInsertException;
 import org.apache.hudi.table.HoodieTable;
-import org.apache.hudi.table.MarkerFiles;
 
 import org.apache.avro.Schema;
+import org.apache.hadoop.fs.Path;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
@@ -66,25 +67,68 @@ public class FlinkCreateHandle<T extends HoodieRecordPayload, I, K, O>
                            TaskContextSupplier taskContextSupplier) {
     super(config, instantTime, hoodieTable, partitionPath, fileId, writerSchemaIncludingAndExcludingMetadataPair,
         taskContextSupplier);
+    // delete invalid data files generated by task retry.
+    if (getAttemptId() > 0) {
+      deleteInvalidDataFile(getAttemptId() - 1);
+    }
+  }
+
+  /**
+   * The flink checkpoints start in sequence and asynchronously, when one write task finish the checkpoint(A)
+   * (thus the fs view got the written data files some of which may be invalid),
+   * it goes on with the next round checkpoint(B) write immediately,
+   * if it tries to reuse the last small data bucket(small file) of an invalid data file,
+   * finally, when the coordinator receives the checkpoint success event of checkpoint(A),
+   * the invalid data file would be cleaned,
+   * and this merger got a FileNotFoundException when it close the write file handle.
+   *
+   * <p> To solve, deletes the invalid data file eagerly
+   * so that the invalid file small bucket would never be reused.
+   *
+   * @param lastAttemptId The last attempt ID
+   */
+  private void deleteInvalidDataFile(long lastAttemptId) {
+    final String lastWriteToken = FSUtils.makeWriteToken(getPartitionId(), getStageId(), lastAttemptId);
+    final String lastDataFileName = FSUtils.makeDataFileName(instantTime,
+        lastWriteToken, this.fileId, hoodieTable.getBaseFileExtension());
+    final Path path = makeNewFilePath(partitionPath, lastDataFileName);
+    try {
+      if (fs.exists(path)) {
+        LOG.info("Deleting invalid INSERT file due to task retry: " + lastDataFileName);
+        fs.delete(path, false);
+      }
+    } catch (IOException e) {
+      throw new HoodieException("Error while deleting the INSERT file due to task retry: " + lastDataFileName, e);
+    }
   }
 
   @Override
-  protected void createMarkerFile(String partitionPath, String dataFileName) {
-    MarkerFiles markerFiles = new MarkerFiles(hoodieTable, instantTime);
-    boolean created = markerFiles.createIfNotExists(partitionPath, dataFileName, getIOType());
-    if (!created) {
-      // If the marker file already exists, that means the write task
-      // was pulled up again with same data file name, removes the legacy
-      // data file first.
+  public Path makeNewPath(String partitionPath) {
+    Path path = super.makeNewPath(partitionPath);
+    // If the data file already exists, it means the write task write new data bucket multiple times
+    // in one hoodie commit, rolls over to a new name instead.
+
+    // Write to a new file which behaves like a different task write.
     try {
-      if (fs.exists(path)) {
-        fs.delete(path, false);
-        LOG.warn("Legacy data file: " + path + " delete success");
+      int rollNumber = 0;
+      while (fs.exists(path)) {
+        Path existing = path;
+        path = newFilePathWithRollover(rollNumber++);
+        LOG.warn("Duplicate write for INSERT bucket with path: " + existing + ", rolls over to new path: " + path);
       }
+      return path;
     } catch (IOException e) {
-      throw new HoodieException("Error while deleting legacy data file: " + path, e);
+      throw new HoodieException("Checking existing path for create handle error: " + path, e);
     }
   }
 
+  /**
+   * Use the writeToken + "-" + rollNumber as the new writeToken of a mini-batch write.
+   */
+  private Path newFilePathWithRollover(int rollNumber) {
+    final String dataFileName = FSUtils.makeDataFileName(instantTime, writeToken + "-" + rollNumber, fileId,
+        hoodieTable.getBaseFileExtension());
+    return makeNewFilePath(partitionPath, dataFileName);
+  }
+
   /**
@@ -111,7 +155,7 @@ public class FlinkCreateHandle<T extends HoodieRecordPayload, I, K, O>
   }
 
   @Override
-  protected long computeTotalWriteBytes() throws IOException {
+  protected long computeTotalWriteBytes() {
     long fileSizeInBytes = computeFileSizeInBytes();
     long incFileSizeInBytes = fileSizeInBytes - lastFileSize;
     this.lastFileSize = fileSizeInBytes;
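The retry cleanup in the constructor above can be read in isolation as follows; a standalone sketch, assuming Hudi's usual "partitionId-stageId-attemptId" write token and "fileId_writeToken_instantTime.ext" base file naming (the helper method itself is hypothetical, not part of this diff):

    // On attempt N (N > 0), rebuild the file name that the failed attempt N - 1
    // would have written, so the stale file can be located and deleted eagerly.
    static String lastAttemptDataFileName(String instantTime, String fileId, String extension,
                                          int partitionId, int stageId, long attemptId) {
      String lastWriteToken = partitionId + "-" + stageId + "-" + (attemptId - 1);
      return fileId + "_" + lastWriteToken + "_" + instantTime + extension;
    }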
@@ -25,6 +25,7 @@ import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.util.HoodieTimer;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.MarkerFiles;
@@ -78,18 +79,76 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
                          TaskContextSupplier taskContextSupplier) {
     super(config, instantTime, hoodieTable, recordItr, partitionPath, fileId, taskContextSupplier);
     if (rolloverPaths == null) {
-      // #createMarkerFile may already initialize it already
+      // #makeOldAndNewFilePaths may already initialize it already
       rolloverPaths = new ArrayList<>();
     }
+    // delete invalid data files generated by task retry.
+    if (getAttemptId() > 0) {
+      deleteInvalidDataFile(getAttemptId() - 1);
+    }
   }
 
   /**
-   * Use the fileId + "-" + rollNumber as the new fileId of a mini-batch write.
+   * The flink checkpoints start in sequence and asynchronously, when one write task finish the checkpoint(A)
+   * (thus the fs view got the written data files some of which may be invalid),
+   * it goes on with the next round checkpoint(B) write immediately,
+   * if it tries to reuse the last small data bucket(small file) of an invalid data file,
+   * finally, when the coordinator receives the checkpoint success event of checkpoint(A),
+   * the invalid data file would be cleaned,
+   * and this merger got a FileNotFoundException when it close the write file handle.
+   *
+   * <p> To solve, deletes the invalid data file eagerly
+   * so that the invalid file small bucket would never be reused.
+   *
+   * @param lastAttemptId The last attempt ID
    */
-  protected String generatesDataFileNameWithRollover() {
+  private void deleteInvalidDataFile(long lastAttemptId) {
+    final String lastWriteToken = FSUtils.makeWriteToken(getPartitionId(), getStageId(), lastAttemptId);
+    final String lastDataFileName = FSUtils.makeDataFileName(instantTime,
+        lastWriteToken, this.fileId, hoodieTable.getBaseFileExtension());
+    final Path path = makeNewFilePath(partitionPath, lastDataFileName);
+    try {
+      if (fs.exists(path)) {
+        LOG.info("Deleting invalid MERGE base file due to task retry: " + lastDataFileName);
+        fs.delete(path, false);
+      }
+    } catch (IOException e) {
+      throw new HoodieException("Error while deleting the MERGE base file due to task retry: " + lastDataFileName, e);
+    }
+  }
+
+  @Override
+  protected void makeOldAndNewFilePaths(String partitionPath, String oldFileName, String newFileName) {
+    // If the data file already exists, it means the write task write merge data bucket multiple times
+    // in one hoodie commit, rolls over to a new name instead.
+
+    // Use the existing file path as the base file path (file1),
+    // and generates new file path with roll over number (file2).
+    // the incremental data set would merge into the file2 instead of file1.
+    //
+    // When the task finalizes in #finishWrite, the intermediate files would be cleaned.
+    super.makeOldAndNewFilePaths(partitionPath, oldFileName, newFileName);
+    rolloverPaths = new ArrayList<>();
+    try {
+      while (fs.exists(newFilePath)) {
+        oldFilePath = newFilePath; // override the old file name
+        rolloverPaths.add(oldFilePath);
+        newFileName = newFileNameWithRollover(rollNumber++);
+        newFilePath = makeNewFilePath(partitionPath, newFileName);
+        LOG.warn("Duplicate write for MERGE bucket with path: " + oldFilePath + ", rolls over to new path: " + newFilePath);
+      }
+    } catch (IOException e) {
+      throw new HoodieException("Checking existing path for merge handle error: " + newFilePath, e);
    }
+  }
+
+  /**
+   * Use the writeToken + "-" + rollNumber as the new writeToken of a mini-batch write.
+   */
+  protected String newFileNameWithRollover(int rollNumber) {
     // make the intermediate file as hidden
-    return FSUtils.makeDataFileName("." + instantTime,
-        writeToken + "-" + rollNumber, this.fileId, hoodieTable.getBaseFileExtension());
+    return FSUtils.makeDataFileName(instantTime, writeToken + "-" + rollNumber,
+        this.fileId, hoodieTable.getBaseFileExtension());
   }
 
   public boolean shouldRollover() {
@@ -109,25 +168,6 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
     return false;
   }
 
-  @Override
-  protected void createMarkerFile(String partitionPath, String dataFileName) {
-    MarkerFiles markerFiles = new MarkerFiles(hoodieTable, instantTime);
-    boolean created = markerFiles.createIfNotExists(partitionPath, dataFileName, getIOType());
-    if (!created) {
-      // If the marker file already exists, that means the write task
-      // was pulled up again with same data file name, performs rolling over action here:
-      // use the new file path as the base file path (file1),
-      // and generates new file path with roll over number (file2).
-      // the incremental data set would merge into the file2 instead of file1.
-      //
-      // When the task do finalization in #finishWrite, the intermediate files would be cleaned.
-      oldFilePath = newFilePath;
-      rolloverPaths = new ArrayList<>();
-      rolloverPaths.add(oldFilePath);
-      newFilePath = makeNewFilePathWithRollover();
-    }
-  }
-
   /**
    *
    * Rolls over the write handle to prepare for the next batch write.
@@ -156,7 +196,13 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
 
     rolloverPaths.add(newFilePath);
     oldFilePath = newFilePath;
-    newFilePath = makeNewFilePathWithRollover();
+    final String newFileName = newFileNameWithRollover(rollNumber);
+    newFilePath = makeNewFilePath(partitionPath, newFileName);
+
+    // create the marker file so that the intermediate roll over files
+    // of the retry task can be cleaned.
+    MarkerFiles markerFiles = new MarkerFiles(hoodieTable, instantTime);
+    markerFiles.createIfNotExists(partitionPath, newFileName, getIOType());
 
     try {
       fileWriter = createNewFileWriter(instantTime, newFilePath, hoodieTable, config, writerSchemaWithMetafields, taskContextSupplier);
@@ -168,16 +214,6 @@ public class FlinkMergeHandle<T extends HoodieRecordPayload, I, K, O>
         newFilePath.toString()));
   }
 
-  /**
-   * Use the fileId + "-" + rollNumber as the new fileId of a mini-batch write.
-   */
-  private Path makeNewFilePathWithRollover() {
-    String newFileName = generatesDataFileNameWithRollover();
-    String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
-        + newFileName).toString();
-    return new Path(config.getBasePath(), relativePath);
-  }
-
   public void finishWrite() {
     // The file visibility should be kept by the configured ConsistencyGuard instance.
     rolloverPaths.add(newFilePath);
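For intuition, the rollover naming produced by the merge handle looks roughly like this; a hypothetical sequence, assuming the usual "fileId_writeToken_instantTime.parquet" base file layout:

    // f1_1-0-1_20210101000000.parquet      first mini-batch of the commit
    // f1_1-0-1-0_20210101000000.parquet    duplicate write detected, rollNumber 0
    // f1_1-0-1-1_20210101000000.parquet    next duplicate write, rollNumber 1
    // Each superseded path is recorded in rolloverPaths and cleaned up in #finishWrite.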
@@ -31,10 +31,12 @@ import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.exception.HoodieNotSupportedException;
 import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.io.HoodieCreateHandle;
@@ -318,6 +320,12 @@ public class HoodieFlinkCopyOnWriteTable<T extends HoodieRecordPayload> extends
     throw new HoodieNotSupportedException("Savepoint and restore is not supported yet");
   }
 
+  @Override
+  public void finalizeWrite(HoodieEngineContext context, String instantTs, List<HoodieWriteStat> stats) throws HoodieIOException {
+    // do nothing because flink create and merge handles can clean the
+    // retry files by themselves.
+  }
+
   // -------------------------------------------------------------------------
   // Used for compaction
   // -------------------------------------------------------------------------
@@ -267,18 +267,19 @@ public class FlinkOptions {
       .defaultValue(4)
       .withDescription("Parallelism of tasks that do actual write, default is 4");
 
+  public static final ConfigOption<Double> WRITE_TASK_MAX_SIZE = ConfigOptions
+      .key("write.task.max.size")
+      .doubleType()
+      .defaultValue(1024D) // 1GB
+      .withDescription("Maximum memory in MB for a write task, when the threshold hits,\n"
+          + "it flushes the max size data bucket to avoid OOM, default 1GB");
+
   public static final ConfigOption<Double> WRITE_BATCH_SIZE = ConfigOptions
       .key("write.batch.size")
       .doubleType()
       .defaultValue(64D) // 64MB
       .withDescription("Batch buffer size in MB to flush data into the underneath filesystem, default 64MB");
 
-  public static final ConfigOption<Long> WRITE_RATE_LIMIT = ConfigOptions
-      .key("write.rate.limit")
-      .longType()
-      .defaultValue(-1L) // default no limit
-      .withDescription("Write records rate limit per second to reduce risk of OOM, default -1 (no limit)");
-
   public static final ConfigOption<Integer> WRITE_LOG_BLOCK_SIZE = ConfigOptions
       .key("write.log_block.size")
       .intType()
@@ -33,6 +33,7 @@ import org.apache.hudi.table.action.commit.FlinkWriteHelper;
 import org.apache.hudi.util.StreamerUtil;
 
 import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.api.common.state.CheckpointListener;
 import org.apache.flink.configuration.Configuration;
 import org.apache.flink.runtime.operators.coordination.OperatorEventGateway;
 import org.apache.flink.runtime.state.FunctionInitializationContext;
@@ -52,6 +53,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Random;
 import java.util.function.BiFunction;
+import java.util.stream.Collectors;
 
 /**
  * Sink function to write the data to the underneath filesystem.
@@ -59,7 +61,8 @@ import java.util.function.BiFunction;
  * <p><h2>Work Flow</h2>
  *
  * <p>The function firstly buffers the data as a batch of {@link HoodieRecord}s,
- * It flushes(write) the records batch when a batch exceeds the configured size {@link FlinkOptions#WRITE_BATCH_SIZE}
+ * It flushes(write) the records batch when the batch size exceeds the configured size {@link FlinkOptions#WRITE_BATCH_SIZE}
+ * or the total buffer size exceeds the configured size {@link FlinkOptions#WRITE_TASK_MAX_SIZE}
 * or a Flink checkpoint starts. After a batch has been written successfully,
 * the function notifies its operator coordinator {@link StreamWriteOperatorCoordinator} to mark a successful write.
 *
@@ -91,7 +94,7 @@ import java.util.function.BiFunction;
  */
 public class StreamWriteFunction<K, I, O>
     extends KeyedProcessFunction<K, I, O>
-    implements CheckpointedFunction {
+    implements CheckpointedFunction, CheckpointListener {
 
   private static final long serialVersionUID = 1L;
 
@@ -134,6 +137,11 @@ public class StreamWriteFunction<K, I, O>
    */
   private transient String actionType;
 
+  /**
+   * Total size tracer.
+   */
+  private transient TotalSizeTracer tracer;
+
   /**
    * Constructs a StreamingSinkFunction.
    *
@@ -150,6 +158,7 @@ public class StreamWriteFunction<K, I, O>
     this.actionType = CommitUtils.getCommitActionType(
         WriteOperationType.fromValue(config.getString(FlinkOptions.OPERATION)),
         HoodieTableType.valueOf(config.getString(FlinkOptions.TABLE_TYPE)));
+    this.tracer = new TotalSizeTracer(this.config);
     initBuffer();
     initWriteFunction();
   }
@@ -168,7 +177,7 @@ public class StreamWriteFunction<K, I, O>
   }
 
   @Override
-  public void processElement(I value, KeyedProcessFunction<K, I, O>.Context ctx, Collector<O> out) throws Exception {
+  public void processElement(I value, KeyedProcessFunction<K, I, O>.Context ctx, Collector<O> out) {
     bufferRecord(value);
   }
 
@@ -180,6 +189,11 @@ public class StreamWriteFunction<K, I, O>
     }
   }
 
+  @Override
+  public void notifyCheckpointComplete(long checkpointId) {
+    this.writeClient.cleanHandles();
+  }
+
   /**
    * End input action for batch source.
    */
@@ -294,6 +308,44 @@ public class StreamWriteFunction<K, I, O>
     }
   }
 
+  /**
+   * Tool to trace the total buffer size. It computes the maximum buffer size,
+   * if current buffer size is greater than the maximum buffer size, the data bucket
+   * flush triggers.
+   */
+  private static class TotalSizeTracer {
+    private long bufferSize = 0L;
+    private final double maxBufferSize;
+
+    TotalSizeTracer(Configuration conf) {
+      long mergeReaderMem = 100; // constant 100MB
+      long mergeMapMaxMem = conf.getInteger(FlinkOptions.WRITE_MERGE_MAX_MEMORY);
+      this.maxBufferSize = (conf.getDouble(FlinkOptions.WRITE_TASK_MAX_SIZE) - mergeReaderMem - mergeMapMaxMem) * 1024 * 1024;
+      final String errMsg = String.format("'%s' should be at least greater than '%s' plus merge reader memory(constant 100MB now)",
+          FlinkOptions.WRITE_TASK_MAX_SIZE.key(), FlinkOptions.WRITE_MERGE_MAX_MEMORY.key());
+      ValidationUtils.checkState(this.maxBufferSize > 0, errMsg);
+    }
+
+    /**
+     * Trace the given record size {@code recordSize}.
+     *
+     * @param recordSize The record size
+     * @return true if the buffer size exceeds the maximum buffer size
+     */
+    boolean trace(long recordSize) {
+      this.bufferSize += recordSize;
+      return this.bufferSize > this.maxBufferSize;
+    }
+
+    void countDown(long size) {
+      this.bufferSize -= size;
+    }
+
+    public void reset() {
+      this.bufferSize = 0;
+    }
+  }
+
   /**
    * Returns the bucket ID with the given value {@code value}.
    */
@@ -309,6 +361,9 @@ public class StreamWriteFunction<K, I, O>
    * <p>Flush the data bucket first if the bucket records size is greater than
    * the configured value {@link FlinkOptions#WRITE_BATCH_SIZE}.
    *
+   * <p>Flush the max size data bucket if the total buffer size exceeds the configured
+   * threshold {@link FlinkOptions#WRITE_TASK_MAX_SIZE}.
+   *
    * @param value HoodieRecord
    */
   private void bufferRecord(I value) {
@@ -316,10 +371,21 @@ public class StreamWriteFunction<K, I, O>
 
     DataBucket bucket = this.buckets.computeIfAbsent(bucketID,
         k -> new DataBucket(this.config.getDouble(FlinkOptions.WRITE_BATCH_SIZE)));
-    boolean needFlush = bucket.detector.detect(value);
-    if (needFlush) {
+    boolean flushBucket = bucket.detector.detect(value);
+    boolean flushBuffer = this.tracer.trace(bucket.detector.lastRecordSize);
+    if (flushBucket) {
       flushBucket(bucket);
+      this.tracer.countDown(bucket.detector.totalSize);
       bucket.reset();
+    } else if (flushBuffer) {
+      // find the max size bucket and flush it out
+      List<DataBucket> sortedBuckets = this.buckets.values().stream()
+          .sorted((b1, b2) -> Long.compare(b2.detector.totalSize, b1.detector.totalSize))
+          .collect(Collectors.toList());
+      final DataBucket bucketToFlush = sortedBuckets.get(0);
+      flushBucket(bucketToFlush);
+      this.tracer.countDown(bucketToFlush.detector.totalSize);
+      bucketToFlush.reset();
     }
     bucket.records.add((HoodieRecord<?>) value);
   }
@@ -384,7 +450,7 @@ public class StreamWriteFunction<K, I, O>
         .build();
     this.eventGateway.sendEventToCoordinator(event);
     this.buckets.clear();
-    this.writeClient.cleanHandles();
+    this.tracer.reset();
     this.currentInstant = "";
   }
 }
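To see what TotalSizeTracer actually budgets, a worked example with assumed settings (1024MB is the write.task.max.size default from this diff; the 100MB merge map value is an assumed WRITE_MERGE_MAX_MEMORY setting):

    double writeTaskMaxSize = 1024D; // write.task.max.size in MB (default above)
    long mergeReaderMem = 100;       // constant merge reader reservation in MB
    long mergeMapMaxMem = 100;       // assumed merge map memory setting in MB
    // (1024 - 100 - 100) MB = 824 MB of record buffer, converted to bytes:
    double maxBufferSize = (writeTaskMaxSize - mergeReaderMem - mergeMapMaxMem) * 1024 * 1024;
    // Once the traced buffer size exceeds maxBufferSize, bufferRecord flushes the
    // bucket with the largest totalSize instead of waiting for WRITE_BATCH_SIZE.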
@@ -44,8 +44,6 @@ import javax.annotation.Nullable;
 import java.io.IOException;
 import java.io.Serializable;
 import java.lang.reflect.Constructor;
-import java.util.Random;
-import java.util.concurrent.TimeUnit;
 
 /**
  * Function that transforms RowData to HoodieRecord.
@@ -82,12 +80,6 @@ public class RowDataToHoodieFunction<I extends RowData, O extends HoodieRecord<?
   */
  private final Configuration config;
 
-  /**
-   * Rate limit per second for this task.
-   * The task sleep a little while when the consuming rate exceeds the threshold.
-   */
-  private transient RateLimiter rateLimiter;
-
   public RowDataToHoodieFunction(RowType rowType, Configuration config) {
     this.rowType = rowType;
     this.config = config;
@@ -100,31 +92,13 @@ public class RowDataToHoodieFunction<I extends RowData, O extends HoodieRecord<?
     this.converter = RowDataToAvroConverters.createConverter(this.rowType);
     this.keyGenerator = StreamerUtil.createKeyGenerator(FlinkOptions.flatOptions(this.config));
     this.payloadCreation = PayloadCreation.instance(config);
-    long totalLimit = this.config.getLong(FlinkOptions.WRITE_RATE_LIMIT);
-    if (totalLimit > 0) {
-      this.rateLimiter = new RateLimiter(totalLimit / getRuntimeContext().getNumberOfParallelSubtasks());
-    }
   }
 
   @SuppressWarnings("unchecked")
   @Override
   public O map(I i) throws Exception {
-    if (rateLimiter != null) {
-      final O hoodieRecord;
-      if (rateLimiter.sampling()) {
-        long startTime = System.currentTimeMillis();
-        hoodieRecord = (O) toHoodieRecord(i);
-        long endTime = System.currentTimeMillis();
-        rateLimiter.processTime(endTime - startTime);
-      } else {
-        hoodieRecord = (O) toHoodieRecord(i);
-      }
-      rateLimiter.sleepIfNeeded();
-      return hoodieRecord;
-    } else {
-      return (O) toHoodieRecord(i);
-    }
+    return (O) toHoodieRecord(i);
   }
 
   /**
   * Converts the give record to a {@link HoodieRecord}.
@@ -191,43 +165,4 @@ public class RowDataToHoodieFunction<I extends RowData, O extends HoodieRecord<?
       }
     }
   }
-
-  // -------------------------------------------------------------------------
-  // Inner Class
-  // -------------------------------------------------------------------------
-
-  /**
-   * Tool to decide whether the limit the processing rate.
-   * Sampling the record to compute the process time with 0.01 percentage.
-   */
-  private static class RateLimiter {
-    private final Random random = new Random(47);
-    private static final int DENOMINATOR = 100;
-
-    private final long maxProcessTime;
-
-    private long processTime = -1L;
-    private long timeToSleep = -1;
-
-    RateLimiter(long rate) {
-      ValidationUtils.checkArgument(rate > 0, "rate should be positive");
-      this.maxProcessTime = 1000 / rate;
-    }
-
-    void processTime(long processTime) {
-      this.processTime = processTime;
-      this.timeToSleep = maxProcessTime - processTime;
-    }
-
-    boolean sampling() {
-      // 0.01 sampling percentage
-      return processTime == -1 || random.nextInt(DENOMINATOR) == 1;
-    }
-
-    void sleepIfNeeded() throws Exception {
-      if (timeToSleep > 0) {
-        TimeUnit.MILLISECONDS.sleep(timeToSleep);
-      }
-    }
-  }
 }
@@ -436,6 +436,68 @@ public class TestWriteCopyOnWrite {
     checkWrittenData(tempFile, expected, 1);
   }
 
+  @Test
+  public void testInsertWithSmallBufferSize() throws Exception {
+    // reset the config option
+    conf.setDouble(FlinkOptions.WRITE_TASK_MAX_SIZE, 200.001); // 1Kb buffer size
+    funcWrapper = new StreamWriteFunctionWrapper<>(tempFile.getAbsolutePath(), conf);
+
+    // open the function and ingest data
+    funcWrapper.openFunction();
+    // each record is 424 bytes. so 3 records expect to trigger buffer flush:
+    // flush the max size bucket once at a time.
+    for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
+      funcWrapper.invoke(rowData);
+    }
+
+    Map<String, List<HoodieRecord>> dataBuffer = funcWrapper.getDataBuffer();
+    assertThat("Should have 1 data bucket", dataBuffer.size(), is(1));
+    assertThat("2 records expect to flush out as a mini-batch",
+        dataBuffer.values().stream().findFirst().map(List::size).orElse(-1),
+        is(3));
+
+    // this triggers the data write and event send
+    funcWrapper.checkpointFunction(1);
+    dataBuffer = funcWrapper.getDataBuffer();
+    assertThat("All data should be flushed out", dataBuffer.size(), is(0));
+
+    for (int i = 0; i < 2; i++) {
+      final OperatorEvent event = funcWrapper.getNextEvent(); // remove the first event first
+      assertThat("The operator expect to send an event", event, instanceOf(BatchWriteSuccessEvent.class));
+      funcWrapper.getCoordinator().handleEventFromOperator(0, event);
+    }
+    assertNotNull(funcWrapper.getEventBuffer()[0], "The coordinator missed the event");
+
+    String instant = funcWrapper.getWriteClient()
+        .getLastPendingInstant(getTableType());
+
+    funcWrapper.checkpointComplete(1);
+
+    Map<String, String> expected = getMiniBatchExpected();
+    checkWrittenData(tempFile, expected, 1);
+
+    // started a new instant already
+    checkInflightInstant(funcWrapper.getWriteClient());
+    checkInstantState(funcWrapper.getWriteClient(), HoodieInstant.State.COMPLETED, instant);
+
+    // insert duplicates again
+    for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
+      funcWrapper.invoke(rowData);
+    }
+
+    funcWrapper.checkpointFunction(2);
+
+    for (int i = 0; i < 2; i++) {
+      final OperatorEvent event = funcWrapper.getNextEvent(); // remove the first event first
+      funcWrapper.getCoordinator().handleEventFromOperator(0, event);
+    }
+
+    funcWrapper.checkpointComplete(2);
+
+    // Same the original base file content.
+    checkWrittenData(tempFile, expected, 1);
+  }
+
   Map<String, String> getMiniBatchExpected() {
     Map<String, String> expected = new HashMap<>();
     expected.put("par1", "[id1,par1,id1,Danny,23,1,par1, "
@@ -1,83 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hudi.sink.transform;
-
-import org.apache.hudi.configuration.FlinkOptions;
-import org.apache.hudi.sink.utils.MockStreamingRuntimeContext;
-import org.apache.hudi.utils.TestConfigurations;
-import org.apache.hudi.utils.TestData;
-
-import org.apache.flink.configuration.Configuration;
-import org.apache.flink.table.data.RowData;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
-
-import java.io.File;
-
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-/**
- * Test cases for {@link RowDataToHoodieFunction}.
- */
-public class TestRowDataToHoodieFunction {
-  @TempDir
-  File tempFile;
-
-  private Configuration conf;
-
-  @BeforeEach
-  public void before() {
-    final String basePath = tempFile.getAbsolutePath();
-    conf = TestConfigurations.getDefaultConf(basePath);
-  }
-
-  @Test
-  void testRateLimit() throws Exception {
-    // at most 100 record per second
-    RowDataToHoodieFunction<RowData, ?> func1 = getFunc(100);
-    long instant1 = System.currentTimeMillis();
-    for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
-      func1.map(rowData);
-    }
-    long instant2 = System.currentTimeMillis();
-    long processTime1 = instant2 - instant1;
-
-    // at most 1 record per second
-    RowDataToHoodieFunction<RowData, ?> func2 = getFunc(1);
-    long instant3 = System.currentTimeMillis();
-    for (RowData rowData : TestData.DATA_SET_INSERT_DUPLICATES) {
-      func2.map(rowData);
-    }
-    long instant4 = System.currentTimeMillis();
-    long processTime2 = instant4 - instant3;
-
-    assertTrue(processTime2 > processTime1, "lower rate should have longer process time");
-    assertTrue(processTime2 > 5000, "should process at least 5 seconds");
-  }
-
-  private RowDataToHoodieFunction<RowData, ?> getFunc(long rate) throws Exception {
-    conf.setLong(FlinkOptions.WRITE_RATE_LIMIT, rate);
-    RowDataToHoodieFunction<RowData, ?> func =
-        new RowDataToHoodieFunction<>(TestConfigurations.ROW_TYPE, conf);
-    func.setRuntimeContext(new MockStreamingRuntimeContext(false, 1, 1));
-    func.open(conf);
-    return func;
-  }
-}
@@ -163,6 +163,7 @@ public class StreamWriteFunctionWrapper<I> {
     functionInitializationContext.getOperatorStateStore().checkpointSuccess(checkpointId);
     coordinator.notifyCheckpointComplete(checkpointId);
     this.bucketAssignerFunction.notifyCheckpointComplete(checkpointId);
+    this.writeFunction.notifyCheckpointComplete(checkpointId);
     if (conf.getBoolean(FlinkOptions.COMPACTION_ASYNC_ENABLED)) {
       try {
         compactFunctionWrapper.compact(checkpointId);