1
0

[HUDI-1867] Streaming read for Flink COW table (#2895)

Supports streaming read for Copy On Write table.
This commit is contained in:
Danny Chan
2021-04-29 20:44:45 +08:00
committed by GitHub
parent 6e9c5dd765
commit 6848a683bd
4 changed files with 112 additions and 17 deletions

View File

@@ -19,6 +19,7 @@
package org.apache.hudi.source;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.HoodieTableMetaClient;
@@ -111,6 +112,8 @@ public class StreamReadMonitoringFunction
private final long maxCompactionMemoryInBytes;
private final boolean isDelta;
public StreamReadMonitoringFunction(
Configuration conf,
Path path,
@@ -121,6 +124,7 @@ public class StreamReadMonitoringFunction
this.metaClient = metaClient;
this.interval = conf.getInteger(FlinkOptions.READ_STREAMING_CHECK_INTERVAL);
this.maxCompactionMemoryInBytes = maxCompactionMemoryInBytes;
this.isDelta = conf.getString(FlinkOptions.TABLE_TYPE).equalsIgnoreCase(FlinkOptions.TABLE_TYPE_MERGE_ON_READ);
}
@Override
@@ -185,7 +189,10 @@ public class StreamReadMonitoringFunction
@VisibleForTesting
public void monitorDirAndForwardSplits(SourceContext<MergeOnReadInputSplit> context) {
metaClient.reloadActiveTimeline();
HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants();
HoodieTimeline commitTimeline = isDelta
// if is delta, exclude the parquet files from compaction
? metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants()
: metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
if (commitTimeline.empty()) {
LOG.warn("No splits found for the table under path " + path);
return;
@@ -238,8 +245,9 @@ public class StreamReadMonitoringFunction
.sorted(HoodieLogFile.getLogFileComparator())
.map(logFile -> logFile.getPath().toString())
.collect(Collectors.toList()));
String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null);
return new MergeOnReadInputSplit(cnt.getAndAdd(1),
null, logPaths, commitToIssue,
basePath, logPaths, commitToIssue,
metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, instantRange);
}).collect(Collectors.toList()))
.flatMap(Collection::stream)

View File

@@ -26,7 +26,6 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hadoop.HoodieROTablePathFilter;
@@ -156,11 +155,6 @@ public class HoodieTableSource implements
this.hadoopConf = StreamerUtil.getHadoopConf();
this.metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
this.maxCompactionMemoryInBytes = getMaxCompactionMemoryInBytes(new JobConf(this.hadoopConf));
if (conf.getBoolean(FlinkOptions.READ_AS_STREAMING)) {
ValidationUtils.checkArgument(
conf.getString(FlinkOptions.TABLE_TYPE).equalsIgnoreCase(FlinkOptions.TABLE_TYPE_MERGE_ON_READ),
"Streaming read is only supported for table type: " + FlinkOptions.TABLE_TYPE_MERGE_ON_READ);
}
}
@Override
@@ -377,6 +371,25 @@ public class HoodieTableSource implements
.emitDelete(isStreaming)
.build();
case COPY_ON_WRITE:
if (isStreaming) {
final MergeOnReadTableState hoodieTableState2 = new MergeOnReadTableState(
rowType,
requiredRowType,
tableAvroSchema.toString(),
AvroSchemaConverter.convertToSchema(requiredRowType).toString(),
Collections.emptyList(),
conf.getString(FlinkOptions.RECORD_KEY_FIELD).split(","));
return MergeOnReadInputFormat.builder()
.config(this.conf)
.paths(FilePathUtils.toFlinkPaths(paths))
.tableState(hoodieTableState2)
// use the explicit fields data type because the AvroSchemaConverter
// is not very stable.
.fieldTypes(rowDataType.getChildren())
.defaultPartName(conf.getString(FlinkOptions.PARTITION_DEFAULT_NAME))
.limit(this.limit)
.build();
}
FileInputFormat<RowData> format = new CopyOnWriteInputFormat(
FilePathUtils.toFlinkPaths(paths),
this.schema.getFieldNames(),

View File

@@ -20,6 +20,7 @@ package org.apache.hudi.table.format.mor;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.log.InstantRange;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.exception.HoodieException;
@@ -64,6 +65,7 @@ import java.util.stream.IntStream;
import static org.apache.flink.table.data.vector.VectorizedColumnBatch.DEFAULT_SIZE;
import static org.apache.flink.table.filesystem.RowPartitionComputer.restorePartValueFromType;
import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.HOODIE_COMMIT_TIME_COL_POS;
import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.HOODIE_RECORD_KEY_COL_POS;
import static org.apache.hudi.table.format.FormatUtils.buildAvroRecordBySchema;
@@ -162,9 +164,17 @@ public class MergeOnReadInputFormat
public void open(MergeOnReadInputSplit split) throws IOException {
this.currentReadCount = 0L;
this.hadoopConf = StreamerUtil.getHadoopConf();
if (!split.getLogPaths().isPresent()) {
// base file only
this.iterator = new BaseFileOnlyIterator(getRequiredSchemaReader(split.getBasePath().get()));
if (!(split.getLogPaths().isPresent() && split.getLogPaths().get().size() > 0)) {
if (conf.getBoolean(FlinkOptions.READ_AS_STREAMING)) {
// base file only with commit time filtering
this.iterator = new BaseFileOnlyFilteringIterator(
split.getInstantRange(),
this.tableState.getRequiredRowType(),
getReader(split.getBasePath().get(), getRequiredPosWithCommitTime(this.requiredPos)));
} else {
// base file only
this.iterator = new BaseFileOnlyIterator(getRequiredSchemaReader(split.getBasePath().get()));
}
} else if (!split.getBasePath().isPresent()) {
// log files only
this.iterator = new LogFileOnlyIterator(getLogFileIterator(split));
@@ -390,6 +400,57 @@ public class MergeOnReadInputFormat
}
}
/**
 * Similar with {@link BaseFileOnlyIterator} but with instant time filtering.
 *
 * <p>Used for streaming read of a base-file-only split: the reader is expected to
 * include the Hoodie commit-time column at position {@code HOODIE_COMMIT_TIME_COL_POS}
 * so each row can be filtered against the given {@link InstantRange}. The extra
 * commit-time column is then stripped off by a projection before the row is emitted.
 */
static class BaseFileOnlyFilteringIterator implements RecordIterator {
// base file reader
private final ParquetColumnarRowSplitReader reader;
// null when no instant filtering is required (all rows pass)
private final InstantRange instantRange;
// projection dropping the leading commit-time column from the read row
private final RowDataProjection projection;
// the row most recently produced by a successful reachedEnd() probe
private RowData currentRecord;
BaseFileOnlyFilteringIterator(
Option<InstantRange> instantRange,
RowType requiredRowType,
ParquetColumnarRowSplitReader reader) {
this.reader = reader;
this.instantRange = instantRange.orElse(null);
// shift all field positions by 1 to skip the commit-time column prepended by the reader
int[] positions = IntStream.range(1, 1 + requiredRowType.getFieldCount()).toArray();
projection = RowDataProjection.instance(requiredRowType, positions);
}
@Override
public boolean reachedEnd() throws IOException {
// advance until a row within the instant range is found, or the reader is exhausted
while (!this.reader.reachedEnd()) {
currentRecord = this.reader.nextRecord();
if (instantRange != null) {
boolean isInRange = instantRange.isInRange(currentRecord.getString(HOODIE_COMMIT_TIME_COL_POS).toString());
if (isInRange) {
return false;
}
} else {
// no range restriction: every record is accepted
return false;
}
}
return true;
}
@Override
public RowData nextRecord() {
// can promote: no need to project with null instant range
return projection.project(currentRecord);
}
@Override
public void close() throws IOException {
if (this.reader != null) {
this.reader.close();
}
}
}
static class LogFileOnlyIterator implements RecordIterator {
// iterator for log files
private final Iterator<RowData> iterator;
@@ -625,6 +686,17 @@ public class MergeOnReadInputFormat
}
}
// -------------------------------------------------------------------------
// Utilities
// -------------------------------------------------------------------------
/**
 * Prepends the Hoodie commit-time column position to the given required field
 * positions, so a reader built from the result also reads the commit time.
 *
 * @param requiredPos the positions of the fields required by the query
 * @return a new array of length {@code requiredPos.length + 1} whose first
 *         element is {@code HOODIE_COMMIT_TIME_COL_POS}, followed by
 *         {@code requiredPos} in original order
 */
private static int[] getRequiredPosWithCommitTime(int[] requiredPos) {
final int[] withCommitTime = new int[requiredPos.length + 1];
withCommitTime[0] = HOODIE_COMMIT_TIME_COL_POS;
for (int i = 0; i < requiredPos.length; i++) {
withCommitTime[i + 1] = requiredPos[i];
}
return withCommitTime;
}
@VisibleForTesting
public void isEmitDelete(boolean emitDelete) {
this.emitDelete = emitDelete;

View File

@@ -82,8 +82,9 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
@TempDir
File tempFile;
@Test
void testStreamWriteAndRead() throws Exception {
@ParameterizedTest
@EnumSource(value = HoodieTableType.class)
void testStreamWriteAndRead(HoodieTableType tableType) throws Exception {
// create filesystem table named source
String createSource = TestConfigurations.getFileSourceDDL("source");
streamTableEnv.executeSql(createSource);
@@ -91,7 +92,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
Map<String, String> options = new HashMap<>();
options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
options.put(FlinkOptions.READ_AS_STREAMING.key(), "true");
options.put(FlinkOptions.TABLE_TYPE.key(), FlinkOptions.TABLE_TYPE_MERGE_ON_READ);
options.put(FlinkOptions.TABLE_TYPE.key(), tableType.name());
String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
streamTableEnv.executeSql(hoodieTableDDL);
String insertInto = "insert into t1 select * from source";
@@ -106,8 +107,9 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
assertRowsEquals(rows2, TestData.DATA_SET_SOURCE_INSERT);
}
@Test
void testStreamReadAppendData() throws Exception {
@ParameterizedTest
@EnumSource(value = HoodieTableType.class)
void testStreamReadAppendData(HoodieTableType tableType) throws Exception {
// create filesystem table named source
String createSource = TestConfigurations.getFileSourceDDL("source");
String createSource2 = TestConfigurations.getFileSourceDDL("source2", "test_source_2.data");
@@ -117,7 +119,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
Map<String, String> options = new HashMap<>();
options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
options.put(FlinkOptions.READ_AS_STREAMING.key(), "true");
options.put(FlinkOptions.TABLE_TYPE.key(), FlinkOptions.TABLE_TYPE_MERGE_ON_READ);
options.put(FlinkOptions.TABLE_TYPE.key(), tableType.name());
String createHoodieTable = TestConfigurations.getCreateHoodieTableDDL("t1", options);
streamTableEnv.executeSql(createHoodieTable);
String insertInto = "insert into t1 select * from source";