[HUDI-2209] Bulk insert for flink writer (#3334)
This commit is contained in:
@@ -52,6 +52,30 @@
|
||||
<version>${flink.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-table-common</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-parquet_${scala.binary.version}</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>provided</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-hadoop</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-avro</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- Parquet -->
|
||||
<dependency>
|
||||
|
||||
@@ -0,0 +1,184 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.client.model;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
|
||||
import org.apache.flink.table.data.ArrayData;
|
||||
import org.apache.flink.table.data.DecimalData;
|
||||
import org.apache.flink.table.data.MapData;
|
||||
import org.apache.flink.table.data.RawValueData;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.data.StringData;
|
||||
import org.apache.flink.table.data.TimestampData;
|
||||
import org.apache.flink.types.RowKind;
|
||||
|
||||
/**
|
||||
* RowData implementation for Hoodie Row. It wraps an {@link RowData} and keeps meta columns locally. But the {@link RowData}
|
||||
* does include the meta columns as well just that {@link HoodieRowData} will intercept queries for meta columns and serve from its
|
||||
* copy rather than fetching from {@link RowData}.
|
||||
*/
|
||||
public class HoodieRowData implements RowData {
|
||||
|
||||
private final String commitTime;
|
||||
private final String commitSeqNumber;
|
||||
private final String recordKey;
|
||||
private final String partitionPath;
|
||||
private final String fileName;
|
||||
private final RowData row;
|
||||
private final int metaColumnsNum;
|
||||
|
||||
public HoodieRowData(String commitTime,
|
||||
String commitSeqNumber,
|
||||
String recordKey,
|
||||
String partitionPath,
|
||||
String fileName,
|
||||
RowData row) {
|
||||
this.commitTime = commitTime;
|
||||
this.commitSeqNumber = commitSeqNumber;
|
||||
this.recordKey = recordKey;
|
||||
this.partitionPath = partitionPath;
|
||||
this.fileName = fileName;
|
||||
this.row = row;
|
||||
this.metaColumnsNum = HoodieRecord.HOODIE_META_COLUMNS.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getArity() {
|
||||
return metaColumnsNum + row.getArity();
|
||||
}
|
||||
|
||||
@Override
|
||||
public RowKind getRowKind() {
|
||||
return row.getRowKind();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setRowKind(RowKind kind) {
|
||||
this.row.setRowKind(kind);
|
||||
}
|
||||
|
||||
private String getMetaColumnVal(int ordinal) {
|
||||
switch (ordinal) {
|
||||
case 0: {
|
||||
return commitTime;
|
||||
}
|
||||
case 1: {
|
||||
return commitSeqNumber;
|
||||
}
|
||||
case 2: {
|
||||
return recordKey;
|
||||
}
|
||||
case 3: {
|
||||
return partitionPath;
|
||||
}
|
||||
case 4: {
|
||||
return fileName;
|
||||
}
|
||||
default:
|
||||
throw new IllegalArgumentException("Not expected");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isNullAt(int ordinal) {
|
||||
if (ordinal < metaColumnsNum) {
|
||||
return null == getMetaColumnVal(ordinal);
|
||||
}
|
||||
return row.isNullAt(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean getBoolean(int ordinal) {
|
||||
return row.getBoolean(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte getByte(int ordinal) {
|
||||
return row.getByte(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public short getShort(int ordinal) {
|
||||
return row.getShort(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getInt(int ordinal) {
|
||||
return row.getInt(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getLong(int ordinal) {
|
||||
return row.getLong(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float getFloat(int ordinal) {
|
||||
return row.getFloat(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getDouble(int ordinal) {
|
||||
return row.getDouble(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DecimalData getDecimal(int ordinal, int precision, int scale) {
|
||||
return row.getDecimal(ordinal - metaColumnsNum, precision, scale);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TimestampData getTimestamp(int pos, int precision) {
|
||||
return row.getTimestamp(pos - metaColumnsNum, precision);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> RawValueData<T> getRawValue(int pos) {
|
||||
return row.getRawValue(pos - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public StringData getString(int ordinal) {
|
||||
if (ordinal < HoodieRecord.HOODIE_META_COLUMNS.size()) {
|
||||
return StringData.fromString(getMetaColumnVal(ordinal));
|
||||
}
|
||||
return row.getString(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getBinary(int ordinal) {
|
||||
return row.getBinary(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public RowData getRow(int ordinal, int numFields) {
|
||||
return row.getRow(ordinal - metaColumnsNum, numFields);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ArrayData getArray(int ordinal) {
|
||||
return row.getArray(ordinal - metaColumnsNum);
|
||||
}
|
||||
|
||||
@Override
|
||||
public MapData getMap(int ordinal) {
|
||||
return row.getMap(ordinal - metaColumnsNum);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,205 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hudi.client.HoodieInternalWriteStatus;
|
||||
import org.apache.hudi.client.model.HoodieRowData;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodiePartitionMetadata;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.model.IOType;
|
||||
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||
import org.apache.hudi.common.util.HoodieTimer;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.HoodieInsertException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
import org.apache.hudi.table.MarkerFiles;
|
||||
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
/**
 * Create handle with RowData for datasource implementation of bulk insert.
 *
 * <p>Each handle owns exactly one new base file ({@link #path}) and the corresponding
 * {@link HoodieRowDataFileWriter}; write stats are accumulated into a single
 * {@link HoodieInternalWriteStatus} returned from {@link #close()}.
 */
public class HoodieRowDataCreateHandle implements Serializable {

  private static final long serialVersionUID = 1L;
  private static final Logger LOG = LogManager.getLogger(HoodieRowDataCreateHandle.class);
  // JVM-wide sequence generator shared by all handles; feeds the per-record sequence id.
  private static final AtomicLong SEQGEN = new AtomicLong(1);

  private final String instantTime;
  private final int taskPartitionId;
  private final long taskId;
  private final long taskEpochId;
  private final HoodieTable table;
  private final HoodieWriteConfig writeConfig;
  protected final HoodieRowDataFileWriter fileWriter;
  private final String partitionPath;
  private final Path path;
  private final String fileId;
  private final FileSystem fs;
  protected final HoodieInternalWriteStatus writeStatus;
  // Started in the constructor; measures total handle lifetime, reported as create time on close.
  private final HoodieTimer currTimer;

  public HoodieRowDataCreateHandle(HoodieTable table, HoodieWriteConfig writeConfig, String partitionPath, String fileId,
                                   String instantTime, int taskPartitionId, long taskId, long taskEpochId,
                                   RowType rowType) {
    this.partitionPath = partitionPath;
    this.table = table;
    this.writeConfig = writeConfig;
    this.instantTime = instantTime;
    this.taskPartitionId = taskPartitionId;
    this.taskId = taskId;
    this.taskEpochId = taskEpochId;
    this.fileId = fileId;
    this.currTimer = new HoodieTimer();
    this.currTimer.startTimer();
    this.fs = table.getMetaClient().getFs();
    // Resolves (and creates if needed) the partition directory, then the new data file path.
    this.path = makeNewPath(partitionPath);
    this.writeStatus = new HoodieInternalWriteStatus(!table.getIndex().isImplicitWithStorage(),
        writeConfig.getWriteStatusFailureFraction());
    writeStatus.setPartitionPath(partitionPath);
    writeStatus.setFileId(fileId);
    try {
      // Persist partition metadata and the marker file BEFORE creating the data file,
      // so rollback can find partially written files.
      HoodiePartitionMetadata partitionMetadata =
          new HoodiePartitionMetadata(
              fs,
              instantTime,
              new Path(writeConfig.getBasePath()),
              FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath));
      partitionMetadata.trySave(taskPartitionId);
      // NOTE(review): uses table.getBaseFileExtension() here but
      // tableConfig.getBaseFileFormat().getFileExtension() in makeNewPath — confirm these always agree.
      createMarkerFile(partitionPath, FSUtils.makeDataFileName(this.instantTime, getWriteToken(), this.fileId, table.getBaseFileExtension()));
      this.fileWriter = createNewFileWriter(path, table, writeConfig, rowType);
    } catch (IOException e) {
      throw new HoodieInsertException("Failed to initialize file writer for path " + path, e);
    }
    LOG.info("New handle created for partition :" + partitionPath + " with fileId " + fileId);
  }

  /**
   * Writes an {@link RowData} to the underlying {@link HoodieRowDataFileWriter}.
   * Before writing, value for meta columns are computed as required
   * and wrapped in {@link HoodieRowData}. {@link HoodieRowData} is what gets written to HoodieRowDataFileWriter.
   *
   * <p>A failure while writing a single record is recorded in the write status and swallowed;
   * a failure while preparing the record (outer try) is recorded as a global error and rethrown.
   *
   * @param recordKey The record key
   * @param partitionPath The partition path
   * @param record instance of {@link RowData} that needs to be written to the fileWriter.
   * @throws IOException
   */
  public void write(String recordKey, String partitionPath, RowData record) throws IOException {
    try {
      String seqId = HoodieRecord.generateSequenceId(instantTime, taskPartitionId, SEQGEN.getAndIncrement());
      HoodieRowData rowData = new HoodieRowData(instantTime, seqId, recordKey, partitionPath, path.getName(),
          record);
      try {
        fileWriter.writeRow(recordKey, rowData);
        writeStatus.markSuccess(recordKey);
      } catch (Throwable t) {
        // Per-record failure: tracked in the status, does not abort the handle.
        writeStatus.markFailure(recordKey, t);
      }
    } catch (Throwable ge) {
      writeStatus.setGlobalError(ge);
      throw ge;
    }
  }

  /**
   * @returns {@code true} if this handle can take in more writes. else {@code false}.
   */
  public boolean canWrite() {
    return fileWriter.canWrite();
  }

  /**
   * Closes the {@link HoodieRowDataCreateHandle} and returns an instance of {@link HoodieInternalWriteStatus} containing the stats and
   * status of the writes to this handle.
   *
   * @return the {@link HoodieInternalWriteStatus} containing the stats and status of the writes to this handle.
   * @throws IOException
   */
  public HoodieInternalWriteStatus close() throws IOException {
    fileWriter.close();
    HoodieWriteStat stat = new HoodieWriteStat();
    stat.setPartitionPath(partitionPath);
    stat.setNumWrites(writeStatus.getTotalRecords());
    stat.setNumDeletes(0);
    // Create handle only inserts: every written record counts as an insert.
    stat.setNumInserts(writeStatus.getTotalRecords());
    stat.setPrevCommit(HoodieWriteStat.NULL_COMMIT);
    stat.setFileId(fileId);
    stat.setPath(new Path(writeConfig.getBasePath()), path);
    long fileSizeInBytes = FSUtils.getFileSize(table.getMetaClient().getFs(), path);
    stat.setTotalWriteBytes(fileSizeInBytes);
    stat.setFileSizeInBytes(fileSizeInBytes);
    stat.setTotalWriteErrors(writeStatus.getFailedRowsSize());
    HoodieWriteStat.RuntimeStats runtimeStats = new HoodieWriteStat.RuntimeStats();
    runtimeStats.setTotalCreateTime(currTimer.endTimer());
    stat.setRuntimeStats(runtimeStats);
    writeStatus.setStat(stat);
    return writeStatus;
  }

  public String getFileName() {
    return path.getName();
  }

  // Resolves the full path of the new data file, creating the partition directory if needed.
  private Path makeNewPath(String partitionPath) {
    Path path = FSUtils.getPartitionPath(writeConfig.getBasePath(), partitionPath);
    try {
      if (!fs.exists(path)) {
        fs.mkdirs(path); // create a new partition as needed.
      }
    } catch (IOException e) {
      throw new HoodieIOException("Failed to make dir " + path, e);
    }
    HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig();
    return new Path(path.toString(), FSUtils.makeDataFileName(instantTime, getWriteToken(), fileId,
        tableConfig.getBaseFileFormat().getFileExtension()));
  }

  /**
   * Creates an empty marker file corresponding to storage writer path.
   *
   * @param partitionPath Partition path
   */
  private void createMarkerFile(String partitionPath, String dataFileName) {
    MarkerFiles markerFiles = new MarkerFiles(table, instantTime);
    markerFiles.create(partitionPath, dataFileName, IOType.CREATE);
  }

  // Write token embedded in the data file name: identifies the producing task attempt.
  private String getWriteToken() {
    return taskPartitionId + "-" + taskId + "-" + taskEpochId;
  }

  // Factory hook; overridable by subclasses to supply a different writer implementation.
  protected HoodieRowDataFileWriter createNewFileWriter(
      Path path, HoodieTable hoodieTable, HoodieWriteConfig config, RowType rowType)
      throws IOException {
    return HoodieRowDataFileWriterFactory.getRowDataFileWriter(
        path, hoodieTable, config, rowType);
  }
}
|
||||
@@ -0,0 +1,53 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.flink.table.data.RowData;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * Abstraction to assist in writing {@link RowData}s to be used in datasource implementation.
 *
 * <p>Implementations are single-file writers: once {@link #close()} is called no further
 * writes are accepted.
 */
public interface HoodieRowDataFileWriter {

  /**
   * Returns {@code true} if this RowFileWriter can take in more writes. else {@code false}.
   */
  boolean canWrite();

  /**
   * Writes an {@link RowData} to the {@link HoodieRowDataFileWriter}. Also takes in associated record key to be added to bloom filter if required.
   *
   * @param key record key associated with the row, used for bloom filter / min-max key tracking.
   * @param row the row to write.
   * @throws IOException on any exception while writing.
   */
  void writeRow(String key, RowData row) throws IOException;

  /**
   * Writes an {@link RowData} to the {@link HoodieRowDataFileWriter}.
   *
   * <p>Unlike {@link #writeRow(String, RowData)}, no record key is tracked for this row.
   *
   * @throws IOException on any exception while writing.
   */
  void writeRow(RowData row) throws IOException;

  /**
   * Closes the {@link HoodieRowDataFileWriter} and may not take in any more writes.
   */
  void close() throws IOException;
}
|
||||
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
|
||||
|
||||
/**
|
||||
* Factory to assist in instantiating a new {@link HoodieRowDataFileWriter}.
|
||||
*/
|
||||
public class HoodieRowDataFileWriterFactory {
|
||||
|
||||
/**
|
||||
* Factory method to assist in instantiating an instance of {@link HoodieRowDataFileWriter}.
|
||||
*
|
||||
* @param path path of the RowFileWriter.
|
||||
* @param hoodieTable instance of {@link HoodieTable} in use.
|
||||
* @param config instance of {@link HoodieWriteConfig} to use.
|
||||
* @param schema schema of the dataset in use.
|
||||
* @return the instantiated {@link HoodieRowDataFileWriter}.
|
||||
* @throws IOException if format is not supported or if any exception during instantiating the RowFileWriter.
|
||||
*/
|
||||
public static HoodieRowDataFileWriter getRowDataFileWriter(
|
||||
Path path, HoodieTable hoodieTable, HoodieWriteConfig config, RowType schema)
|
||||
throws IOException {
|
||||
final String extension = FSUtils.getFileExtension(path.getName());
|
||||
if (PARQUET.getFileExtension().equals(extension)) {
|
||||
return newParquetInternalRowFileWriter(path, config, schema, hoodieTable);
|
||||
}
|
||||
throw new UnsupportedOperationException(extension + " format not supported yet.");
|
||||
}
|
||||
|
||||
private static HoodieRowDataFileWriter newParquetInternalRowFileWriter(
|
||||
Path path, HoodieWriteConfig writeConfig, RowType rowType, HoodieTable table)
|
||||
throws IOException {
|
||||
BloomFilter filter = BloomFilterFactory.createBloomFilter(
|
||||
writeConfig.getBloomFilterNumEntries(),
|
||||
writeConfig.getBloomFilterFPP(),
|
||||
writeConfig.getDynamicBloomFilterMaxNumEntries(),
|
||||
writeConfig.getBloomFilterType());
|
||||
HoodieRowDataParquetWriteSupport writeSupport =
|
||||
new HoodieRowDataParquetWriteSupport(table.getHadoopConf(), rowType, filter);
|
||||
return new HoodieRowDataParquetWriter(
|
||||
path, new HoodieRowDataParquetConfig(
|
||||
writeSupport,
|
||||
writeConfig.getParquetCompressionCodec(),
|
||||
writeConfig.getParquetBlockSize(),
|
||||
writeConfig.getParquetPageSize(),
|
||||
writeConfig.getParquetMaxFileSize(),
|
||||
writeSupport.getHadoopConf(),
|
||||
writeConfig.getParquetCompressionRatio()));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hudi.io.storage.HoodieBaseParquetConfig;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
|
||||
/**
 * ParquetConfig for datasource implementation with {@link org.apache.flink.table.data.RowData}.
 *
 * <p>Thin specialization of {@link HoodieBaseParquetConfig} fixing the write-support type
 * to {@link HoodieRowDataParquetWriteSupport}; all values are passed through unchanged.
 */
public class HoodieRowDataParquetConfig extends HoodieBaseParquetConfig<HoodieRowDataParquetWriteSupport> {

  public HoodieRowDataParquetConfig(HoodieRowDataParquetWriteSupport writeSupport, CompressionCodecName compressionCodecName,
                                    int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf,
                                    double compressionRatio) {
    super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio);
  }
}
|
||||
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter;
|
||||
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.parquet.hadoop.api.WriteSupport;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY;
|
||||
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_BLOOM_FILTER_TYPE_CODE;
|
||||
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER;
|
||||
import static org.apache.hudi.avro.HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER;
|
||||
|
||||
/**
 * Hoodie Write Support for directly writing {@link RowData} to Parquet.
 *
 * <p>Extends the plain {@link RowDataParquetWriteSupport} with Hoodie concerns: a bloom
 * filter over record keys plus min/max record key tracking, all emitted as parquet footer
 * metadata in {@link #finalizeWrite()}.
 */
public class HoodieRowDataParquetWriteSupport extends RowDataParquetWriteSupport {

  private final Configuration hadoopConf;
  private final BloomFilter bloomFilter;
  // Smallest/largest record key seen so far; null until the first key is added.
  private String minRecordKey;
  private String maxRecordKey;

  public HoodieRowDataParquetWriteSupport(Configuration conf, RowType rowType, BloomFilter bloomFilter) {
    super(rowType);
    // Defensive copy so later mutations of the caller's Configuration don't leak in.
    this.hadoopConf = new Configuration(conf);
    this.bloomFilter = bloomFilter;
  }

  public Configuration getHadoopConf() {
    return hadoopConf;
  }

  @Override
  public WriteSupport.FinalizedWriteContext finalizeWrite() {
    HashMap<String, String> extraMetaData = new HashMap<>();
    if (bloomFilter != null) {
      extraMetaData.put(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY, bloomFilter.serializeToString());
      // Min/max are only written when at least one key was added via add().
      if (minRecordKey != null && maxRecordKey != null) {
        extraMetaData.put(HOODIE_MIN_RECORD_KEY_FOOTER, minRecordKey);
        extraMetaData.put(HOODIE_MAX_RECORD_KEY_FOOTER, maxRecordKey);
      }
      // Dynamic bloom filters additionally record their type code so readers can deserialize.
      if (bloomFilter.getBloomFilterTypeCode().name().contains(HoodieDynamicBoundedBloomFilter.TYPE_CODE_PREFIX)) {
        extraMetaData.put(HOODIE_BLOOM_FILTER_TYPE_CODE, bloomFilter.getBloomFilterTypeCode().name());
      }
    }
    return new WriteSupport.FinalizedWriteContext(extraMetaData);
  }

  /**
   * Adds a record key to the bloom filter and updates the running min/max key.
   */
  public void add(String recordKey) {
    this.bloomFilter.add(recordKey);
    if (minRecordKey != null) {
      minRecordKey = minRecordKey.compareTo(recordKey) <= 0 ? minRecordKey : recordKey;
    } else {
      minRecordKey = recordKey;
    }

    if (maxRecordKey != null) {
      maxRecordKey = maxRecordKey.compareTo(recordKey) >= 0 ? maxRecordKey : recordKey;
    } else {
      maxRecordKey = recordKey;
    }
  }
}
|
||||
@@ -0,0 +1,77 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
|
||||
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.parquet.hadoop.ParquetFileWriter;
|
||||
import org.apache.parquet.hadoop.ParquetWriter;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * Parquet's impl of {@link HoodieRowDataFileWriter} to write {@link RowData}s.
 *
 * <p>Wraps a {@link ParquetWriter} over a {@link HoodieWrapperFileSystem} so that bytes
 * written can be tracked and compared against the configured max file size in
 * {@link #canWrite()}.
 */
public class HoodieRowDataParquetWriter extends ParquetWriter<RowData>
    implements HoodieRowDataFileWriter {

  private final Path file;
  private final HoodieWrapperFileSystem fs;
  // Size budget for this file, inflated by the expected compression ratio.
  private final long maxFileSize;
  private final HoodieRowDataParquetWriteSupport writeSupport;

  public HoodieRowDataParquetWriter(Path file, HoodieRowDataParquetConfig parquetConfig)
      throws IOException {
    // NOTE(review): getPageSize() is passed twice — the second occurrence fills the
    // dictionary-page-size positional argument of the ParquetWriter constructor; confirm intended.
    super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
        ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
        parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
        DEFAULT_IS_DICTIONARY_ENABLED, DEFAULT_IS_VALIDATING_ENABLED,
        DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
    this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
    this.fs = (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file,
        parquetConfig.getHadoopConf()));
    this.maxFileSize = parquetConfig.getMaxFileSize()
        + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
    this.writeSupport = parquetConfig.getWriteSupport();
  }

  @Override
  public boolean canWrite() {
    // Bytes written are tracked by the wrapper filesystem, not the local ParquetWriter.
    return fs.getBytesWritten(file) < maxFileSize;
  }

  @Override
  public void writeRow(String key, RowData row) throws IOException {
    super.write(row);
    // Feed the record key into the bloom filter / min-max key tracking.
    writeSupport.add(key);
  }

  @Override
  public void writeRow(RowData row) throws IOException {
    super.write(row);
  }

  @Override
  public void close() throws IOException {
    super.close();
  }
}
|
||||
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage.row;
|
||||
|
||||
import org.apache.flink.formats.parquet.row.ParquetRowDataWriter;
|
||||
import org.apache.flink.formats.parquet.utils.ParquetSchemaConverter;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.parquet.hadoop.api.WriteSupport;
|
||||
import org.apache.parquet.io.api.RecordConsumer;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Row data parquet write support.
|
||||
*/
|
||||
public class RowDataParquetWriteSupport extends WriteSupport<RowData> {
|
||||
|
||||
private final RowType rowType;
|
||||
private final MessageType schema;
|
||||
private ParquetRowDataWriter writer;
|
||||
|
||||
public RowDataParquetWriteSupport(RowType rowType) {
|
||||
super();
|
||||
this.rowType = rowType;
|
||||
this.schema = ParquetSchemaConverter.convertToParquetMessageType("flink_schema", rowType);
|
||||
}
|
||||
|
||||
@Override
|
||||
public WriteContext init(Configuration configuration) {
|
||||
return new WriteContext(schema, new HashMap<>());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void prepareForWrite(RecordConsumer recordConsumer) {
|
||||
// should make the utc timestamp configurable
|
||||
this.writer = new ParquetRowDataWriter(recordConsumer, rowType, schema, true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(RowData record) {
|
||||
try {
|
||||
this.writer.write(record);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -339,5 +339,11 @@
|
||||
<scope>test</scope>
|
||||
<type>test-jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-csv</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
|
||||
@@ -344,6 +344,22 @@ public class FlinkOptions extends HoodieConfig {
|
||||
.withDescription("Timeout limit for a writer task after it finishes a checkpoint and\n"
|
||||
+ "waits for the instant commit success, only for internal use");
|
||||
|
||||
public static final ConfigOption<Boolean> SINK_SHUFFLE_BY_PARTITION = ConfigOptions
|
||||
.key("sink.shuffle-by-partition.enable")
|
||||
.booleanType()
|
||||
.defaultValue(false)
|
||||
.withDescription(
|
||||
"The option to enable shuffle data by dynamic partition fields in sink"
|
||||
+ " phase, this can greatly reduce the number of file for filesystem sink but may"
|
||||
+ " lead data skew.");
|
||||
|
||||
// this is only for internal use
|
||||
public static final ConfigOption<Boolean> WRITE_BULK_INSERT_PARTITION_SORTED = ConfigOptions
|
||||
.key("write.bulk_insert.partition.sorted")
|
||||
.booleanType()
|
||||
.defaultValue(false)
|
||||
.withDescription("Whether the bulk insert write task input records are already sorted by the partition path");
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// Compaction Options
|
||||
// ------------------------------------------------------------------------
|
||||
@@ -581,7 +597,9 @@ public class FlinkOptions extends HoodieConfig {
|
||||
return options.keySet().stream().anyMatch(k -> k.startsWith(PROPERTIES_PREFIX));
|
||||
}
|
||||
|
||||
/** Creates a new configuration that is initialized with the options of the given map. */
|
||||
/**
|
||||
* Creates a new configuration that is initialized with the options of the given map.
|
||||
*/
|
||||
public static Configuration fromMap(Map<String, String> map) {
|
||||
final Configuration configuration = new Configuration();
|
||||
map.forEach(configuration::setString);
|
||||
|
||||
@@ -34,6 +34,7 @@ import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
import org.apache.hudi.sink.event.CommitAckEvent;
|
||||
import org.apache.hudi.sink.event.WriteMetadataEvent;
|
||||
import org.apache.hudi.sink.utils.TimeWait;
|
||||
import org.apache.hudi.table.action.commit.FlinkWriteHelper;
|
||||
import org.apache.hudi.util.StreamerUtil;
|
||||
|
||||
@@ -61,7 +62,6 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@@ -568,24 +568,17 @@ public class StreamWriteFunction<K, I, O>
|
||||
String instant = this.writeClient.getLastPendingInstant(this.actionType);
|
||||
// if exactly-once semantics turns on,
|
||||
// waits for the checkpoint notification until the checkpoint timeout threshold hits.
|
||||
long waitingTime = 0L;
|
||||
long ckpTimeout = config.getLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT);
|
||||
long interval = 500L;
|
||||
TimeWait timeWait = TimeWait.builder()
|
||||
.timeout(config.getLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT))
|
||||
.action("instant initialize")
|
||||
.build();
|
||||
while (confirming) {
|
||||
// wait condition:
|
||||
// 1. there is no inflight instant
|
||||
// 2. the inflight instant does not change and the checkpoint has buffering data
|
||||
if (instant == null || (instant.equals(this.currentInstant) && hasData)) {
|
||||
// sleep for a while
|
||||
try {
|
||||
if (waitingTime > ckpTimeout) {
|
||||
throw new HoodieException("Timeout(" + waitingTime + "ms) while waiting for instant " + instant + " to commit");
|
||||
}
|
||||
TimeUnit.MILLISECONDS.sleep(interval);
|
||||
waitingTime += interval;
|
||||
} catch (InterruptedException e) {
|
||||
throw new HoodieException("Error while waiting for instant " + instant + " to commit", e);
|
||||
}
|
||||
timeWait.waitFor();
|
||||
// refresh the inflight instant
|
||||
instant = this.writeClient.getLastPendingInstant(this.actionType);
|
||||
} else {
|
||||
|
||||
@@ -27,7 +27,6 @@ import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory;
|
||||
import org.apache.flink.streaming.api.operators.SimpleUdfStreamOperatorFactory;
|
||||
import org.apache.flink.streaming.api.operators.StreamOperator;
|
||||
import org.apache.flink.streaming.api.operators.StreamOperatorParameters;
|
||||
import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService;
|
||||
|
||||
/**
|
||||
* Factory class for {@link StreamWriteOperator}.
|
||||
@@ -63,9 +62,4 @@ public class StreamWriteOperatorFactory<I>
|
||||
public OperatorCoordinator.Provider getCoordinatorProvider(String s, OperatorID operatorID) {
|
||||
return new StreamWriteOperatorCoordinator.Provider(operatorID, this.conf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setProcessingTimeService(ProcessingTimeService processingTimeService) {
|
||||
super.setProcessingTimeService(processingTimeService);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,225 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sink.bulk;
|
||||
|
||||
import org.apache.hudi.client.HoodieFlinkWriteClient;
|
||||
import org.apache.hudi.client.HoodieInternalWriteStatus;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.common.util.CommitUtils;
|
||||
import org.apache.hudi.configuration.FlinkOptions;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.sink.StreamWriteOperatorCoordinator;
|
||||
import org.apache.hudi.sink.event.WriteMetadataEvent;
|
||||
import org.apache.hudi.sink.utils.TimeWait;
|
||||
import org.apache.hudi.util.StreamerUtil;
|
||||
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.runtime.operators.coordination.OperatorEventGateway;
|
||||
import org.apache.flink.streaming.api.functions.ProcessFunction;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
import org.apache.flink.util.Collector;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Sink function to write the data to the underneath filesystem.
|
||||
*
|
||||
* <p>The function should only be used in operation type {@link WriteOperationType#BULK_INSERT}.
|
||||
*
|
||||
* <p>Note: The function task requires the input stream be shuffled by partition path.
|
||||
*
|
||||
* @param <I> Type of the input record
|
||||
* @see StreamWriteOperatorCoordinator
|
||||
*/
|
||||
public class BulkInsertWriteFunction<I, O>
|
||||
extends ProcessFunction<I, O> {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(BulkInsertWriteFunction.class);
|
||||
|
||||
/**
|
||||
* Helper class for bulk insert mode.
|
||||
*/
|
||||
private transient BulkInsertWriterHelper writerHelper;
|
||||
|
||||
/**
|
||||
* Config options.
|
||||
*/
|
||||
private final Configuration config;
|
||||
|
||||
/**
|
||||
* Table row type.
|
||||
*/
|
||||
private final RowType rowType;
|
||||
|
||||
/**
|
||||
* Id of current subtask.
|
||||
*/
|
||||
private int taskID;
|
||||
|
||||
/**
|
||||
* Write Client.
|
||||
*/
|
||||
private transient HoodieFlinkWriteClient writeClient;
|
||||
|
||||
/**
|
||||
* The initial inflight instant when start up.
|
||||
*/
|
||||
private volatile String initInstant;
|
||||
|
||||
/**
|
||||
* Gateway to send operator events to the operator coordinator.
|
||||
*/
|
||||
private transient OperatorEventGateway eventGateway;
|
||||
|
||||
/**
|
||||
* Commit action type.
|
||||
*/
|
||||
private transient String actionType;
|
||||
|
||||
/**
|
||||
* Constructs a StreamingSinkFunction.
|
||||
*
|
||||
* @param config The config options
|
||||
*/
|
||||
public BulkInsertWriteFunction(Configuration config, RowType rowType) {
|
||||
this.config = config;
|
||||
this.rowType = rowType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void open(Configuration parameters) throws IOException {
|
||||
this.taskID = getRuntimeContext().getIndexOfThisSubtask();
|
||||
this.writeClient = StreamerUtil.createWriteClient(this.config, getRuntimeContext());
|
||||
this.actionType = CommitUtils.getCommitActionType(
|
||||
WriteOperationType.fromValue(config.getString(FlinkOptions.OPERATION)),
|
||||
HoodieTableType.valueOf(config.getString(FlinkOptions.TABLE_TYPE)));
|
||||
|
||||
this.initInstant = this.writeClient.getLastPendingInstant(this.actionType);
|
||||
sendBootstrapEvent();
|
||||
initWriterHelper();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processElement(I value, Context ctx, Collector<O> out) throws IOException {
|
||||
this.writerHelper.write((RowData) value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
if (this.writeClient != null) {
|
||||
this.writeClient.cleanHandlesGracefully();
|
||||
this.writeClient.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* End input action for batch source.
|
||||
*/
|
||||
public void endInput() {
|
||||
final List<WriteStatus> writeStatus;
|
||||
try {
|
||||
this.writerHelper.close();
|
||||
writeStatus = this.writerHelper.getWriteStatuses().stream()
|
||||
.map(BulkInsertWriteFunction::toWriteStatus).collect(Collectors.toList());
|
||||
} catch (IOException e) {
|
||||
throw new HoodieException("Error collect the write status for task [" + this.taskID + "]");
|
||||
}
|
||||
final WriteMetadataEvent event = WriteMetadataEvent.builder()
|
||||
.taskID(taskID)
|
||||
.instantTime(this.writerHelper.getInstantTime())
|
||||
.writeStatus(writeStatus)
|
||||
.lastBatch(true)
|
||||
.endInput(true)
|
||||
.build();
|
||||
this.eventGateway.sendEventToCoordinator(event);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tool to convert {@link HoodieInternalWriteStatus} into {@link WriteStatus}.
|
||||
*/
|
||||
private static WriteStatus toWriteStatus(HoodieInternalWriteStatus internalWriteStatus) {
|
||||
WriteStatus writeStatus = new WriteStatus(false, 0.1);
|
||||
writeStatus.setStat(internalWriteStatus.getStat());
|
||||
writeStatus.setFileId(internalWriteStatus.getFileId());
|
||||
writeStatus.setGlobalError(internalWriteStatus.getGlobalError());
|
||||
writeStatus.setTotalRecords(internalWriteStatus.getTotalRecords());
|
||||
writeStatus.setTotalErrorRecords(internalWriteStatus.getTotalErrorRecords());
|
||||
return writeStatus;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Getter/Setter
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
public void setOperatorEventGateway(OperatorEventGateway operatorEventGateway) {
|
||||
this.eventGateway = operatorEventGateway;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Utilities
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
private void initWriterHelper() {
|
||||
String instant = instantToWrite();
|
||||
this.writerHelper = new BulkInsertWriterHelper(this.config, this.writeClient.getHoodieTable(), this.writeClient.getConfig(),
|
||||
instant, this.taskID, getRuntimeContext().getNumberOfParallelSubtasks(), getRuntimeContext().getAttemptNumber(),
|
||||
this.rowType);
|
||||
}
|
||||
|
||||
private void sendBootstrapEvent() {
|
||||
WriteMetadataEvent event = WriteMetadataEvent.builder()
|
||||
.taskID(taskID)
|
||||
.writeStatus(Collections.emptyList())
|
||||
.instantTime("")
|
||||
.bootstrap(true)
|
||||
.build();
|
||||
this.eventGateway.sendEventToCoordinator(event);
|
||||
LOG.info("Send bootstrap write metadata event to coordinator, task[{}].", taskID);
|
||||
}
|
||||
|
||||
private String instantToWrite() {
|
||||
String instant = this.writeClient.getLastPendingInstant(this.actionType);
|
||||
// if exactly-once semantics turns on,
|
||||
// waits for the checkpoint notification until the checkpoint timeout threshold hits.
|
||||
TimeWait timeWait = TimeWait.builder()
|
||||
.timeout(config.getLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT))
|
||||
.action("instant initialize")
|
||||
.build();
|
||||
while (instant == null || instant.equals(this.initInstant)) {
|
||||
// wait condition:
|
||||
// 1. there is no inflight instant
|
||||
// 2. the inflight instant does not change
|
||||
// sleep for a while
|
||||
timeWait.waitFor();
|
||||
// refresh the inflight instant
|
||||
instant = this.writeClient.getLastPendingInstant(this.actionType);
|
||||
}
|
||||
return instant;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sink.bulk;
|
||||
|
||||
import org.apache.hudi.sink.StreamWriteOperatorCoordinator;
|
||||
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.runtime.jobgraph.OperatorID;
|
||||
import org.apache.flink.runtime.operators.coordination.OperatorCoordinator;
|
||||
import org.apache.flink.runtime.operators.coordination.OperatorEvent;
|
||||
import org.apache.flink.runtime.operators.coordination.OperatorEventDispatcher;
|
||||
import org.apache.flink.runtime.operators.coordination.OperatorEventGateway;
|
||||
import org.apache.flink.runtime.operators.coordination.OperatorEventHandler;
|
||||
import org.apache.flink.streaming.api.operators.BoundedOneInput;
|
||||
import org.apache.flink.streaming.api.operators.CoordinatedOperatorFactory;
|
||||
import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory;
|
||||
import org.apache.flink.streaming.api.operators.ProcessOperator;
|
||||
import org.apache.flink.streaming.api.operators.SimpleUdfStreamOperatorFactory;
|
||||
import org.apache.flink.streaming.api.operators.StreamOperator;
|
||||
import org.apache.flink.streaming.api.operators.StreamOperatorParameters;
|
||||
import org.apache.flink.streaming.runtime.tasks.ProcessingTimeService;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
|
||||
/**
|
||||
* Operator for bulk insert mode sink.
|
||||
*
|
||||
* @param <I> The input type
|
||||
*/
|
||||
public class BulkInsertWriteOperator<I>
|
||||
extends ProcessOperator<I, Object>
|
||||
implements OperatorEventHandler, BoundedOneInput {
|
||||
private final BulkInsertWriteFunction<I, Object> sinkFunction;
|
||||
|
||||
public BulkInsertWriteOperator(Configuration conf, RowType rowType) {
|
||||
super(new BulkInsertWriteFunction<>(conf, rowType));
|
||||
this.sinkFunction = (BulkInsertWriteFunction<I, Object>) getUserFunction();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handleOperatorEvent(OperatorEvent event) {
|
||||
// no operation
|
||||
}
|
||||
|
||||
void setOperatorEventGateway(OperatorEventGateway operatorEventGateway) {
|
||||
sinkFunction.setOperatorEventGateway(operatorEventGateway);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endInput() {
|
||||
sinkFunction.endInput();
|
||||
}
|
||||
|
||||
public static OperatorFactory<RowData> getFactory(Configuration conf, RowType rowType) {
|
||||
return new OperatorFactory<>(conf, rowType);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Inner Class
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
public static class OperatorFactory<I>
|
||||
extends SimpleUdfStreamOperatorFactory<Object>
|
||||
implements CoordinatedOperatorFactory<Object>, OneInputStreamOperatorFactory<I, Object> {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
private final BulkInsertWriteOperator<I> operator;
|
||||
private final Configuration conf;
|
||||
|
||||
public OperatorFactory(Configuration conf, RowType rowType) {
|
||||
super(new BulkInsertWriteOperator<>(conf, rowType));
|
||||
this.operator = (BulkInsertWriteOperator<I>) getOperator();
|
||||
this.conf = conf;
|
||||
}
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public <T extends StreamOperator<Object>> T createStreamOperator(StreamOperatorParameters<Object> parameters) {
|
||||
final OperatorID operatorID = parameters.getStreamConfig().getOperatorID();
|
||||
final OperatorEventDispatcher eventDispatcher = parameters.getOperatorEventDispatcher();
|
||||
|
||||
this.operator.setOperatorEventGateway(eventDispatcher.getOperatorEventGateway(operatorID));
|
||||
this.operator.setup(parameters.getContainingTask(), parameters.getStreamConfig(), parameters.getOutput());
|
||||
this.operator.setProcessingTimeService(this.processingTimeService);
|
||||
eventDispatcher.registerEventHandler(operatorID, operator);
|
||||
return (T) operator;
|
||||
}
|
||||
|
||||
@Override
|
||||
public OperatorCoordinator.Provider getCoordinatorProvider(String s, OperatorID operatorID) {
|
||||
return new StreamWriteOperatorCoordinator.Provider(operatorID, this.conf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setProcessingTimeService(ProcessingTimeService processingTimeService) {
|
||||
super.setProcessingTimeService(processingTimeService);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,169 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sink.bulk;
|
||||
|
||||
import org.apache.hudi.client.HoodieInternalWriteStatus;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.configuration.FlinkOptions;
|
||||
import org.apache.hudi.io.storage.row.HoodieRowDataCreateHandle;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.table.api.DataTypes;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.types.logical.LogicalType;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Helper class for bulk insert used by Flink.
|
||||
*/
|
||||
public class BulkInsertWriterHelper {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(BulkInsertWriterHelper.class);
|
||||
|
||||
private final String instantTime;
|
||||
private final int taskPartitionId;
|
||||
private final long taskId;
|
||||
private final long taskEpochId;
|
||||
private final HoodieTable hoodieTable;
|
||||
private final HoodieWriteConfig writeConfig;
|
||||
private final RowType rowType;
|
||||
private final Boolean arePartitionRecordsSorted;
|
||||
private final List<HoodieInternalWriteStatus> writeStatusList = new ArrayList<>();
|
||||
private HoodieRowDataCreateHandle handle;
|
||||
private String lastKnownPartitionPath = null;
|
||||
private final String fileIdPrefix;
|
||||
private int numFilesWritten = 0;
|
||||
private final Map<String, HoodieRowDataCreateHandle> handles = new HashMap<>();
|
||||
private final RowDataKeyGen keyGen;
|
||||
|
||||
public BulkInsertWriterHelper(Configuration conf, HoodieTable hoodieTable, HoodieWriteConfig writeConfig,
|
||||
String instantTime, int taskPartitionId, long taskId, long taskEpochId, RowType rowType) {
|
||||
this.hoodieTable = hoodieTable;
|
||||
this.writeConfig = writeConfig;
|
||||
this.instantTime = instantTime;
|
||||
this.taskPartitionId = taskPartitionId;
|
||||
this.taskId = taskId;
|
||||
this.taskEpochId = taskEpochId;
|
||||
this.rowType = addMetadataFields(rowType); // patch up with metadata fields
|
||||
this.arePartitionRecordsSorted = conf.getBoolean(FlinkOptions.WRITE_BULK_INSERT_PARTITION_SORTED);
|
||||
this.fileIdPrefix = UUID.randomUUID().toString();
|
||||
this.keyGen = RowDataKeyGen.instance(conf, rowType);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the write instant time.
|
||||
*/
|
||||
public String getInstantTime() {
|
||||
return this.instantTime;
|
||||
}
|
||||
|
||||
public void write(RowData record) throws IOException {
|
||||
try {
|
||||
String recordKey = keyGen.getRecordKey(record);
|
||||
String partitionPath = keyGen.getPartitionPath(record);
|
||||
|
||||
if ((lastKnownPartitionPath == null) || !lastKnownPartitionPath.equals(partitionPath) || !handle.canWrite()) {
|
||||
LOG.info("Creating new file for partition path " + partitionPath);
|
||||
handle = getRowCreateHandle(partitionPath);
|
||||
lastKnownPartitionPath = partitionPath;
|
||||
}
|
||||
handle.write(recordKey, partitionPath, record);
|
||||
} catch (Throwable t) {
|
||||
LOG.error("Global error thrown while trying to write records in HoodieRowCreateHandle ", t);
|
||||
throw t;
|
||||
}
|
||||
}
|
||||
|
||||
public List<HoodieInternalWriteStatus> getWriteStatuses() throws IOException {
|
||||
close();
|
||||
return writeStatusList;
|
||||
}
|
||||
|
||||
private HoodieRowDataCreateHandle getRowCreateHandle(String partitionPath) throws IOException {
|
||||
if (!handles.containsKey(partitionPath)) { // if there is no handle corresponding to the partition path
|
||||
// if records are sorted, we can close all existing handles
|
||||
if (arePartitionRecordsSorted) {
|
||||
close();
|
||||
}
|
||||
HoodieRowDataCreateHandle rowCreateHandle = new HoodieRowDataCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
|
||||
instantTime, taskPartitionId, taskId, taskEpochId, rowType);
|
||||
handles.put(partitionPath, rowCreateHandle);
|
||||
} else if (!handles.get(partitionPath).canWrite()) {
|
||||
// even if there is a handle to the partition path, it could have reached its max size threshold. So, we close the handle here and
|
||||
// create a new one.
|
||||
writeStatusList.add(handles.remove(partitionPath).close());
|
||||
HoodieRowDataCreateHandle rowCreateHandle = new HoodieRowDataCreateHandle(hoodieTable, writeConfig, partitionPath, getNextFileId(),
|
||||
instantTime, taskPartitionId, taskId, taskEpochId, rowType);
|
||||
handles.put(partitionPath, rowCreateHandle);
|
||||
}
|
||||
return handles.get(partitionPath);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
for (HoodieRowDataCreateHandle rowCreateHandle : handles.values()) {
|
||||
writeStatusList.add(rowCreateHandle.close());
|
||||
}
|
||||
handles.clear();
|
||||
handle = null;
|
||||
}
|
||||
|
||||
private String getNextFileId() {
|
||||
return String.format("%s-%d", fileIdPrefix, numFilesWritten++);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the Hoodie metadata fields to the given row type.
|
||||
*/
|
||||
private static RowType addMetadataFields(RowType rowType) {
|
||||
List<RowType.RowField> mergedFields = new ArrayList<>();
|
||||
|
||||
LogicalType metadataFieldType = DataTypes.STRING().getLogicalType();
|
||||
RowType.RowField commitTimeField =
|
||||
new RowType.RowField(HoodieRecord.COMMIT_TIME_METADATA_FIELD, metadataFieldType, "commit time");
|
||||
RowType.RowField commitSeqnoField =
|
||||
new RowType.RowField(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, metadataFieldType, "commit seqno");
|
||||
RowType.RowField recordKeyField =
|
||||
new RowType.RowField(HoodieRecord.RECORD_KEY_METADATA_FIELD, metadataFieldType, "record key");
|
||||
RowType.RowField partitionPathField =
|
||||
new RowType.RowField(HoodieRecord.PARTITION_PATH_METADATA_FIELD, metadataFieldType, "partition path");
|
||||
RowType.RowField fileNameField =
|
||||
new RowType.RowField(HoodieRecord.FILENAME_METADATA_FIELD, metadataFieldType, "field name");
|
||||
|
||||
mergedFields.add(commitTimeField);
|
||||
mergedFields.add(commitSeqnoField);
|
||||
mergedFields.add(recordKeyField);
|
||||
mergedFields.add(partitionPathField);
|
||||
mergedFields.add(fileNameField);
|
||||
mergedFields.addAll(rowType.getFields());
|
||||
|
||||
return new RowType(false, mergedFields);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,228 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sink.bulk;
|
||||
|
||||
import org.apache.hudi.common.util.PartitionPathEncodeUtils;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.configuration.FlinkOptions;
|
||||
import org.apache.hudi.exception.HoodieKeyException;
|
||||
import org.apache.hudi.util.RowDataProjection;
|
||||
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.types.logical.LogicalType;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Key generator for {@link RowData}.
|
||||
*/
|
||||
public class RowDataKeyGen {
|
||||
|
||||
// reference: NonpartitionedAvroKeyGenerator
|
||||
private static final String EMPTY_PARTITION = "";
|
||||
|
||||
// reference: org.apache.hudi.keygen.KeyGenUtils
|
||||
private static final String NULL_RECORDKEY_PLACEHOLDER = "__null__";
|
||||
private static final String EMPTY_RECORDKEY_PLACEHOLDER = "__empty__";
|
||||
|
||||
private static final String DEFAULT_PARTITION_PATH = "default";
|
||||
private static final String DEFAULT_PARTITION_PATH_SEPARATOR = "/";
|
||||
|
||||
private final String[] recordKeyFields;
|
||||
private final String[] partitionPathFields;
|
||||
|
||||
private final RowDataProjection recordKeyProjection;
|
||||
private final RowDataProjection partitionPathProjection;
|
||||
|
||||
private final boolean hiveStylePartitioning;
|
||||
private final boolean encodePartitionPath;
|
||||
|
||||
// efficient code path
|
||||
private boolean simpleRecordKey = false;
|
||||
private RowData.FieldGetter recordKeyFieldGetter;
|
||||
|
||||
private boolean simplePartitionPath = false;
|
||||
private RowData.FieldGetter partitionPathFieldGetter;
|
||||
|
||||
private boolean nonPartitioned;
|
||||
|
||||
private RowDataKeyGen(
|
||||
String recordKeys,
|
||||
String partitionFields,
|
||||
RowType rowType,
|
||||
boolean hiveStylePartitioning,
|
||||
boolean encodePartitionPath) {
|
||||
this.recordKeyFields = recordKeys.split(",");
|
||||
this.partitionPathFields = partitionFields.split(",");
|
||||
List<String> fieldNames = rowType.getFieldNames();
|
||||
List<LogicalType> fieldTypes = rowType.getChildren();
|
||||
|
||||
this.hiveStylePartitioning = hiveStylePartitioning;
|
||||
this.encodePartitionPath = encodePartitionPath;
|
||||
if (this.recordKeyFields.length == 1) {
|
||||
// efficient code path
|
||||
this.simpleRecordKey = true;
|
||||
int recordKeyIdx = fieldNames.indexOf(this.recordKeyFields[0]);
|
||||
this.recordKeyFieldGetter = RowData.createFieldGetter(fieldTypes.get(recordKeyIdx), recordKeyIdx);
|
||||
this.recordKeyProjection = null;
|
||||
} else {
|
||||
this.recordKeyProjection = getProjection(this.recordKeyFields, fieldNames, fieldTypes);
|
||||
}
|
||||
if (this.partitionPathFields.length == 1) {
|
||||
// efficient code path
|
||||
if (this.partitionPathFields[0].equals("")) {
|
||||
this.nonPartitioned = true;
|
||||
} else {
|
||||
this.simplePartitionPath = true;
|
||||
int partitionPathIdx = fieldNames.indexOf(this.partitionPathFields[0]);
|
||||
this.partitionPathFieldGetter = RowData.createFieldGetter(fieldTypes.get(partitionPathIdx), partitionPathIdx);
|
||||
}
|
||||
this.partitionPathProjection = null;
|
||||
} else {
|
||||
this.partitionPathProjection = getProjection(this.partitionPathFields, fieldNames, fieldTypes);
|
||||
}
|
||||
}
|
||||
|
||||
public static RowDataKeyGen instance(Configuration conf, RowType rowType) {
|
||||
return new RowDataKeyGen(conf.getString(FlinkOptions.RECORD_KEY_FIELD), conf.getString(FlinkOptions.PARTITION_PATH_FIELD),
|
||||
rowType, conf.getBoolean(FlinkOptions.HIVE_STYLE_PARTITIONING), conf.getBoolean(FlinkOptions.URL_ENCODE_PARTITIONING));
|
||||
}
|
||||
|
||||
public String getRecordKey(RowData rowData) {
|
||||
if (this.simpleRecordKey) {
|
||||
return getRecordKey(recordKeyFieldGetter.getFieldOrNull(rowData), this.recordKeyFields[0]);
|
||||
} else {
|
||||
Object[] keyValues = this.recordKeyProjection.projectAsValues(rowData);
|
||||
return getRecordKey(keyValues, this.recordKeyFields);
|
||||
}
|
||||
}
|
||||
|
||||
public String getPartitionPath(RowData rowData) {
|
||||
if (this.simplePartitionPath) {
|
||||
return getPartitionPath(partitionPathFieldGetter.getFieldOrNull(rowData),
|
||||
this.partitionPathFields[0], this.hiveStylePartitioning, this.encodePartitionPath);
|
||||
} else if (this.nonPartitioned) {
|
||||
return EMPTY_PARTITION;
|
||||
} else {
|
||||
Object[] partValues = this.partitionPathProjection.projectAsValues(rowData);
|
||||
return getRecordPartitionPath(partValues, this.partitionPathFields, this.hiveStylePartitioning, this.encodePartitionPath);
|
||||
}
|
||||
}
|
||||
|
||||
// reference: org.apache.hudi.keygen.KeyGenUtils.getRecordPartitionPath
|
||||
private static String getRecordKey(Object[] keyValues, String[] keyFields) {
|
||||
boolean keyIsNullEmpty = true;
|
||||
StringBuilder recordKey = new StringBuilder();
|
||||
for (int i = 0; i < keyValues.length; i++) {
|
||||
String recordKeyField = keyFields[i];
|
||||
String recordKeyValue = StringUtils.objToString(keyValues[i]);
|
||||
if (recordKeyValue == null) {
|
||||
recordKey.append(recordKeyField).append(":").append(NULL_RECORDKEY_PLACEHOLDER).append(",");
|
||||
} else if (recordKeyValue.isEmpty()) {
|
||||
recordKey.append(recordKeyField).append(":").append(EMPTY_RECORDKEY_PLACEHOLDER).append(",");
|
||||
} else {
|
||||
recordKey.append(recordKeyField).append(":").append(recordKeyValue).append(",");
|
||||
keyIsNullEmpty = false;
|
||||
}
|
||||
}
|
||||
recordKey.deleteCharAt(recordKey.length() - 1);
|
||||
if (keyIsNullEmpty) {
|
||||
throw new HoodieKeyException("recordKey values: \"" + recordKey + "\" for fields: "
|
||||
+ Arrays.toString(keyFields) + " cannot be entirely null or empty.");
|
||||
}
|
||||
return recordKey.toString();
|
||||
}
|
||||
|
||||
// reference: org.apache.hudi.keygen.KeyGenUtils.getRecordPartitionPath
|
||||
private static String getRecordPartitionPath(
|
||||
Object[] partValues,
|
||||
String[] partFields,
|
||||
boolean hiveStylePartitioning,
|
||||
boolean encodePartitionPath) {
|
||||
StringBuilder partitionPath = new StringBuilder();
|
||||
for (int i = 0; i < partFields.length; i++) {
|
||||
String partField = partFields[i];
|
||||
String partValue = StringUtils.objToString(partValues[i]);
|
||||
if (partValue == null || partValue.isEmpty()) {
|
||||
partitionPath.append(hiveStylePartitioning ? partField + "=" + DEFAULT_PARTITION_PATH
|
||||
: DEFAULT_PARTITION_PATH);
|
||||
} else {
|
||||
if (encodePartitionPath) {
|
||||
partValue = PartitionPathEncodeUtils.escapePathName(partValue);
|
||||
}
|
||||
partitionPath.append(hiveStylePartitioning ? partField + "=" + partValue : partValue);
|
||||
}
|
||||
partitionPath.append(DEFAULT_PARTITION_PATH_SEPARATOR);
|
||||
}
|
||||
partitionPath.deleteCharAt(partitionPath.length() - 1);
|
||||
return partitionPath.toString();
|
||||
}
|
||||
|
||||
// reference: org.apache.hudi.keygen.KeyGenUtils.getRecordKey
|
||||
public static String getRecordKey(Object recordKeyValue, String recordKeyField) {
|
||||
String recordKey = StringUtils.objToString(recordKeyValue);
|
||||
if (recordKey == null || recordKey.isEmpty()) {
|
||||
throw new HoodieKeyException("recordKey value: \"" + recordKey + "\" for field: \"" + recordKeyField + "\" cannot be null or empty.");
|
||||
}
|
||||
return recordKey;
|
||||
}
|
||||
|
||||
// reference: org.apache.hudi.keygen.KeyGenUtils.getPartitionPath
|
||||
public static String getPartitionPath(
|
||||
Object partValue,
|
||||
String partField,
|
||||
boolean hiveStylePartitioning,
|
||||
boolean encodePartitionPath) {
|
||||
String partitionPath = StringUtils.objToString(partValue);
|
||||
if (partitionPath == null || partitionPath.isEmpty()) {
|
||||
partitionPath = DEFAULT_PARTITION_PATH;
|
||||
}
|
||||
if (encodePartitionPath) {
|
||||
partitionPath = PartitionPathEncodeUtils.escapePathName(partitionPath);
|
||||
}
|
||||
if (hiveStylePartitioning) {
|
||||
partitionPath = partField + "=" + partitionPath;
|
||||
}
|
||||
return partitionPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the row data projection for the given field names and table schema.
|
||||
*
|
||||
* @param fields The projected field names
|
||||
* @param schemaFields The table schema names
|
||||
* @param schemaTypes The table schema types
|
||||
* @return the row data projection for the fields
|
||||
*/
|
||||
private static RowDataProjection getProjection(String[] fields, List<String> schemaFields, List<LogicalType> schemaTypes) {
|
||||
int[] positions = getFieldPositions(fields, schemaFields);
|
||||
LogicalType[] types = Arrays.stream(positions).mapToObj(schemaTypes::get).toArray(LogicalType[]::new);
|
||||
return RowDataProjection.instance(types, positions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the field positions of the given fields {@code fields} among all the fields {@code allFields}.
|
||||
*/
|
||||
private static int[] getFieldPositions(String[] fields, List<String> allFields) {
|
||||
return Arrays.stream(fields).mapToInt(allFields::indexOf).toArray();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,93 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sink.utils;
|
||||
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
/**
|
||||
* Tool used for time waiting.
|
||||
*/
|
||||
public class TimeWait {
|
||||
private final long timeout; // timeout in SECONDS
|
||||
private final long interval; // interval in MILLISECONDS
|
||||
private final String action; // action to report error message
|
||||
private long waitingTime = 0L;
|
||||
|
||||
private TimeWait(long timeout, long interval, String action) {
|
||||
this.timeout = timeout;
|
||||
this.interval = interval;
|
||||
this.action = action;
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for an interval time.
|
||||
*/
|
||||
public void waitFor() {
|
||||
try {
|
||||
if (waitingTime > timeout) {
|
||||
throw new HoodieException("Timeout(" + waitingTime + "ms) while waiting for " + action);
|
||||
}
|
||||
TimeUnit.MILLISECONDS.sleep(interval);
|
||||
waitingTime += interval;
|
||||
} catch (InterruptedException e) {
|
||||
throw new HoodieException("Error while waiting for " + action, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builder.
|
||||
*/
|
||||
public static class Builder {
|
||||
private long timeout;
|
||||
private long interval;
|
||||
private String action;
|
||||
|
||||
public Builder() {
|
||||
this.timeout = 3600;
|
||||
this.interval = 500;
|
||||
}
|
||||
|
||||
public Builder timeout(long timeout) {
|
||||
this.timeout = timeout;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder interval(long interval) {
|
||||
this.interval = interval;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder action(String action) {
|
||||
this.action = action;
|
||||
return this;
|
||||
}
|
||||
|
||||
public TimeWait build() {
|
||||
Objects.requireNonNull(this.action);
|
||||
return new TimeWait(this.timeout, this.interval, this.action);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -24,6 +24,7 @@ import org.apache.hudi.configuration.FlinkOptions;
|
||||
import org.apache.hudi.sink.CleanFunction;
|
||||
import org.apache.hudi.sink.StreamWriteOperatorFactory;
|
||||
import org.apache.hudi.sink.bootstrap.BootstrapFunction;
|
||||
import org.apache.hudi.sink.bulk.BulkInsertWriteOperator;
|
||||
import org.apache.hudi.sink.compact.CompactFunction;
|
||||
import org.apache.hudi.sink.compact.CompactionCommitEvent;
|
||||
import org.apache.hudi.sink.compact.CompactionCommitSink;
|
||||
@@ -45,6 +46,7 @@ import org.apache.flink.table.connector.sink.DataStreamSinkProvider;
|
||||
import org.apache.flink.table.connector.sink.DynamicTableSink;
|
||||
import org.apache.flink.table.connector.sink.abilities.SupportsOverwrite;
|
||||
import org.apache.flink.table.connector.sink.abilities.SupportsPartitioning;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
import org.apache.flink.types.RowKind;
|
||||
|
||||
@@ -58,29 +60,57 @@ public class HoodieTableSink implements DynamicTableSink, SupportsPartitioning,
|
||||
private final Configuration conf;
|
||||
private final TableSchema schema;
|
||||
private boolean overwrite = false;
|
||||
private boolean supportsGrouping = false;
|
||||
|
||||
public HoodieTableSink(Configuration conf, TableSchema schema) {
|
||||
this.conf = conf;
|
||||
this.schema = schema;
|
||||
}
|
||||
|
||||
public HoodieTableSink(Configuration conf, TableSchema schema, boolean overwrite, boolean supportsGrouping) {
|
||||
this.conf = conf;
|
||||
this.schema = schema;
|
||||
this.overwrite = overwrite;
|
||||
this.supportsGrouping = supportsGrouping;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SinkRuntimeProvider getSinkRuntimeProvider(Context context) {
|
||||
return (DataStreamSinkProvider) dataStream -> {
|
||||
// Read from kafka source
|
||||
RowType rowType = (RowType) schema.toRowDataType().notNull().getLogicalType();
|
||||
|
||||
// setup configuration
|
||||
long ckpTimeout = dataStream.getExecutionEnvironment()
|
||||
.getCheckpointConfig().getCheckpointTimeout();
|
||||
int parallelism = dataStream.getExecutionConfig().getParallelism();
|
||||
conf.setLong(FlinkOptions.WRITE_COMMIT_ACK_TIMEOUT, ckpTimeout);
|
||||
|
||||
RowType rowType = (RowType) schema.toRowDataType().notNull().getLogicalType();
|
||||
|
||||
// bulk_insert mode
|
||||
final String writeOperation = this.conf.get(FlinkOptions.OPERATION);
|
||||
if (WriteOperationType.fromValue(writeOperation) == WriteOperationType.BULK_INSERT) {
|
||||
this.conf.set(FlinkOptions.WRITE_BULK_INSERT_PARTITION_SORTED, this.supportsGrouping);
|
||||
BulkInsertWriteOperator.OperatorFactory<RowData> operatorFactory = BulkInsertWriteOperator.getFactory(this.conf, rowType);
|
||||
return dataStream.transform("hoodie_bulk_insert_write",
|
||||
TypeInformation.of(Object.class),
|
||||
operatorFactory)
|
||||
// follow the parallelism of upstream operators to avoid shuffle
|
||||
.setParallelism(dataStream.getParallelism())
|
||||
.addSink(new CleanFunction<>(conf))
|
||||
.setParallelism(1)
|
||||
.name("clean_commits");
|
||||
}
|
||||
|
||||
// stream write
|
||||
int parallelism = dataStream.getExecutionConfig().getParallelism();
|
||||
StreamWriteOperatorFactory<HoodieRecord> operatorFactory = new StreamWriteOperatorFactory<>(conf);
|
||||
|
||||
DataStream<HoodieRecord> hoodieDataStream = dataStream
|
||||
DataStream<HoodieRecord> dataStream1 = dataStream
|
||||
.map(new RowDataToHoodieFunction<>(rowType, conf), TypeInformation.of(HoodieRecord.class));
|
||||
|
||||
// bootstrap index
|
||||
// TODO: This is a very time-consuming operation, will optimization
|
||||
if (conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED)) {
|
||||
hoodieDataStream = hoodieDataStream.rebalance()
|
||||
dataStream1 = dataStream1.rebalance()
|
||||
.transform(
|
||||
"index_bootstrap",
|
||||
TypeInformation.of(HoodieRecord.class),
|
||||
@@ -89,7 +119,7 @@ public class HoodieTableSink implements DynamicTableSink, SupportsPartitioning,
|
||||
.uid("uid_index_bootstrap_" + conf.getString(FlinkOptions.TABLE_NAME));
|
||||
}
|
||||
|
||||
DataStream<Object> pipeline = hoodieDataStream
|
||||
DataStream<Object> pipeline = dataStream1
|
||||
// Key-by record key, to avoid multiple subtasks write to a bucket at the same time
|
||||
.keyBy(HoodieRecord::getRecordKey)
|
||||
.transform(
|
||||
@@ -103,6 +133,7 @@ public class HoodieTableSink implements DynamicTableSink, SupportsPartitioning,
|
||||
.transform("hoodie_stream_write", TypeInformation.of(Object.class), operatorFactory)
|
||||
.uid("uid_hoodie_stream_write" + conf.getString(FlinkOptions.TABLE_NAME))
|
||||
.setParallelism(conf.getInteger(FlinkOptions.WRITE_TASKS));
|
||||
// compaction
|
||||
if (StreamerUtil.needsAsyncCompaction(conf)) {
|
||||
return pipeline.transform("compact_plan_generate",
|
||||
TypeInformation.of(CompactionPlanEvent.class),
|
||||
@@ -141,7 +172,7 @@ public class HoodieTableSink implements DynamicTableSink, SupportsPartitioning,
|
||||
|
||||
@Override
|
||||
public DynamicTableSink copy() {
|
||||
return new HoodieTableSink(this.conf, this.schema);
|
||||
return new HoodieTableSink(this.conf, this.schema, this.overwrite, this.supportsGrouping);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -167,4 +198,10 @@ public class HoodieTableSink implements DynamicTableSink, SupportsPartitioning,
|
||||
public void applyOverwrite(boolean b) {
|
||||
this.overwrite = b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean requiresPartitionGrouping(boolean supportsGrouping) {
|
||||
this.supportsGrouping = supportsGrouping;
|
||||
return supportsGrouping;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,6 +47,10 @@ public class RowDataProjection {
|
||||
return new RowDataProjection(types, positions);
|
||||
}
|
||||
|
||||
public static RowDataProjection instance(LogicalType[] types, int[] positions) {
|
||||
return new RowDataProjection(types, positions);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the projected row data.
|
||||
*/
|
||||
@@ -58,4 +62,16 @@ public class RowDataProjection {
|
||||
}
|
||||
return genericRowData;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the projected values array.
|
||||
*/
|
||||
public Object[] projectAsValues(RowData rowData) {
|
||||
Object[] values = new Object[this.fieldGetters.length];
|
||||
for (int i = 0; i < this.fieldGetters.length; i++) {
|
||||
final Object val = this.fieldGetters[i].getFieldOrNull(rowData);
|
||||
values[i] = val;
|
||||
}
|
||||
return values;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.sink.bulk;
|
||||
|
||||
import org.apache.hudi.configuration.FlinkOptions;
|
||||
import org.apache.hudi.exception.HoodieKeyException;
|
||||
import org.apache.hudi.utils.TestConfigurations;
|
||||
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.data.StringData;
|
||||
import org.apache.flink.table.data.TimestampData;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.apache.hudi.utils.TestData.insertRow;
|
||||
import static org.hamcrest.CoreMatchers.is;
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
/**
|
||||
* Test cases for {@link RowDataKeyGen}.
|
||||
*/
|
||||
public class TestRowDataKeyGen {
|
||||
@Test
|
||||
void testSimpleKeyAndPartition() {
|
||||
Configuration conf = TestConfigurations.getDefaultConf("path1");
|
||||
final RowData rowData1 = insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23,
|
||||
TimestampData.fromEpochMillis(1), StringData.fromString("par1"));
|
||||
final RowDataKeyGen keyGen1 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE);
|
||||
assertThat(keyGen1.getRecordKey(rowData1), is("id1"));
|
||||
assertThat(keyGen1.getPartitionPath(rowData1), is("par1"));
|
||||
|
||||
// null record key and partition path
|
||||
final RowData rowData2 = insertRow(null, StringData.fromString("Danny"), 23,
|
||||
TimestampData.fromEpochMillis(1), null);
|
||||
assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData2));
|
||||
assertThat(keyGen1.getPartitionPath(rowData2), is("default"));
|
||||
// empty record key and partition path
|
||||
final RowData rowData3 = insertRow(StringData.fromString(""), StringData.fromString("Danny"), 23,
|
||||
TimestampData.fromEpochMillis(1), StringData.fromString(""));
|
||||
assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData3));
|
||||
assertThat(keyGen1.getPartitionPath(rowData3), is("default"));
|
||||
|
||||
// hive style partitioning
|
||||
conf.set(FlinkOptions.HIVE_STYLE_PARTITIONING, true);
|
||||
final RowDataKeyGen keyGen2 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE);
|
||||
assertThat(keyGen2.getPartitionPath(rowData1), is("partition=par1"));
|
||||
assertThat(keyGen2.getPartitionPath(rowData2), is("partition=default"));
|
||||
assertThat(keyGen2.getPartitionPath(rowData3), is("partition=default"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testComplexKeyAndPartition() {
|
||||
Configuration conf = TestConfigurations.getDefaultConf("path1");
|
||||
conf.set(FlinkOptions.RECORD_KEY_FIELD, "uuid,name");
|
||||
conf.set(FlinkOptions.PARTITION_PATH_FIELD, "partition,ts");
|
||||
RowData rowData1 = insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23,
|
||||
TimestampData.fromEpochMillis(1), StringData.fromString("par1"));
|
||||
RowDataKeyGen keyGen1 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE);
|
||||
assertThat(keyGen1.getRecordKey(rowData1), is("uuid:id1,name:Danny"));
|
||||
assertThat(keyGen1.getPartitionPath(rowData1), is("par1/1970-01-01T00:00:00.001"));
|
||||
|
||||
// null record key and partition path
|
||||
final RowData rowData2 = insertRow(null, null, 23, null, null);
|
||||
assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData2));
|
||||
assertThat(keyGen1.getPartitionPath(rowData2), is("default/default"));
|
||||
// empty record key and partition path
|
||||
final RowData rowData3 = insertRow(StringData.fromString(""), StringData.fromString(""), 23,
|
||||
TimestampData.fromEpochMillis(1), StringData.fromString(""));
|
||||
assertThrows(HoodieKeyException.class, () -> keyGen1.getRecordKey(rowData3));
|
||||
assertThat(keyGen1.getPartitionPath(rowData3), is("default/1970-01-01T00:00:00.001"));
|
||||
|
||||
// hive style partitioning
|
||||
conf.set(FlinkOptions.HIVE_STYLE_PARTITIONING, true);
|
||||
final RowDataKeyGen keyGen2 = RowDataKeyGen.instance(conf, TestConfigurations.ROW_TYPE);
|
||||
assertThat(keyGen2.getPartitionPath(rowData1), is("partition=par1/ts=1970-01-01T00:00:00.001"));
|
||||
assertThat(keyGen2.getPartitionPath(rowData2), is("partition=default/ts=default"));
|
||||
assertThat(keyGen2.getPartitionPath(rowData3), is("partition=default/ts=1970-01-01T00:00:00.001"));
|
||||
}
|
||||
}
|
||||
@@ -25,6 +25,7 @@ import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.util.StreamerUtil;
|
||||
import org.apache.hudi.utils.TestConfigurations;
|
||||
import org.apache.hudi.utils.TestData;
|
||||
import org.apache.hudi.utils.TestSQL;
|
||||
import org.apache.hudi.utils.TestUtils;
|
||||
import org.apache.hudi.utils.factory.CollectSinkTableFactory;
|
||||
|
||||
@@ -48,6 +49,7 @@ import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.EnumSource;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
import org.junit.jupiter.params.provider.ValueSource;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
@@ -66,7 +68,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
/**
|
||||
* IT cases for Hoodie table source and sink.
|
||||
*
|
||||
* <p>
|
||||
* Note: should add more SQL cases when batch write is supported.
|
||||
*/
|
||||
public class HoodieDataSourceITCase extends AbstractTestBase {
|
||||
@@ -267,17 +269,8 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
|
||||
}
|
||||
String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
|
||||
tableEnv.executeSql(hoodieTableDDL);
|
||||
String insertInto = "insert into t1 values\n"
|
||||
+ "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'),\n"
|
||||
+ "('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),\n"
|
||||
+ "('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'),\n"
|
||||
+ "('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),\n"
|
||||
+ "('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),\n"
|
||||
+ "('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),\n"
|
||||
+ "('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),\n"
|
||||
+ "('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4')";
|
||||
|
||||
execInsertSql(tableEnv, insertInto);
|
||||
execInsertSql(tableEnv, TestSQL.INSERT_T1);
|
||||
|
||||
List<Row> result1 = CollectionUtil.iterableToList(
|
||||
() -> tableEnv.sqlQuery("select * from t1").execute().collect());
|
||||
@@ -296,40 +289,40 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
|
||||
void testWriteAndReadParMiddle(ExecMode execMode) throws Exception {
|
||||
boolean streaming = execMode == ExecMode.STREAM;
|
||||
String hoodieTableDDL = "create table t1(\n"
|
||||
+ " uuid varchar(20),\n"
|
||||
+ " name varchar(10),\n"
|
||||
+ " age int,\n"
|
||||
+ " `partition` varchar(20),\n" // test streaming read with partition field in the middle
|
||||
+ " ts timestamp(3),\n"
|
||||
+ " PRIMARY KEY(uuid) NOT ENFORCED\n"
|
||||
+ ")\n"
|
||||
+ "PARTITIONED BY (`partition`)\n"
|
||||
+ "with (\n"
|
||||
+ " 'connector' = 'hudi',\n"
|
||||
+ " 'path' = '" + tempFile.getAbsolutePath() + "',\n"
|
||||
+ " 'read.streaming.enabled' = '" + streaming + "'\n"
|
||||
+ ")";
|
||||
+ " uuid varchar(20),\n"
|
||||
+ " name varchar(10),\n"
|
||||
+ " age int,\n"
|
||||
+ " `partition` varchar(20),\n" // test streaming read with partition field in the middle
|
||||
+ " ts timestamp(3),\n"
|
||||
+ " PRIMARY KEY(uuid) NOT ENFORCED\n"
|
||||
+ ")\n"
|
||||
+ "PARTITIONED BY (`partition`)\n"
|
||||
+ "with (\n"
|
||||
+ " 'connector' = 'hudi',\n"
|
||||
+ " 'path' = '" + tempFile.getAbsolutePath() + "',\n"
|
||||
+ " 'read.streaming.enabled' = '" + streaming + "'\n"
|
||||
+ ")";
|
||||
streamTableEnv.executeSql(hoodieTableDDL);
|
||||
String insertInto = "insert into t1 values\n"
|
||||
+ "('id1','Danny',23,'par1',TIMESTAMP '1970-01-01 00:00:01'),\n"
|
||||
+ "('id2','Stephen',33,'par1',TIMESTAMP '1970-01-01 00:00:02'),\n"
|
||||
+ "('id3','Julian',53,'par2',TIMESTAMP '1970-01-01 00:00:03'),\n"
|
||||
+ "('id4','Fabian',31,'par2',TIMESTAMP '1970-01-01 00:00:04'),\n"
|
||||
+ "('id5','Sophia',18,'par3',TIMESTAMP '1970-01-01 00:00:05'),\n"
|
||||
+ "('id6','Emma',20,'par3',TIMESTAMP '1970-01-01 00:00:06'),\n"
|
||||
+ "('id7','Bob',44,'par4',TIMESTAMP '1970-01-01 00:00:07'),\n"
|
||||
+ "('id8','Han',56,'par4',TIMESTAMP '1970-01-01 00:00:08')";
|
||||
+ "('id1','Danny',23,'par1',TIMESTAMP '1970-01-01 00:00:01'),\n"
|
||||
+ "('id2','Stephen',33,'par1',TIMESTAMP '1970-01-01 00:00:02'),\n"
|
||||
+ "('id3','Julian',53,'par2',TIMESTAMP '1970-01-01 00:00:03'),\n"
|
||||
+ "('id4','Fabian',31,'par2',TIMESTAMP '1970-01-01 00:00:04'),\n"
|
||||
+ "('id5','Sophia',18,'par3',TIMESTAMP '1970-01-01 00:00:05'),\n"
|
||||
+ "('id6','Emma',20,'par3',TIMESTAMP '1970-01-01 00:00:06'),\n"
|
||||
+ "('id7','Bob',44,'par4',TIMESTAMP '1970-01-01 00:00:07'),\n"
|
||||
+ "('id8','Han',56,'par4',TIMESTAMP '1970-01-01 00:00:08')";
|
||||
execInsertSql(streamTableEnv, insertInto);
|
||||
|
||||
final String expected = "["
|
||||
+ "id1,Danny,23,par1,1970-01-01T00:00:01, "
|
||||
+ "id2,Stephen,33,par1,1970-01-01T00:00:02, "
|
||||
+ "id3,Julian,53,par2,1970-01-01T00:00:03, "
|
||||
+ "id4,Fabian,31,par2,1970-01-01T00:00:04, "
|
||||
+ "id5,Sophia,18,par3,1970-01-01T00:00:05, "
|
||||
+ "id6,Emma,20,par3,1970-01-01T00:00:06, "
|
||||
+ "id7,Bob,44,par4,1970-01-01T00:00:07, "
|
||||
+ "id8,Han,56,par4,1970-01-01T00:00:08]";
|
||||
+ "id1,Danny,23,par1,1970-01-01T00:00:01, "
|
||||
+ "id2,Stephen,33,par1,1970-01-01T00:00:02, "
|
||||
+ "id3,Julian,53,par2,1970-01-01T00:00:03, "
|
||||
+ "id4,Fabian,31,par2,1970-01-01T00:00:04, "
|
||||
+ "id5,Sophia,18,par3,1970-01-01T00:00:05, "
|
||||
+ "id6,Emma,20,par3,1970-01-01T00:00:06, "
|
||||
+ "id7,Bob,44,par4,1970-01-01T00:00:07, "
|
||||
+ "id8,Han,56,par4,1970-01-01T00:00:08]";
|
||||
|
||||
List<Row> result = execSelectSql(streamTableEnv, "select * from t1", execMode);
|
||||
|
||||
@@ -350,17 +343,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
|
||||
String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
|
||||
tableEnv.executeSql(hoodieTableDDL);
|
||||
|
||||
final String insertInto1 = "insert into t1 values\n"
|
||||
+ "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'),\n"
|
||||
+ "('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),\n"
|
||||
+ "('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'),\n"
|
||||
+ "('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),\n"
|
||||
+ "('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),\n"
|
||||
+ "('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),\n"
|
||||
+ "('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),\n"
|
||||
+ "('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4')";
|
||||
|
||||
execInsertSql(tableEnv, insertInto1);
|
||||
execInsertSql(tableEnv, TestSQL.INSERT_T1);
|
||||
|
||||
// overwrite partition 'par1' and increase in age by 1
|
||||
final String insertInto2 = "insert overwrite t1 partition(`partition`='par1') values\n"
|
||||
@@ -519,7 +502,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
|
||||
|
||||
// execute query and assert throws exception
|
||||
assertThrows(HoodieException.class, () -> execSelectSql(streamTableEnv, "select * from t1", 10),
|
||||
"No successful commits under path " + tempFile.getAbsolutePath());
|
||||
"No successful commits under path " + tempFile.getAbsolutePath());
|
||||
|
||||
}
|
||||
|
||||
@@ -575,6 +558,80 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
|
||||
assertRowsEquals(result, expected);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@ValueSource(booleans = {true, false})
|
||||
void testBulkInsert(boolean hiveStylePartitioning) {
|
||||
TableEnvironment tableEnv = batchTableEnv;
|
||||
// csv source
|
||||
String csvSourceDDL = TestConfigurations.getCsvSourceDDL("csv_source", "test_source_5.data");
|
||||
tableEnv.executeSql(csvSourceDDL);
|
||||
|
||||
Map<String, String> options = new HashMap<>();
|
||||
options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
|
||||
options.put(FlinkOptions.OPERATION.key(), "bulk_insert");
|
||||
options.put(FlinkOptions.SINK_SHUFFLE_BY_PARTITION.key(), "true");
|
||||
if (hiveStylePartitioning) {
|
||||
options.put(FlinkOptions.HIVE_STYLE_PARTITIONING.key(), "true");
|
||||
}
|
||||
String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("hoodie_sink", options);
|
||||
tableEnv.executeSql(hoodieTableDDL);
|
||||
|
||||
String insertInto = "insert into hoodie_sink select * from csv_source";
|
||||
execInsertSql(tableEnv, insertInto);
|
||||
|
||||
List<Row> result1 = CollectionUtil.iterableToList(
|
||||
() -> tableEnv.sqlQuery("select * from hoodie_sink").execute().collect());
|
||||
assertRowsEquals(result1, TestData.DATA_SET_SOURCE_INSERT);
|
||||
// apply filters
|
||||
List<Row> result2 = CollectionUtil.iterableToList(
|
||||
() -> tableEnv.sqlQuery("select * from hoodie_sink where uuid > 'id5'").execute().collect());
|
||||
assertRowsEquals(result2, "["
|
||||
+ "id6,Emma,20,1970-01-01T00:00:06,par3, "
|
||||
+ "id7,Bob,44,1970-01-01T00:00:07,par4, "
|
||||
+ "id8,Han,56,1970-01-01T00:00:08,par4]");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testBulkInsertNonPartitionedTable() {
|
||||
TableEnvironment tableEnv = batchTableEnv;
|
||||
String hoodieTableDDL = "create table t1(\n"
|
||||
+ " uuid varchar(20),\n"
|
||||
+ " name varchar(10),\n"
|
||||
+ " age int,\n"
|
||||
+ " ts timestamp(3),\n"
|
||||
+ " `partition` varchar(20),\n"
|
||||
+ " PRIMARY KEY(uuid) NOT ENFORCED\n"
|
||||
+ ")\n"
|
||||
+ "with (\n"
|
||||
+ " 'connector' = 'hudi',\n"
|
||||
+ " 'path' = '" + tempFile.getAbsolutePath() + "',\n"
|
||||
+ " 'write.operation' = 'bulk_insert'\n"
|
||||
+ ")";
|
||||
tableEnv.executeSql(hoodieTableDDL);
|
||||
|
||||
final String insertInto1 = "insert into t1 values\n"
|
||||
+ "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1')";
|
||||
|
||||
execInsertSql(tableEnv, insertInto1);
|
||||
|
||||
final String insertInto2 = "insert into t1 values\n"
|
||||
+ "('id1','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par2'),\n"
|
||||
+ "('id1','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par3'),\n"
|
||||
+ "('id1','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par4'),\n"
|
||||
+ "('id1','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par5')";
|
||||
|
||||
execInsertSql(tableEnv, insertInto2);
|
||||
|
||||
List<Row> result = CollectionUtil.iterableToList(
|
||||
() -> tableEnv.sqlQuery("select * from t1").execute().collect());
|
||||
assertRowsEquals(result, "["
|
||||
+ "id1,Danny,23,1970-01-01T00:00:01,par1, "
|
||||
+ "id1,Stephen,33,1970-01-01T00:00:02,par2, "
|
||||
+ "id1,Julian,53,1970-01-01T00:00:03,par3, "
|
||||
+ "id1,Fabian,31,1970-01-01T00:00:04,par4, "
|
||||
+ "id1,Sophia,18,1970-01-01T00:00:05,par5]", 3);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Utilities
|
||||
// -------------------------------------------------------------------------
|
||||
@@ -606,7 +663,7 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
|
||||
}
|
||||
|
||||
private List<Row> execSelectSql(TableEnvironment tEnv, String select, ExecMode execMode)
|
||||
throws TableNotExistException, InterruptedException {
|
||||
throws TableNotExistException, InterruptedException {
|
||||
final String[] splits = select.split(" ");
|
||||
final String tableName = splits[splits.length - 1];
|
||||
switch (execMode) {
|
||||
@@ -621,12 +678,12 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
|
||||
}
|
||||
|
||||
private List<Row> execSelectSql(TableEnvironment tEnv, String select, long timeout)
|
||||
throws InterruptedException, TableNotExistException {
|
||||
throws InterruptedException, TableNotExistException {
|
||||
return execSelectSql(tEnv, select, timeout, null);
|
||||
}
|
||||
|
||||
private List<Row> execSelectSql(TableEnvironment tEnv, String select, long timeout, String sourceTable)
|
||||
throws InterruptedException, TableNotExistException {
|
||||
throws InterruptedException, TableNotExistException {
|
||||
final String sinkDDL;
|
||||
if (sourceTable != null) {
|
||||
// use the source table schema as the sink schema if the source table was specified, .
|
||||
|
||||
@@ -137,6 +137,22 @@ public class TestConfigurations {
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public static String getCsvSourceDDL(String tableName, String fileName) {
|
||||
String sourcePath = Objects.requireNonNull(Thread.currentThread()
|
||||
.getContextClassLoader().getResource(fileName)).toString();
|
||||
return "create table " + tableName + "(\n"
|
||||
+ " uuid varchar(20),\n"
|
||||
+ " name varchar(10),\n"
|
||||
+ " age int,\n"
|
||||
+ " ts timestamp(3),\n"
|
||||
+ " `partition` varchar(20)\n"
|
||||
+ ") with (\n"
|
||||
+ " 'connector' = 'filesystem',\n"
|
||||
+ " 'path' = '" + sourcePath + "',\n"
|
||||
+ " 'format' = 'csv'\n"
|
||||
+ ")";
|
||||
}
|
||||
|
||||
public static final RowDataSerializer SERIALIZER = new RowDataSerializer(ROW_TYPE);
|
||||
|
||||
public static Configuration getDefaultConf(String tablePath) {
|
||||
|
||||
@@ -515,7 +515,7 @@ public class TestData {
|
||||
return Strings.join(fields, ",");
|
||||
}
|
||||
|
||||
private static BinaryRowData insertRow(Object... fields) {
|
||||
public static BinaryRowData insertRow(Object... fields) {
|
||||
LogicalType[] types = TestConfigurations.ROW_TYPE.getFields().stream().map(RowType.RowField::getType)
|
||||
.toArray(LogicalType[]::new);
|
||||
assertEquals(
|
||||
|
||||
36
hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java
Normal file
36
hudi-flink/src/test/java/org/apache/hudi/utils/TestSQL.java
Normal file
@@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.utils;
|
||||
|
||||
/**
|
||||
* Test sql statements.
|
||||
*/
|
||||
public class TestSQL {
|
||||
private TestSQL() {}
|
||||
|
||||
public static final String INSERT_T1 = "insert into t1 values\n"
|
||||
+ "('id1','Danny',23,TIMESTAMP '1970-01-01 00:00:01','par1'),\n"
|
||||
+ "('id2','Stephen',33,TIMESTAMP '1970-01-01 00:00:02','par1'),\n"
|
||||
+ "('id3','Julian',53,TIMESTAMP '1970-01-01 00:00:03','par2'),\n"
|
||||
+ "('id4','Fabian',31,TIMESTAMP '1970-01-01 00:00:04','par2'),\n"
|
||||
+ "('id5','Sophia',18,TIMESTAMP '1970-01-01 00:00:05','par3'),\n"
|
||||
+ "('id6','Emma',20,TIMESTAMP '1970-01-01 00:00:06','par3'),\n"
|
||||
+ "('id7','Bob',44,TIMESTAMP '1970-01-01 00:00:07','par4'),\n"
|
||||
+ "('id8','Han',56,TIMESTAMP '1970-01-01 00:00:08','par4')";
|
||||
}
|
||||
8
hudi-flink/src/test/resources/test_source_5.data
Normal file
8
hudi-flink/src/test/resources/test_source_5.data
Normal file
@@ -0,0 +1,8 @@
|
||||
id1,Danny,23,1970-01-01 00:00:01,par1
|
||||
id2,Stephen,33,1970-01-01 00:00:02,par1
|
||||
id3,Julian,53,1970-01-01 00:00:03,par2
|
||||
id4,Fabian,31,1970-01-01 00:00:04,par2
|
||||
id5,Sophia,18,1970-01-01 00:00:05,par3
|
||||
id6,Emma,20,1970-01-01 00:00:06,par3
|
||||
id7,Bob,44,1970-01-01 00:00:07,par4
|
||||
id8,Han,56,1970-01-01 00:00:08,par4
|
||||
Reference in New Issue
Block a user