[HUDI-1902] Global index for flink writer (#2958)
Supports deduplication for record keys that come in with a different partition path.
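
The core idea: the Flink writer's index state is now keyed by record key alone and stores a global location that includes the partition path, so when the same key shows up under a new partition path the writer can delete the old copy and route the record to the new partition (when index.global.enabled is true, the default). A minimal, self-contained sketch of that bookkeeping, with a plain HashMap standing in for Flink's keyed MapState; the class and method names below are illustrative, not actual Hudi API:

import java.util.HashMap;
import java.util.Map;

// Illustrative sketch only: a plain map stands in for Flink MapState,
// and the "location" is reduced to just the partition path.
public class GlobalIndexSketch {
  // record key -> partition path of the latest copy of that key
  private final Map<String, String> index = new HashMap<>();

  /** Returns the partition path whose old copy must be deleted, or null if none. */
  public String route(String recordKey, String newPartitionPath) {
    String oldPartitionPath = index.get(recordKey);
    index.put(recordKey, newPartitionPath);
    if (oldPartitionPath != null && !oldPartitionPath.equals(newPartitionPath)) {
      // same key, different partition: the stale copy should be deleted
      return oldPartitionPath;
    }
    return null;
  }

  public static void main(String[] args) {
    GlobalIndexSketch sketch = new GlobalIndexSketch();
    sketch.route("id1", "par1");                 // first insert, nothing to delete
    String stale = sketch.route("id1", "par2");  // key moved: delete the copy in par1
    System.out.println("delete old copy in: " + stale); // prints par1
  }
}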
@@ -37,7 +37,7 @@ public abstract class BaseAvroPayload implements Serializable {
   /**
    * For purposes of preCombining.
    */
-  protected final Comparable orderingVal;
+  public final Comparable orderingVal;
 
   /**
    * Instantiate {@link BaseAvroPayload}.
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.model;
+
+import java.util.Objects;
+
+/**
+ * Similar with {@link org.apache.hudi.common.model.HoodieRecordLocation} but with partition path.
+ */
+public class HoodieRecordGlobalLocation extends HoodieRecordLocation {
+  private static final long serialVersionUID = 1L;
+
+  private String partitionPath;
+
+  public HoodieRecordGlobalLocation() {
+  }
+
+  public HoodieRecordGlobalLocation(String partitionPath, String instantTime, String fileId) {
+    super(instantTime, fileId);
+    this.partitionPath = partitionPath;
+  }
+
+  @Override
+  public String toString() {
+    final StringBuilder sb = new StringBuilder("HoodieGlobalRecordLocation {");
+    sb.append("partitionPath=").append(partitionPath).append(", ");
+    sb.append("instantTime=").append(instantTime).append(", ");
+    sb.append("fileId=").append(fileId);
+    sb.append('}');
+    return sb.toString();
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (o == null || getClass() != o.getClass()) {
+      return false;
+    }
+    HoodieRecordGlobalLocation otherLoc = (HoodieRecordGlobalLocation) o;
+    return Objects.equals(partitionPath, otherLoc.partitionPath)
+        && Objects.equals(instantTime, otherLoc.instantTime)
+        && Objects.equals(fileId, otherLoc.fileId);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(partitionPath, instantTime, fileId);
+  }
+
+  public String getPartitionPath() {
+    return partitionPath;
+  }
+
+  public void setPartitionPath(String partitionPath) {
+    this.partitionPath = partitionPath;
+  }
+
+  /**
+   * Returns the global record location from local.
+   */
+  public static HoodieRecordGlobalLocation fromLocal(String partitionPath, HoodieRecordLocation localLoc) {
+    return new HoodieRecordGlobalLocation(partitionPath, localLoc.getInstantTime(), localLoc.getFileId());
+  }
+
+  /**
+   * Returns the record location as local.
+   */
+  public HoodieRecordLocation toLocal(String instantTime) {
+    return new HoodieRecordLocation(instantTime, fileId);
+  }
+
+  /**
+   * Copy the location with given partition path.
+   */
+  public HoodieRecordGlobalLocation copy(String partitionPath) {
+    return new HoodieRecordGlobalLocation(partitionPath, instantTime, fileId);
+  }
+}
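
A small usage sketch of the new location type added above, showing the fromLocal/toLocal/copy round trip; the instant time and file id values are made up for illustration:

import org.apache.hudi.common.model.HoodieRecordGlobalLocation;
import org.apache.hudi.common.model.HoodieRecordLocation;

public class GlobalLocationExample {
  public static void main(String[] args) {
    // Illustrative values only.
    HoodieRecordLocation localLoc = new HoodieRecordLocation("20210501000000", "file-0001");

    // Attach a partition path to obtain a global location.
    HoodieRecordGlobalLocation globalLoc = HoodieRecordGlobalLocation.fromLocal("par1", localLoc);

    // Drop back to a local location, overriding the instant time with "U"
    // (the marker the bucket assigner uses for update buckets).
    HoodieRecordLocation asUpdate = globalLoc.toLocal("U");

    // Re-target the same instant/fileId under a different partition path.
    HoodieRecordGlobalLocation moved = globalLoc.copy("par2");

    System.out.println(globalLoc + " -> " + asUpdate + " / " + moved);
  }
}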
@@ -26,8 +26,8 @@ import java.util.Objects;
  */
 public class HoodieRecordLocation implements Serializable {
 
-  private String instantTime;
-  private String fileId;
+  protected String instantTime;
+  protected String fileId;
 
   public HoodieRecordLocation() {
   }
@@ -86,6 +86,13 @@ public class FlinkOptions {
       .defaultValue(1.5D)
       .withDescription("Index state ttl in days, default 1.5 day");
 
+  public static final ConfigOption<Boolean> INDEX_GLOBAL_ENABLED = ConfigOptions
+      .key("index.global.enabled")
+      .booleanType()
+      .defaultValue(true)
+      .withDescription("Whether to update index for the old partition path\n"
+          + "if same key record with different partition path came in, default true");
+
   // ------------------------------------------------------------------------
   // Read Options
   // ------------------------------------------------------------------------
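
The new option defaults to true; a job that prefers the old per-partition behaviour can turn it off. A sketch of reading and overriding it through Flink's Configuration, mirroring how BucketAssignFunction consumes it (setup code only, not a full pipeline):

import org.apache.flink.configuration.Configuration;
import org.apache.hudi.configuration.FlinkOptions;

public class GlobalIndexOptionExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();

    // Default is true: a key arriving under a new partition path replaces
    // the copy that lives in its old partition.
    boolean globalIndex = conf.getBoolean(FlinkOptions.INDEX_GLOBAL_ENABLED);
    System.out.println("index.global.enabled = " + globalIndex); // true

    // Opt out to keep the pre-existing per-partition behaviour,
    // where the same key may end up in several partitions.
    conf.setBoolean(FlinkOptions.INDEX_GLOBAL_ENABLED, false);
  }
}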
@@ -21,9 +21,11 @@ package org.apache.hudi.sink.partitioner;
 import org.apache.hudi.client.FlinkTaskContextSupplier;
 import org.apache.hudi.client.common.HoodieFlinkEngineContext;
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.model.BaseAvroPayload;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieRecordGlobalLocation;
 import org.apache.hudi.common.model.HoodieRecordLocation;
 import org.apache.hudi.common.model.HoodieTableType;
 import org.apache.hudi.common.model.WriteOperationType;
@@ -32,6 +34,7 @@ import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.configuration.FlinkOptions;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.index.HoodieIndexUtils;
+import org.apache.hudi.sink.utils.PayloadCreation;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.action.commit.BucketInfo;
 import org.apache.hudi.util.StreamerUtil;
@@ -90,7 +93,7 @@ public class BucketAssignFunction<K, I, O extends HoodieRecord<?>>
    * <li>If it does not, use the {@link BucketAssigner} to generate a new bucket ID</li>
    * </ul>
    */
-  private MapState<HoodieKey, HoodieRecordLocation> indexState;
+  private MapState<String, HoodieRecordGlobalLocation> indexState;
 
   /**
    * Bucket assigner to assign new bucket IDs or reuse existing ones.
@@ -110,11 +113,23 @@ public class BucketAssignFunction<K, I, O extends HoodieRecord<?>>
    */
   private MapState<String, Integer> partitionLoadState;
 
+  /**
+   * Used to create DELETE payload.
+   */
+  private PayloadCreation payloadCreation;
+
+  /**
+   * If the index is global, update the index for the old partition path
+   * if same key record with different partition path came in.
+   */
+  private final boolean globalIndex;
+
   public BucketAssignFunction(Configuration conf) {
     this.conf = conf;
     this.isChangingRecords = WriteOperationType.isChangingRecords(
         WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)));
     this.bootstrapIndex = conf.getBoolean(FlinkOptions.INDEX_BOOTSTRAP_ENABLED);
+    this.globalIndex = conf.getBoolean(FlinkOptions.INDEX_GLOBAL_ENABLED);
   }
 
   @Override
@@ -132,6 +147,7 @@ public class BucketAssignFunction<K, I, O extends HoodieRecord<?>>
         HoodieTableType.valueOf(conf.getString(FlinkOptions.TABLE_TYPE)),
         context,
         writeConfig);
+    this.payloadCreation = PayloadCreation.instance(this.conf);
   }
 
   @Override
@@ -141,11 +157,11 @@ public class BucketAssignFunction<K, I, O extends HoodieRecord<?>>
 
   @Override
   public void initializeState(FunctionInitializationContext context) {
-    MapStateDescriptor<HoodieKey, HoodieRecordLocation> indexStateDesc =
+    MapStateDescriptor<String, HoodieRecordGlobalLocation> indexStateDesc =
         new MapStateDescriptor<>(
             "indexState",
-            TypeInformation.of(HoodieKey.class),
-            TypeInformation.of(HoodieRecordLocation.class));
+            Types.STRING,
+            TypeInformation.of(HoodieRecordGlobalLocation.class));
     double ttl = conf.getDouble(FlinkOptions.INDEX_STATE_TTL) * 24 * 60 * 60 * 1000;
     if (ttl > 0) {
       indexStateDesc.enableTimeToLive(StateTtlConfigUtil.createTtlConfig((long) ttl));
@@ -166,38 +182,41 @@ public class BucketAssignFunction<K, I, O extends HoodieRecord<?>>
     // 3. if it is an INSERT, decide the location using the BucketAssigner then send it out.
     HoodieRecord<?> record = (HoodieRecord<?>) value;
     final HoodieKey hoodieKey = record.getKey();
-    final BucketInfo bucketInfo;
+    final String recordKey = hoodieKey.getRecordKey();
+    final String partitionPath = hoodieKey.getPartitionPath();
     final HoodieRecordLocation location;
 
     // The dataset may be huge, thus the processing would block for long,
     // disabled by default.
-    if (bootstrapIndex && !partitionLoadState.contains(hoodieKey.getPartitionPath())) {
+    if (bootstrapIndex && !partitionLoadState.contains(partitionPath)) {
       // If the partition records are never loaded, load the records first.
-      loadRecords(hoodieKey.getPartitionPath());
+      loadRecords(partitionPath);
     }
     // Only changing records need looking up the index for the location,
     // append only records are always recognized as INSERT.
-    if (isChangingRecords && this.indexState.contains(hoodieKey)) {
+    if (isChangingRecords && indexState.contains(recordKey)) {
       // Set up the instant time as "U" to mark the bucket as an update bucket.
-      location = new HoodieRecordLocation("U", this.indexState.get(hoodieKey).getFileId());
-      this.bucketAssigner.addUpdate(record.getPartitionPath(), location.getFileId());
-    } else {
-      bucketInfo = this.bucketAssigner.addInsert(hoodieKey.getPartitionPath());
-      switch (bucketInfo.getBucketType()) {
-        case INSERT:
-          // This is an insert bucket, use HoodieRecordLocation instant time as "I".
-          // Downstream operators can then check the instant time to know whether
-          // a record belongs to an insert bucket.
-          location = new HoodieRecordLocation("I", bucketInfo.getFileIdPrefix());
-          break;
-        case UPDATE:
-          location = new HoodieRecordLocation("U", bucketInfo.getFileIdPrefix());
-          break;
-        default:
-          throw new AssertionError();
+      HoodieRecordGlobalLocation oldLoc = this.indexState.get(recordKey);
+      if (!StreamerUtil.equal(oldLoc.getPartitionPath(), partitionPath)) {
+        if (globalIndex) {
+          // if partition path changes, emit a delete record for old partition path,
+          // then update the index state using location with new partition path.
+          HoodieRecord<?> deleteRecord = new HoodieRecord<>(new HoodieKey(recordKey, oldLoc.getPartitionPath()),
+              payloadCreation.createDeletePayload((BaseAvroPayload) record.getData()));
+          deleteRecord.setCurrentLocation(oldLoc.toLocal("U"));
+          deleteRecord.seal();
+          out.collect((O) deleteRecord);
+        }
+        location = getNewRecordLocation(partitionPath);
+        updateIndexState(recordKey, partitionPath, location);
+      } else {
+        location = oldLoc.toLocal("U");
+        this.bucketAssigner.addUpdate(partitionPath, location.getFileId());
       }
+    } else {
+      location = getNewRecordLocation(partitionPath);
       if (isChangingRecords) {
-        this.indexState.put(hoodieKey, location);
+        updateIndexState(recordKey, partitionPath, location);
       }
     }
     record.unseal();
@@ -206,6 +225,32 @@ public class BucketAssignFunction<K, I, O extends HoodieRecord<?>>
     out.collect((O) record);
   }
 
+  private HoodieRecordLocation getNewRecordLocation(String partitionPath) {
+    final BucketInfo bucketInfo = this.bucketAssigner.addInsert(partitionPath);
+    final HoodieRecordLocation location;
+    switch (bucketInfo.getBucketType()) {
+      case INSERT:
+        // This is an insert bucket, use HoodieRecordLocation instant time as "I".
+        // Downstream operators can then check the instant time to know whether
+        // a record belongs to an insert bucket.
+        location = new HoodieRecordLocation("I", bucketInfo.getFileIdPrefix());
+        break;
+      case UPDATE:
+        location = new HoodieRecordLocation("U", bucketInfo.getFileIdPrefix());
+        break;
+      default:
+        throw new AssertionError();
+    }
+    return location;
+  }
+
+  private void updateIndexState(
+      String recordKey,
+      String partitionPath,
+      HoodieRecordLocation localLoc) throws Exception {
+    this.indexState.put(recordKey, HoodieRecordGlobalLocation.fromLocal(partitionPath, localLoc));
+  }
+
   @Override
   public void notifyCheckpointComplete(long l) {
     // Refresh the table state when there are new commits.
@@ -245,7 +290,8 @@ public class BucketAssignFunction<K, I, O extends HoodieRecord<?>>
             boolean shouldLoad = KeyGroupRangeAssignment.assignKeyToParallelOperator(
                 hoodieKey.getRecordKey(), maxParallelism, parallelism) == taskID;
             if (shouldLoad) {
-              this.indexState.put(hoodieKey, new HoodieRecordLocation(baseFile.getCommitTime(), baseFile.getFileId()));
+              this.indexState.put(hoodieKey.getRecordKey(),
+                  new HoodieRecordGlobalLocation(hoodieKey.getPartitionPath(), baseFile.getCommitTime(), baseFile.getFileId()));
             }
           } catch (Exception e) {
             LOG.error("Error when putting record keys into the state from file: {}", baseFile);
@@ -265,7 +311,7 @@ public class BucketAssignFunction<K, I, O extends HoodieRecord<?>>
   @VisibleForTesting
   public boolean isKeyInState(HoodieKey hoodieKey) {
     try {
-      return this.indexState.contains(hoodieKey);
+      return this.indexState.contains(hoodieKey.getRecordKey());
     } catch (Exception e) {
       throw new HoodieException(e);
     }
@@ -18,16 +18,12 @@
 
 package org.apache.hudi.sink.transform;
 
-import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.common.model.WriteOperationType;
-import org.apache.hudi.common.util.Option;
-import org.apache.hudi.common.util.ReflectionUtils;
-import org.apache.hudi.common.util.ValidationUtils;
 import org.apache.hudi.configuration.FlinkOptions;
 import org.apache.hudi.keygen.KeyGenerator;
+import org.apache.hudi.sink.utils.PayloadCreation;
 import org.apache.hudi.util.RowDataToAvroConverters;
 import org.apache.hudi.util.StreamerUtil;
 
@@ -39,11 +35,7 @@ import org.apache.flink.table.data.RowData;
 import org.apache.flink.table.types.logical.RowType;
 import org.apache.flink.types.RowKind;
 
-import javax.annotation.Nullable;
-
 import java.io.IOException;
-import java.io.Serializable;
-import java.lang.reflect.Constructor;
 
 /**
  * Function that transforms RowData to HoodieRecord.
@@ -116,53 +108,4 @@ public class RowDataToHoodieFunction<I extends RowData, O extends HoodieRecord<?
     HoodieRecordPayload payload = payloadCreation.createPayload(gr, isDelete);
     return new HoodieRecord<>(hoodieKey, payload);
   }
-
-  /**
-   * Util to create hoodie pay load instance.
-   */
-  private static class PayloadCreation implements Serializable {
-    private static final long serialVersionUID = 1L;
-
-    private final boolean shouldCombine;
-    private final Constructor<?> constructor;
-    private final String preCombineField;
-
-    private PayloadCreation(
-        boolean shouldCombine,
-        Constructor<?> constructor,
-        @Nullable String preCombineField) {
-      this.shouldCombine = shouldCombine;
-      this.constructor = constructor;
-      this.preCombineField = preCombineField;
-    }
-
-    public static PayloadCreation instance(Configuration conf) throws Exception {
-      boolean shouldCombine = conf.getBoolean(FlinkOptions.INSERT_DROP_DUPS)
-          || WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)) == WriteOperationType.UPSERT;
-      String preCombineField = null;
-      final Class<?>[] argTypes;
-      final Constructor<?> constructor;
-      if (shouldCombine) {
-        preCombineField = conf.getString(FlinkOptions.PRECOMBINE_FIELD);
-        argTypes = new Class<?>[] {GenericRecord.class, Comparable.class};
-      } else {
-        argTypes = new Class<?>[] {Option.class};
-      }
-      final String clazz = conf.getString(FlinkOptions.PAYLOAD_CLASS);
-      constructor = ReflectionUtils.getClass(clazz).getConstructor(argTypes);
-      return new PayloadCreation(shouldCombine, constructor, preCombineField);
-    }
-
-    public HoodieRecordPayload<?> createPayload(GenericRecord record, boolean isDelete) throws Exception {
-      if (shouldCombine) {
-        ValidationUtils.checkState(preCombineField != null);
-        Comparable<?> orderingVal = (Comparable<?>) HoodieAvroUtils.getNestedFieldVal(record,
-            preCombineField, false);
-        return (HoodieRecordPayload<?>) constructor.newInstance(
-            isDelete ? null : record, orderingVal);
-      } else {
-        return (HoodieRecordPayload<?>) this.constructor.newInstance(Option.of(record));
-      }
-    }
-  }
 }
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.sink.utils;
+
+import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.common.model.BaseAvroPayload;
+import org.apache.hudi.common.model.HoodieRecordPayload;
+import org.apache.hudi.common.model.WriteOperationType;
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.ReflectionUtils;
+import org.apache.hudi.common.util.ValidationUtils;
+import org.apache.hudi.configuration.FlinkOptions;
+
+import org.apache.avro.generic.GenericRecord;
+import org.apache.flink.configuration.Configuration;
+
+import javax.annotation.Nullable;
+
+import java.io.Serializable;
+import java.lang.reflect.Constructor;
+
+/**
+ * Util to create hoodie pay load instance.
+ */
+public class PayloadCreation implements Serializable {
+  private static final long serialVersionUID = 1L;
+
+  private final boolean shouldCombine;
+  private final Constructor<?> constructor;
+  private final String preCombineField;
+
+  private PayloadCreation(
+      boolean shouldCombine,
+      Constructor<?> constructor,
+      @Nullable String preCombineField) {
+    this.shouldCombine = shouldCombine;
+    this.constructor = constructor;
+    this.preCombineField = preCombineField;
+  }
+
+  public static PayloadCreation instance(Configuration conf) throws Exception {
+    boolean shouldCombine = conf.getBoolean(FlinkOptions.INSERT_DROP_DUPS)
+        || WriteOperationType.fromValue(conf.getString(FlinkOptions.OPERATION)) == WriteOperationType.UPSERT;
+    String preCombineField = null;
+    final Class<?>[] argTypes;
+    final Constructor<?> constructor;
+    if (shouldCombine) {
+      preCombineField = conf.getString(FlinkOptions.PRECOMBINE_FIELD);
+      argTypes = new Class<?>[] {GenericRecord.class, Comparable.class};
+    } else {
+      argTypes = new Class<?>[] {Option.class};
+    }
+    final String clazz = conf.getString(FlinkOptions.PAYLOAD_CLASS);
+    constructor = ReflectionUtils.getClass(clazz).getConstructor(argTypes);
+    return new PayloadCreation(shouldCombine, constructor, preCombineField);
+  }
+
+  public HoodieRecordPayload<?> createPayload(GenericRecord record, boolean isDelete) throws Exception {
+    if (shouldCombine) {
+      ValidationUtils.checkState(preCombineField != null);
+      Comparable<?> orderingVal = (Comparable<?>) HoodieAvroUtils.getNestedFieldVal(record,
+          preCombineField, false);
+      return (HoodieRecordPayload<?>) constructor.newInstance(
+          isDelete ? null : record, orderingVal);
+    } else {
+      return (HoodieRecordPayload<?>) this.constructor.newInstance(Option.of(record));
+    }
+  }
+
+  public HoodieRecordPayload<?> createDeletePayload(BaseAvroPayload payload) throws Exception {
+    if (shouldCombine) {
+      return (HoodieRecordPayload<?>) constructor.newInstance(null, payload.orderingVal);
+    } else {
+      return (HoodieRecordPayload<?>) this.constructor.newInstance(Option.empty());
+    }
+  }
+}
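
For context, this is roughly how BucketAssignFunction uses the new utility above: it builds one PayloadCreation from the job configuration and derives a DELETE payload from the incoming record's payload, keeping its ordering value so preCombine can still compare the tombstone against live data. A hedged sketch only; the helper method and its name are illustrative:

import org.apache.flink.configuration.Configuration;
import org.apache.hudi.common.model.BaseAvroPayload;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.sink.utils.PayloadCreation;

public class DeletePayloadExample {

  // Illustrative helper: build a tombstone record for the copy that lives
  // under the old partition path, reusing the incoming payload's orderingVal.
  static HoodieRecord<?> deleteForOldPartition(
      Configuration conf,
      HoodieRecord<?> incoming,
      String recordKey,
      String oldPartitionPath) throws Exception {
    PayloadCreation payloadCreation = PayloadCreation.instance(conf);
    HoodieRecord<?> deleteRecord = new HoodieRecord<>(
        new HoodieKey(recordKey, oldPartitionPath),
        payloadCreation.createDeletePayload((BaseAvroPayload) incoming.getData()));
    return deleteRecord;
  }
}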
@@ -392,6 +392,56 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
     assertRowsEquals(result, "[id1,Sophia,18,1970-01-01T00:00:05,par5]");
   }
 
+  @Test
+  void testWriteGlobalIndex() {
+    // the source generates 4 commits
+    String createSource = TestConfigurations.getFileSourceDDL(
+        "source", "test_source_4.data", 4);
+    streamTableEnv.executeSql(createSource);
+
+    Map<String, String> options = new HashMap<>();
+    options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
+    options.put(FlinkOptions.INSERT_DROP_DUPS.key(), "true");
+    String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
+    streamTableEnv.executeSql(hoodieTableDDL);
+
+    final String insertInto2 = "insert into t1 select * from source";
+
+    execInsertSql(streamTableEnv, insertInto2);
+
+    List<Row> result = CollectionUtil.iterableToList(
+        () -> streamTableEnv.sqlQuery("select * from t1").execute().collect());
+    assertRowsEquals(result, "[id1,Phoebe,52,1970-01-01T00:00:08,par4]");
+  }
+
+  @Test
+  void testWriteLocalIndex() {
+    // the source generates 4 commits
+    String createSource = TestConfigurations.getFileSourceDDL(
+        "source", "test_source_4.data", 4);
+    streamTableEnv.executeSql(createSource);
+
+    Map<String, String> options = new HashMap<>();
+    options.put(FlinkOptions.PATH.key(), tempFile.getAbsolutePath());
+    options.put(FlinkOptions.INDEX_GLOBAL_ENABLED.key(), "false");
+    options.put(FlinkOptions.INSERT_DROP_DUPS.key(), "true");
+    String hoodieTableDDL = TestConfigurations.getCreateHoodieTableDDL("t1", options);
+    streamTableEnv.executeSql(hoodieTableDDL);
+
+    final String insertInto2 = "insert into t1 select * from source";
+
+    execInsertSql(streamTableEnv, insertInto2);
+
+    List<Row> result = CollectionUtil.iterableToList(
+        () -> streamTableEnv.sqlQuery("select * from t1").execute().collect());
+    final String expected = "["
+        + "id1,Stephen,34,1970-01-01T00:00:02,par1, "
+        + "id1,Fabian,32,1970-01-01T00:00:04,par2, "
+        + "id1,Jane,19,1970-01-01T00:00:06,par3, "
+        + "id1,Phoebe,52,1970-01-01T00:00:08,par4]";
+    assertRowsEquals(result, expected, 3);
+  }
+
   @Test
   void testStreamReadEmptyTablePath() throws Exception {
     // create an empty table
@@ -256,8 +256,20 @@ public class TestData {
    * @param expected Expected string of the sorted rows
    */
   public static void assertRowsEquals(List<Row> rows, String expected) {
+    assertRowsEquals(rows, expected, 0);
+  }
+
+  /**
+   * Sort the {@code rows} using field at index {@code orderingPos} and asserts
+   * it equals with the expected string {@code expected}.
+   *
+   * @param rows        Actual result rows
+   * @param expected    Expected string of the sorted rows
+   * @param orderingPos Field position for ordering
+   */
+  public static void assertRowsEquals(List<Row> rows, String expected, int orderingPos) {
     String rowsString = rows.stream()
-        .sorted(Comparator.comparing(o -> toStringSafely(o.getField(0))))
+        .sorted(Comparator.comparing(o -> toStringSafely(o.getField(orderingPos))))
         .collect(Collectors.toList()).toString();
     assertThat(rowsString, is(expected));
   }
hudi-flink/src/test/resources/test_source_4.data (new file, 8 lines)
@@ -0,0 +1,8 @@
+{"uuid": "id1", "name": "Danny", "age": 24, "ts": "1970-01-01T00:00:01", "partition": "par1"}
+{"uuid": "id1", "name": "Stephen", "age": 34, "ts": "1970-01-01T00:00:02", "partition": "par1"}
+{"uuid": "id1", "name": "Julian", "age": 54, "ts": "1970-01-01T00:00:03", "partition": "par2"}
+{"uuid": "id1", "name": "Fabian", "age": 32, "ts": "1970-01-01T00:00:04", "partition": "par2"}
+{"uuid": "id1", "name": "Sophia", "age": 18, "ts": "1970-01-01T00:00:05", "partition": "par3"}
+{"uuid": "id1", "name": "Jane", "age": 19, "ts": "1970-01-01T00:00:06", "partition": "par3"}
+{"uuid": "id1", "name": "Ella", "age": 38, "ts": "1970-01-01T00:00:07", "partition": "par4"}
+{"uuid": "id1", "name": "Phoebe", "age": 52, "ts": "1970-01-01T00:00:08", "partition": "par4"}