[HUDI-1290] Add Debezium Source for deltastreamer (#4063)
* add source for postgres debezium
* Add tests for debezium payload
* Fix test
* Fix test
* Add tests for debezium source
* Add tests for debezium source
* Fix schema for debezium
* Fix checkstyle issues
* Fix config issue for schema registry
* Add mysql source for debezium
* Fix checkstyle issues and tests
* Improve code for merging toasted values
* Improve code for merging toasted values

Co-authored-by: Rajesh Mahindra <rmahindra@Rajeshs-MacBook-Pro.local>
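For orientation, here is a sketch of how the new source can be instantiated, mirroring the wiring used by the tests in this commit. The property keys and the createSource(...)/SourceFormatAdapter calls are taken from the tests below; the topic name and registry URL are hypothetical placeholders, and jsc, sparkSession, schemaProvider, and metrics are assumed to be in scope:

    // Sketch only: keys mirror createPropsForJsonSource() in the tests below.
    TypedProperties props = new TypedProperties();
    props.setProperty("hoodie.deltastreamer.source.kafka.topic", "postgres.ghschema.gharchive");
    props.setProperty("bootstrap.servers", "localhost:9092");
    props.setProperty("auto.offset.reset", "earliest");
    props.setProperty("hoodie.deltastreamer.schemaprovider.registry.url",
        "http://localhost:8081/subjects/postgres.ghschema.gharchive-value/versions/latest");

    SourceFormatAdapter source = new SourceFormatAdapter(
        UtilHelpers.createSource(PostgresDebeziumSource.class.getName(), props, jsc, sparkSession,
            schemaProvider, metrics));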
@@ -49,7 +49,7 @@ public class SchemaRegistryProvider extends SchemaProvider {
    */
   public static class Config {
 
-    private static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url";
+    public static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url";
     private static final String TARGET_SCHEMA_REGISTRY_URL_PROP =
         "hoodie.deltastreamer.schemaprovider.registry.targetUrl";
   }
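Making SRC_SCHEMA_REGISTRY_URL_PROP public lets the new DebeziumSource (below) look up the source schema registry URL directly, as used in its fetchNextBatch:

    String schemaStr = schemaRegistryProvider.fetchSchemaFromRegistry(
        props.getString(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP));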
@@ -0,0 +1,243 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.utilities.sources.debezium;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericRecord;

import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.hudi.utilities.schema.SchemaRegistryProvider;
import org.apache.hudi.utilities.sources.RowSource;
import org.apache.hudi.utilities.sources.helpers.AvroConvertor;
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen;
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;

import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.kafka010.OffsetRange;

/**
 * Base class for Debezium streaming sources, which expect change events as Kafka Avro records.
 * Obtains the schema from the Confluent schema registry.
 */
public abstract class DebeziumSource extends RowSource {

  private static final Logger LOG = LogManager.getLogger(DebeziumSource.class);
  // These are native Kafka configs; do not change the config names.
  private static final String NATIVE_KAFKA_KEY_DESERIALIZER_PROP = "key.deserializer";
  private static final String NATIVE_KAFKA_VALUE_DESERIALIZER_PROP = "value.deserializer";
  private static final String OVERRIDE_CHECKPOINT_STRING = "hoodie.debezium.override.initial.checkpoint.key";
  private static final String CONNECT_NAME_KEY = "connect.name";
  private static final String DATE_CONNECT_NAME = "custom.debezium.DateString";

  private final KafkaOffsetGen offsetGen;
  private final HoodieDeltaStreamerMetrics metrics;
  private final SchemaRegistryProvider schemaRegistryProvider;
  private final String deserializerClassName;

  public DebeziumSource(TypedProperties props, JavaSparkContext sparkContext,
                        SparkSession sparkSession,
                        SchemaProvider schemaProvider,
                        HoodieDeltaStreamerMetrics metrics) {
    super(props, sparkContext, sparkSession, schemaProvider);

    props.put(NATIVE_KAFKA_KEY_DESERIALIZER_PROP, StringDeserializer.class);
    deserializerClassName = props.getString(DataSourceWriteOptions.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS().key(),
        DataSourceWriteOptions.KAFKA_AVRO_VALUE_DESERIALIZER_CLASS().defaultValue());

    try {
      props.put(NATIVE_KAFKA_VALUE_DESERIALIZER_PROP, Class.forName(deserializerClassName));
    } catch (ClassNotFoundException e) {
      String error = "Could not load custom Avro Kafka deserializer: " + deserializerClassName;
      LOG.error(error);
      throw new HoodieException(error, e);
    }

    // Currently, the Debezium source requires a Confluent/Kafka schema registry to fetch the latest schema.
    if (schemaProvider == null || !(schemaProvider instanceof SchemaRegistryProvider)) {
      schemaRegistryProvider = new SchemaRegistryProvider(props, sparkContext);
    } else {
      schemaRegistryProvider = (SchemaRegistryProvider) schemaProvider;
    }

    offsetGen = new KafkaOffsetGen(props);
    this.metrics = metrics;
  }

  @Override
  protected Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {
    String overrideCheckpointStr = props.getString(OVERRIDE_CHECKPOINT_STRING, "");

    OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCkptStr, sourceLimit, metrics);
    long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
    LOG.info("About to read " + totalNewMsgs + " messages from Kafka for topic: " + offsetGen.getTopicName());

    if (totalNewMsgs == 0) {
      // If there are no new messages, return an empty dataframe with no schema, because the schema
      // from the schema registry can only be considered up to date if a change event has occurred.
      return Pair.of(Option.of(sparkSession.emptyDataFrame()),
          overrideCheckpointStr.isEmpty() ? CheckpointUtils.offsetsToStr(offsetRanges) : overrideCheckpointStr);
    } else {
      try {
        String schemaStr = schemaRegistryProvider.fetchSchemaFromRegistry(
            props.getString(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP));
        Dataset<Row> dataset = toDataset(offsetRanges, offsetGen, schemaStr);
        LOG.info(String.format("Spark schema of Kafka payload for topic %s:\n%s",
            offsetGen.getTopicName(), dataset.schema().treeString()));
        LOG.info(String.format("New checkpoint string: %s", CheckpointUtils.offsetsToStr(offsetRanges)));
        return Pair.of(Option.of(dataset),
            overrideCheckpointStr.isEmpty() ? CheckpointUtils.offsetsToStr(offsetRanges) : overrideCheckpointStr);
      } catch (IOException exc) {
        LOG.error("Fatal error reading and parsing incoming Debezium event", exc);
        throw new HoodieException("Fatal error reading and parsing incoming Debezium event", exc);
      }
    }
  }

  /**
   * Debezium Kafka payloads have a nested structure; flatten it in a way specific to the database type.
   *
   * @param rawKafkaData Dataset of Debezium CDC events read from Kafka
   * @return A flattened dataset.
   */
  protected abstract Dataset<Row> processDataset(Dataset<Row> rawKafkaData);

  /**
   * Converts a range of Kafka topic offsets into a Spark dataset.
   *
   * @param offsetRanges Offset ranges
   * @param offsetGen    KafkaOffsetGen
   * @param schemaStr    Avro schema of the Debezium event, as a string
   * @return Spark dataset
   */
  private Dataset<Row> toDataset(OffsetRange[] offsetRanges, KafkaOffsetGen offsetGen, String schemaStr) {
    AvroConvertor convertor = new AvroConvertor(schemaStr);
    Dataset<Row> kafkaData;
    if (deserializerClassName.equals(StringDeserializer.class.getName())) {
      kafkaData = AvroConversionUtils.createDataFrame(
          KafkaUtils.<String, String>createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent())
              .map(obj -> convertor.fromJson(obj.value()))
              .rdd(), schemaStr, sparkSession);
    } else {
      kafkaData = AvroConversionUtils.createDataFrame(
          KafkaUtils.createRDD(sparkContext, offsetGen.getKafkaParams(), offsetRanges, LocationStrategies.PreferConsistent())
              .map(obj -> (GenericRecord) obj.value())
              .rdd(), schemaStr, sparkSession);
    }

    // Flatten the Debezium payload, specific to each DB type (postgres/mysql/etc.)
    Dataset<Row> debeziumDataset = processDataset(kafkaData);

    // Some required transformations to ensure Debezium data types are converted to Spark-supported types.
    return convertArrayColumnsToString(convertColumnToNullable(sparkSession,
        convertDateColumns(debeziumDataset, new Schema.Parser().parse(schemaStr))));
  }

  /**
   * Converts string-formatted date columns into Spark date columns.
   *
   * @param dataset Spark dataset
   * @param schema  Avro schema from Debezium
   * @return Converted dataset
   */
  public static Dataset<Row> convertDateColumns(Dataset<Row> dataset, Schema schema) {
    if (schema.getField("before") != null) {
      List<String> dateFields = schema.getField("before")
          .schema()
          .getTypes()
          .get(1)
          .getFields()
          .stream()
          .filter(field -> {
            if (field.schema().getType() == Type.UNION) {
              return field.schema().getTypes().stream().anyMatch(
                  schemaInUnion -> DATE_CONNECT_NAME.equals(schemaInUnion.getProp(CONNECT_NAME_KEY))
              );
            } else {
              return DATE_CONNECT_NAME.equals(field.schema().getProp(CONNECT_NAME_KEY));
            }
          }).map(Field::name).collect(Collectors.toList());

      LOG.info("Date fields: " + dateFields.toString());

      for (String dateCol : dateFields) {
        dataset = dataset.withColumn(dateCol, functions.col(dateCol).cast(DataTypes.DateType));
      }
    }

    return dataset;
  }

  /**
   * Utility function for converting columns to nullable. This is useful when required to make a column
   * nullable to match a nullable column from Debezium change events.
   *
   * @param sparkSession SparkSession object
   * @param dataset      Dataframe to modify
   * @return Modified dataframe
   */
  private static Dataset<Row> convertColumnToNullable(SparkSession sparkSession, Dataset<Row> dataset) {
    List<String> columns = Arrays.asList(dataset.columns());
    StructField[] modifiedStructFields = Arrays.stream(dataset.schema().fields()).map(field -> columns
        .contains(field.name()) ? new StructField(field.name(), field.dataType(), true, field.metadata()) : field)
        .toArray(StructField[]::new);

    return sparkSession.createDataFrame(dataset.rdd(), new StructType(modifiedStructFields));
  }

  /**
   * Converts array types to string types, because not all Debezium array columns can be converted
   * to Spark array columns.
   *
   * @param dataset Dataframe to modify
   * @return Modified dataframe
   */
  private static Dataset<Row> convertArrayColumnsToString(Dataset<Row> dataset) {
    List<String> arrayColumns = Arrays.stream(dataset.schema().fields())
        .filter(field -> field.dataType().typeName().toLowerCase().startsWith("array"))
        .map(StructField::name)
        .collect(Collectors.toList());

    for (String colName : arrayColumns) {
      dataset = dataset.withColumn(colName, functions.col(colName).cast(DataTypes.StringType));
    }

    return dataset;
  }
}
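The contract for concrete sources is small: implement processDataset to flatten the engine-specific Debezium envelope. As a minimal sketch (a hypothetical pass-through class, not part of this commit; the real implementations follow below):

    package org.apache.hudi.utilities.sources.debezium;

    import org.apache.hudi.common.config.TypedProperties;
    import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics;
    import org.apache.hudi.utilities.schema.SchemaProvider;

    import org.apache.spark.api.java.JavaSparkContext;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    // Hypothetical example: a source that performs no flattening at all.
    public class NoOpDebeziumSource extends DebeziumSource {

      public NoOpDebeziumSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
                                SchemaProvider schemaProvider, HoodieDeltaStreamerMetrics metrics) {
        super(props, sparkContext, sparkSession, schemaProvider, metrics);
      }

      @Override
      protected Dataset<Row> processDataset(Dataset<Row> rawKafkaData) {
        // A real implementation selects the before/after sub-fields and the
        // engine-specific metadata, as the MySQL and Postgres sources below do.
        return rawKafkaData;
      }
    }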
@@ -0,0 +1,104 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.utilities.sources.debezium;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.debezium.DebeziumConstants;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics;
import org.apache.hudi.utilities.schema.SchemaProvider;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF2;
import org.apache.spark.sql.types.DataTypes;

import static org.apache.spark.sql.functions.callUDF;

/**
 * Source for incrementally ingesting Debezium-generated change logs for a MySQL database.
 */
public class MysqlDebeziumSource extends DebeziumSource {

  private final SQLContext sqlContext;
  private final String generateUniqueSeqUdfFn = "mysql_generate_order_key";

  public MysqlDebeziumSource(TypedProperties props, JavaSparkContext sparkContext,
                             SparkSession sparkSession,
                             SchemaProvider schemaProvider,
                             HoodieDeltaStreamerMetrics metrics) {
    super(props, sparkContext, sparkSession, schemaProvider, metrics);
    this.sqlContext = sparkSession.sqlContext();
    sqlContext.udf().register(generateUniqueSeqUdfFn,
        (UDF2<String, Long, String>) MysqlDebeziumSource::generateUniqueSequence, DataTypes.StringType);
  }

  /**
   * Debezium Kafka payloads have a nested structure (see https://debezium.io/documentation/reference/1.4/connectors/mysql.html).
   * This function flattens that nested structure for the MySQL data and also extracts a subset of the Debezium metadata fields.
   *
   * @param rowDataset Dataset containing Debezium payloads
   * @return New dataset with flattened columns
   */
  @Override
  protected Dataset<Row> processDataset(Dataset<Row> rowDataset) {
    Dataset<Row> flattenedDataset = rowDataset;
    if (rowDataset.columns().length > 0) {
      // Only flatten for non-empty schemas
      Dataset<Row> insertedOrUpdatedData = rowDataset
          .selectExpr(
              String.format("%s as %s", DebeziumConstants.INCOMING_OP_FIELD, DebeziumConstants.FLATTENED_OP_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_TS_MS_FIELD, DebeziumConstants.UPSTREAM_PROCESSING_TS_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_NAME_FIELD, DebeziumConstants.FLATTENED_SHARD_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TS_MS_FIELD, DebeziumConstants.FLATTENED_TS_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TXID_FIELD, DebeziumConstants.FLATTENED_TX_ID_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_FILE_FIELD, DebeziumConstants.FLATTENED_FILE_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_POS_FIELD, DebeziumConstants.FLATTENED_POS_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_ROW_FIELD, DebeziumConstants.FLATTENED_ROW_COL_NAME),
              String.format("%s.*", DebeziumConstants.INCOMING_AFTER_FIELD)
          )
          .filter(rowDataset.col(DebeziumConstants.INCOMING_OP_FIELD).notEqual(DebeziumConstants.DELETE_OP));

      Dataset<Row> deletedData = rowDataset
          .selectExpr(
              String.format("%s as %s", DebeziumConstants.INCOMING_OP_FIELD, DebeziumConstants.FLATTENED_OP_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_TS_MS_FIELD, DebeziumConstants.UPSTREAM_PROCESSING_TS_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_NAME_FIELD, DebeziumConstants.FLATTENED_SHARD_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TS_MS_FIELD, DebeziumConstants.FLATTENED_TS_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TXID_FIELD, DebeziumConstants.FLATTENED_TX_ID_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_FILE_FIELD, DebeziumConstants.FLATTENED_FILE_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_POS_FIELD, DebeziumConstants.FLATTENED_POS_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_ROW_FIELD, DebeziumConstants.FLATTENED_ROW_COL_NAME),
              String.format("%s.*", DebeziumConstants.INCOMING_BEFORE_FIELD)
          )
          .filter(rowDataset.col(DebeziumConstants.INCOMING_OP_FIELD).equalTo(DebeziumConstants.DELETE_OP));

      flattenedDataset = insertedOrUpdatedData.union(deletedData);
    }

    return flattenedDataset.withColumn(DebeziumConstants.ADDED_SEQ_COL_NAME,
        callUDF(generateUniqueSeqUdfFn, flattenedDataset.col(DebeziumConstants.FLATTENED_FILE_COL_NAME),
            flattenedDataset.col(DebeziumConstants.FLATTENED_POS_COL_NAME)));
  }

  private static String generateUniqueSequence(String fileId, Long pos) {
    return fileId.substring(fileId.lastIndexOf('.') + 1).concat("." + pos);
  }
}
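The added seq column orders MySQL events by binlog file and position: the numeric suffix of the binlog file name is concatenated with the offset within that file. A worked trace, using the values exercised by TestMysqlDebeziumSource below:

    // generateUniqueSequence("mysql-bin.00007", 98765L)
    //   "mysql-bin.00007".substring(lastIndexOf('.') + 1)  -> "00007"
    //   "00007".concat("." + 98765L)                       -> "00007.98765"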
@@ -0,0 +1,87 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.utilities.sources.debezium;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.debezium.DebeziumConstants;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics;
import org.apache.hudi.utilities.schema.SchemaProvider;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

/**
 * Source for incrementally ingesting Debezium-generated change logs for a Postgres database.
 */
public class PostgresDebeziumSource extends DebeziumSource {

  public PostgresDebeziumSource(TypedProperties props, JavaSparkContext sparkContext,
                                SparkSession sparkSession,
                                SchemaProvider schemaProvider,
                                HoodieDeltaStreamerMetrics metrics) {
    super(props, sparkContext, sparkSession, schemaProvider, metrics);
  }

  /**
   * Debezium Kafka payloads have a nested structure (see https://debezium.io/documentation/reference/1.4/connectors/postgresql.html#postgresql-create-events).
   * This function flattens that nested structure for the Postgres data and also extracts a subset of the Debezium metadata fields.
   *
   * @param rowDataset Dataset containing Debezium payloads
   * @return New dataset with flattened columns
   */
  @Override
  protected Dataset<Row> processDataset(Dataset<Row> rowDataset) {
    if (rowDataset.columns().length > 0) {
      // Pick selective Debezium and Postgres meta fields: pick the row values from the before field for
      // delete records, and the row values from the after field for insert or update records.
      Dataset<Row> insertedOrUpdatedData = rowDataset
          .selectExpr(
              String.format("%s as %s", DebeziumConstants.INCOMING_OP_FIELD, DebeziumConstants.FLATTENED_OP_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_TS_MS_FIELD, DebeziumConstants.UPSTREAM_PROCESSING_TS_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_NAME_FIELD, DebeziumConstants.FLATTENED_SHARD_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TS_MS_FIELD, DebeziumConstants.FLATTENED_TS_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TXID_FIELD, DebeziumConstants.FLATTENED_TX_ID_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_LSN_FIELD, DebeziumConstants.FLATTENED_LSN_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_XMIN_FIELD, DebeziumConstants.FLATTENED_XMIN_COL_NAME),
              String.format("%s.*", DebeziumConstants.INCOMING_AFTER_FIELD)
          )
          .filter(rowDataset.col(DebeziumConstants.INCOMING_OP_FIELD).notEqual(DebeziumConstants.DELETE_OP));

      Dataset<Row> deletedData = rowDataset
          .selectExpr(
              String.format("%s as %s", DebeziumConstants.INCOMING_OP_FIELD, DebeziumConstants.FLATTENED_OP_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_TS_MS_FIELD, DebeziumConstants.UPSTREAM_PROCESSING_TS_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_NAME_FIELD, DebeziumConstants.FLATTENED_SHARD_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TS_MS_FIELD, DebeziumConstants.FLATTENED_TS_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_TXID_FIELD, DebeziumConstants.FLATTENED_TX_ID_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_LSN_FIELD, DebeziumConstants.FLATTENED_LSN_COL_NAME),
              String.format("%s as %s", DebeziumConstants.INCOMING_SOURCE_XMIN_FIELD, DebeziumConstants.FLATTENED_XMIN_COL_NAME),
              String.format("%s.*", DebeziumConstants.INCOMING_BEFORE_FIELD)
          )
          .filter(rowDataset.col(DebeziumConstants.INCOMING_OP_FIELD).equalTo(DebeziumConstants.DELETE_OP));

      return insertedOrUpdatedData.union(deletedData);
    } else {
      return rowDataset;
    }
  }
}
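One detail worth noting in both flattening implementations: the insert/update branch and the delete branch list their selectExpr aliases in exactly the same order before being combined, because Spark's Dataset.union matches columns by position rather than by name. A small standalone illustration of the hazard (hypothetical, not part of this commit):

    SparkSession spark = SparkSession.builder().master("local[1]").getOrCreate();
    Dataset<Row> a = spark.sql("SELECT 1 AS x, 'u' AS op");
    Dataset<Row> b = spark.sql("SELECT 'd' AS op, 2 AS x");
    // a.union(b) pairs x with op by position: it fails on the type mismatch here,
    // and would silently mix columns if the types happened to line up.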
@@ -0,0 +1,209 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.utilities.sources.debezium;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.debezium.DebeziumConstants;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics;
import org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.hudi.utilities.schema.SchemaRegistryProvider;
import org.apache.hudi.utilities.sources.InputBatch;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.streaming.kafka010.KafkaTestUtils;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;

import java.io.IOException;
import java.util.UUID;
import java.util.stream.Stream;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.params.provider.Arguments.arguments;
import static org.mockito.Mockito.mock;

public abstract class TestAbstractDebeziumSource extends UtilitiesTestBase {

  private static final String TEST_TOPIC_NAME = "hoodie_test";

  private final HoodieDeltaStreamerMetrics metrics = mock(HoodieDeltaStreamerMetrics.class);
  private KafkaTestUtils testUtils;

  @BeforeAll
  public static void initClass() throws Exception {
    UtilitiesTestBase.initClass(false);
  }

  @AfterAll
  public static void cleanupClass() {
    UtilitiesTestBase.cleanupClass();
  }

  @BeforeEach
  public void setup() throws Exception {
    super.setup();
    testUtils = new KafkaTestUtils();
    testUtils.setup();
  }

  @AfterEach
  public void teardown() throws Exception {
    super.teardown();
    testUtils.teardown();
  }

  private TypedProperties createPropsForJsonSource() {
    TypedProperties props = new TypedProperties();
    props.setProperty("hoodie.deltastreamer.source.kafka.topic", TEST_TOPIC_NAME);
    props.setProperty("bootstrap.servers", testUtils.brokerAddress());
    props.setProperty("auto.offset.reset", "earliest");
    props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
    props.setProperty("hoodie.deltastreamer.schemaprovider.registry.url", "localhost");
    props.setProperty("hoodie.deltastreamer.source.kafka.value.deserializer.class", StringDeserializer.class.getName());
    props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString());

    return props;
  }

  protected abstract String getIndexName();

  protected abstract String getSourceClass();

  protected abstract String getSchema();

  protected abstract GenericRecord generateMetaFields(GenericRecord record);

  protected abstract void validateMetaFields(Dataset<Row> records);

  @ParameterizedTest
  @MethodSource("testArguments")
  public void testDebeziumEvents(Operation operation) throws Exception {

    String sourceClass = getSourceClass();

    // Topic setup.
    testUtils.createTopic(TEST_TOPIC_NAME, 2);
    TypedProperties props = createPropsForJsonSource();

    SchemaProvider schemaProvider = new MockSchemaRegistryProvider(props, jsc, this);
    SourceFormatAdapter debeziumSource = new SourceFormatAdapter(UtilHelpers.createSource(sourceClass, props, jsc, sparkSession, schemaProvider, metrics));

    testUtils.sendMessages(TEST_TOPIC_NAME, new String[] {generateDebeziumEvent(operation).toString()});

    InputBatch<Dataset<Row>> fetch = debeziumSource.fetchNewDataInRowFormat(Option.empty(), 10);
    assertEquals(1, fetch.getBatch().get().count());

    // Ensure the before fields are picked for DELETE CDC events,
    // and the after fields are picked for INSERT and UPDATE CDC events.
    final String fieldPrefix = (operation.equals(Operation.DELETE)) ? "before_" : "after_";
    assertTrue(fetch.getBatch().get().select("type").collectAsList().stream()
        .allMatch(r -> r.getString(0).startsWith(fieldPrefix)));
    assertTrue(fetch.getBatch().get().select("payload").collectAsList().stream()
        .allMatch(r -> r.getString(0).startsWith(fieldPrefix)));

    // Validate DB-specific meta fields
    validateMetaFields(fetch.getBatch().get());
  }

  private GenericRecord generateDebeziumEvent(Operation op) {
    Schema schema = new Schema.Parser().parse(getSchema());
    String indexName = getIndexName().concat(".ghschema.gharchive.Value");
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(DebeziumConstants.INCOMING_OP_FIELD, op.op);
    rec.put(DebeziumConstants.INCOMING_TS_MS_FIELD, 100L);

    // Before
    Schema.Field beforeField = schema.getField(DebeziumConstants.INCOMING_BEFORE_FIELD);
    Schema beforeSchema = beforeField.schema().getTypes().get(beforeField.schema().getIndexNamed(indexName));
    GenericRecord beforeRecord = new GenericData.Record(beforeSchema);

    beforeRecord.put("id", 1);
    beforeRecord.put("date", "1/1/2020");
    beforeRecord.put("type", "before_type");
    beforeRecord.put("payload", "before_payload");
    beforeRecord.put("timestamp", 1000L);
    rec.put(DebeziumConstants.INCOMING_BEFORE_FIELD, beforeRecord);

    // After
    Schema.Field afterField = schema.getField(DebeziumConstants.INCOMING_AFTER_FIELD);
    Schema afterSchema = afterField.schema().getTypes().get(afterField.schema().getIndexNamed(indexName));
    GenericRecord afterRecord = new GenericData.Record(afterSchema);

    afterRecord.put("id", 1);
    afterRecord.put("date", "1/1/2021");
    afterRecord.put("type", "after_type");
    afterRecord.put("payload", "after_payload");
    afterRecord.put("timestamp", 3000L);
    rec.put(DebeziumConstants.INCOMING_AFTER_FIELD, afterRecord);

    return generateMetaFields(rec);
  }

  private static class MockSchemaRegistryProvider extends SchemaRegistryProvider {

    private final String schema;

    public MockSchemaRegistryProvider(TypedProperties props, JavaSparkContext jssc, TestAbstractDebeziumSource source) {
      super(props, jssc);
      schema = source.getSchema();
    }

    @Override
    public String fetchSchemaFromRegistry(String registryUrl) throws IOException {
      return schema;
    }
  }

  private static Stream<Arguments> testArguments() {
    return Stream.of(
        arguments(Operation.INSERT),
        arguments(Operation.UPDATE),
        arguments(Operation.DELETE)
    );
  }

  private enum Operation {
    INSERT("c"),
    UPDATE("u"),
    DELETE("d");

    public final String op;

    Operation(String op) {
      this.op = op;
    }
  }
}
@@ -0,0 +1,100 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.utilities.sources.debezium;

import org.apache.hudi.common.model.debezium.DebeziumConstants;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

import static org.junit.jupiter.api.Assertions.assertTrue;

public class TestMysqlDebeziumSource extends TestAbstractDebeziumSource {

  private static final String MYSQL_GITHUB_SCHEMA = "{\"connect.name\": \"mysql.ghschema.gharchive.Envelope\",\n"
      + " \"fields\": [{\"default\": null,\"name\": \"before\",\"type\": [\"null\",{\"connect.name\": \"mysql.ghschema.gharchive.Value\",\n"
      + " \"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"date\",\"type\": \"string\"},{\"default\": null,\"name\": \"timestamp\",\n"
      + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"type\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"payload\",\n"
      + " \"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"org\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"created_at\",\n"
      + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"public\",\"type\": [\"null\",\"boolean\"]}],\"name\": \"Value\",\"type\": \"record\"\n"
      + " }]},{\"default\": null,\"name\": \"after\",\"type\": [\"null\",\"Value\"]},{\"name\": \"source\",\"type\": {\"connect.name\": \"io.debezium.connector.mysql.Source\",\n"
      + " \"fields\": [{\"name\": \"connector\",\"type\": \"string\"},{\"name\": \"name\",\"type\": \"string\"},{\"name\": \"ts_ms\",\"type\": \"long\"},\n"
      + " {\"name\": \"db\",\"type\": \"string\"},{\"name\": \"table\",\"type\": \"string\"},{\"default\": null,\n"
      + " \"name\": \"txId\",\"type\": [\"null\",\"long\"]},{\"name\": \"file\",\"type\": \"string\"},{\"default\": null,\"name\": \"pos\",\"type\": [\"null\",\"long\"]},{\"default\": null,\n"
      + " \"name\": \"row\",\"type\": [\"null\",\"long\"]}],\"name\": \"Source\",\"namespace\": \"io.debezium.connector.mysql\",\"type\": \"record\"\n"
      + " }},{\"name\": \"op\",\"type\": \"string\"},{\"default\": null,\"name\": \"ts_ms\",\"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"transaction\",\n"
      + " \"type\": [\"null\",{\"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"total_order\",\"type\": \"long\"},{\"name\": \"data_collection_order\",\n"
      + " \"type\": \"long\"}],\"name\": \"ConnectDefault\",\"namespace\": \"io.confluent.connect.avro\",\"type\": \"record\"}]}],\"name\": \"Envelope\",\n"
      + " \"namespace\": \"mysql.ghschema.gharchive\",\"type\": \"record\"}";

  private static final String TEST_DB = "ghschema";
  private static final String TEST_TABLE = "gharchive";
  private static final long TEST_TS_MS = 12345L;
  private static final long TEST_TXID = 543L;
  private static final String TEST_FILE = "mysql-bin.00007";
  private static final long TEST_POS = 98765L;
  private static final String EXPECTED_TEST_SEQ = "00007.98765";

  @Override
  protected String getIndexName() {
    return "mysql";
  }

  @Override
  protected String getSourceClass() {
    return MysqlDebeziumSource.class.getName();
  }

  @Override
  protected String getSchema() {
    return MYSQL_GITHUB_SCHEMA;
  }

  @Override
  protected GenericRecord generateMetaFields(GenericRecord rec) {
    Schema schema = new Schema.Parser().parse(getSchema());
    // Source fields specific to MySQL DB
    GenericRecord sourceRecord = new GenericData.Record(schema.getField(DebeziumConstants.INCOMING_SOURCE_FIELD).schema());
    sourceRecord.put("name", getIndexName());
    sourceRecord.put("connector", getIndexName());
    sourceRecord.put("db", TEST_DB);
    sourceRecord.put("table", TEST_TABLE);
    sourceRecord.put("ts_ms", TEST_TS_MS);
    sourceRecord.put("txId", TEST_TXID);
    sourceRecord.put("file", TEST_FILE);
    sourceRecord.put("pos", TEST_POS);
    rec.put(DebeziumConstants.INCOMING_SOURCE_FIELD, sourceRecord);
    return rec;
  }

  @Override
  protected void validateMetaFields(Dataset<Row> records) {
    assertTrue(records.select(DebeziumConstants.FLATTENED_SHARD_NAME).collectAsList().stream()
        .allMatch(r -> r.getString(0).equals(getIndexName())));
    assertTrue(records.select(DebeziumConstants.FLATTENED_TS_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getLong(0) == TEST_TS_MS));
    assertTrue(records.select(DebeziumConstants.FLATTENED_TX_ID_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getLong(0) == TEST_TXID));
    assertTrue(records.select(DebeziumConstants.ADDED_SEQ_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getString(0).equals(EXPECTED_TEST_SEQ)));
  }
}
@@ -0,0 +1,97 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.utilities.sources.debezium;

import org.apache.hudi.common.model.debezium.DebeziumConstants;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

import static org.junit.jupiter.api.Assertions.assertTrue;

public class TestPostgresDebeziumSource extends TestAbstractDebeziumSource {

  private static final String POSTGRES_GITHUB_SCHEMA = "{\"connect.name\": \"postgres.ghschema.gharchive.Envelope\",\n"
      + " \"fields\": [{\"default\": null,\"name\": \"before\",\"type\": [\"null\",{\"connect.name\": \"postgres.ghschema.gharchive.Value\",\n"
      + " \"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"date\",\"type\": \"string\"},{\"default\": null,\"name\": \"timestamp\",\n"
      + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"type\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"payload\",\n"
      + " \"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"org\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"created_at\",\n"
      + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"public\",\"type\": [\"null\",\"boolean\"]}],\"name\": \"Value\",\"type\": \"record\"\n"
      + " }]},{\"default\": null,\"name\": \"after\",\"type\": [\"null\",\"Value\"]},{\"name\": \"source\",\"type\": {\"connect.name\": \"io.debezium.connector.postgresql.Source\",\n"
      + " \"fields\": [{\"name\": \"connector\",\"type\": \"string\"},{\"name\": \"name\",\"type\": \"string\"},{\"name\": \"ts_ms\",\"type\": \"long\"},\n"
      + " {\"name\": \"db\",\"type\": \"string\"},{\"name\": \"schema\",\"type\": \"string\"},{\"name\": \"table\",\"type\": \"string\"},{\"default\": null,\n"
      + " \"name\": \"txId\",\"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"lsn\",\"type\": [\"null\",\"long\"]},{\"default\": null,\n"
      + " \"name\": \"xmin\",\"type\": [\"null\",\"long\"]}],\"name\": \"Source\",\"namespace\": \"io.debezium.connector.postgresql\",\"type\": \"record\"\n"
      + " }},{\"name\": \"op\",\"type\": \"string\"},{\"default\": null,\"name\": \"ts_ms\",\"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"transaction\",\n"
      + " \"type\": [\"null\",{\"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"total_order\",\"type\": \"long\"},{\"name\": \"data_collection_order\",\n"
      + " \"type\": \"long\"}],\"name\": \"ConnectDefault\",\"namespace\": \"io.confluent.connect.avro\",\"type\": \"record\"}]}],\"name\": \"Envelope\",\n"
      + " \"namespace\": \"postgres.ghschema.gharchive\",\"type\": \"record\"}";

  private static final String TEST_DB = "postgres";
  private static final String TEST_SCHEMA = "ghschema";
  private static final String TEST_TABLE = "gharchive";
  private static final long TEST_TS_MS = 12345L;
  private static final long TEST_TXID = 543L;
  private static final long TEST_LSN = 98765L;

  @Override
  protected String getIndexName() {
    return "postgres";
  }

  @Override
  protected String getSourceClass() {
    return PostgresDebeziumSource.class.getName();
  }

  @Override
  protected String getSchema() {
    return POSTGRES_GITHUB_SCHEMA;
  }

  @Override
  protected GenericRecord generateMetaFields(GenericRecord rec) {
    Schema schema = new Schema.Parser().parse(getSchema());
    // Source fields specific to Postgres DB
    GenericRecord sourceRecord = new GenericData.Record(schema.getField(DebeziumConstants.INCOMING_SOURCE_FIELD).schema());
    sourceRecord.put("name", getIndexName());
    sourceRecord.put("connector", getIndexName());
    sourceRecord.put("db", TEST_DB);
    sourceRecord.put("schema", TEST_SCHEMA);
    sourceRecord.put("table", TEST_TABLE);
    sourceRecord.put("ts_ms", TEST_TS_MS);
    sourceRecord.put("txId", TEST_TXID);
    sourceRecord.put("lsn", TEST_LSN);
    rec.put(DebeziumConstants.INCOMING_SOURCE_FIELD, sourceRecord);
    return rec;
  }

  @Override
  protected void validateMetaFields(Dataset<Row> records) {
    assertTrue(records.select(DebeziumConstants.FLATTENED_TS_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getLong(0) == TEST_TS_MS));
    assertTrue(records.select(DebeziumConstants.FLATTENED_TX_ID_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getLong(0) == TEST_TXID));
    assertTrue(records.select(DebeziumConstants.FLATTENED_LSN_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getLong(0) == TEST_LSN));
  }
}