1
0

[HUDI-1290] Add Debezium Source for deltastreamer (#4063)

* add source for postgres debezium

* Add tests for debezium payload

* Fix test

* Fix test

* Add tests for debezium source

* Add tests for debezium source

* Fix schema for debezium

* Fix checkstyle issues

* Fix config issue for schema registry

* Add mysql source for debezium

* Fix checkstyle issues and tests

* Improve code for merging toasted values

* Improve code for merging toasted values

Co-authored-by: Rajesh Mahindra <rmahindra@Rajeshs-MacBook-Pro.local>
This commit is contained in:
rmahindra123
2021-11-24 17:57:02 -08:00
committed by GitHub
parent abc0175cf7
commit 83f8ed2ae3
13 changed files with 1520 additions and 1 deletions

View File

@@ -0,0 +1,209 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.utilities.sources.debezium;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.debezium.DebeziumConstants;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics;
import org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.hudi.utilities.schema.SchemaRegistryProvider;
import org.apache.hudi.utilities.sources.InputBatch;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.streaming.kafka010.KafkaTestUtils;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import java.io.IOException;
import java.util.UUID;
import java.util.stream.Stream;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.params.provider.Arguments.arguments;
import static org.mockito.Mockito.mock;
/**
 * Base test harness for Debezium deltastreamer sources.
 *
 * <p>Spins up an embedded Kafka broker per test, publishes a single synthetic Debezium
 * CDC envelope, fetches it through a {@link SourceFormatAdapter} wrapping the DB-specific
 * source under test, and validates both the common before/after field selection logic and
 * the DB-specific meta fields. Concrete subclasses supply the connector name, source class,
 * Avro schema, and source-specific meta-field generation/validation.
 */
public abstract class TestAbstractDebeziumSource extends UtilitiesTestBase {

  private static final String TEST_TOPIC_NAME = "hoodie_test";

  // Metrics are irrelevant to these tests; a mock satisfies the source constructor.
  private final HoodieDeltaStreamerMetrics metrics = mock(HoodieDeltaStreamerMetrics.class);
  private KafkaTestUtils testUtils;

  @BeforeAll
  public static void initClass() throws Exception {
    UtilitiesTestBase.initClass(false);
  }

  @AfterAll
  public static void cleanupClass() {
    UtilitiesTestBase.cleanupClass();
  }

  @BeforeEach
  public void setup() throws Exception {
    super.setup();
    // Fresh embedded Kafka per test so topics/offsets never leak between cases.
    testUtils = new KafkaTestUtils();
    testUtils.setup();
  }

  @AfterEach
  public void teardown() throws Exception {
    super.teardown();
    testUtils.teardown();
  }

  /**
   * Builds the minimal consumer/deltastreamer properties needed to read the test topic.
   * A random group id isolates each test run's consumer offsets.
   */
  private TypedProperties createPropsForJsonSource() {
    TypedProperties props = new TypedProperties();
    props.setProperty("hoodie.deltastreamer.source.kafka.topic", TEST_TOPIC_NAME);
    props.setProperty("bootstrap.servers", testUtils.brokerAddress());
    props.setProperty("auto.offset.reset", "earliest");
    props.setProperty(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false");
    // Registry URL is never contacted: MockSchemaRegistryProvider short-circuits the fetch.
    props.setProperty("hoodie.deltastreamer.schemaprovider.registry.url", "localhost");
    props.setProperty("hoodie.deltastreamer.source.kafka.value.deserializer.class", StringDeserializer.class.getName());
    props.setProperty(ConsumerConfig.GROUP_ID_CONFIG, UUID.randomUUID().toString());
    return props;
  }

  /** Connector/index name used as the Avro union branch prefix (e.g. "mysql", "postgres"). */
  protected abstract String getIndexName();

  /** Fully-qualified class name of the Debezium source implementation under test. */
  protected abstract String getSourceClass();

  /** DB-specific Debezium envelope schema (Avro JSON). */
  protected abstract String getSchema();

  /** Populates the DB-specific {@code source} meta-field block on the envelope record. */
  protected abstract GenericRecord generateMetaFields(GenericRecord record);

  /** Asserts the DB-specific flattened meta columns on the fetched rows. */
  protected abstract void validateMetaFields(Dataset<Row> records);

  @ParameterizedTest
  @MethodSource("testArguments")
  public void testDebeziumEvents(Operation operation) throws Exception {
    String sourceClass = getSourceClass();

    // topic setup.
    testUtils.createTopic(TEST_TOPIC_NAME, 2);
    TypedProperties props = createPropsForJsonSource();
    SchemaProvider schemaProvider = new MockSchemaRegistryProvider(props, jsc, this);
    SourceFormatAdapter debeziumSource = new SourceFormatAdapter(
        UtilHelpers.createSource(sourceClass, props, jsc, sparkSession, schemaProvider, metrics));

    testUtils.sendMessages(TEST_TOPIC_NAME, new String[] {generateDebeziumEvent(operation).toString()});

    InputBatch<Dataset<Row>> fetch = debeziumSource.fetchNewDataInRowFormat(Option.empty(), 10);
    assertEquals(1, fetch.getBatch().get().count());

    // Ensure the before fields are picked for DELETE CDC Events,
    // and after fields are picked for INSERT and UPDATE CDC Events.
    final String fieldPrefix = (operation.equals(Operation.DELETE)) ? "before_" : "after_";
    assertTrue(fetch.getBatch().get().select("type").collectAsList().stream()
        .allMatch(r -> r.getString(0).startsWith(fieldPrefix)));
    // BUGFIX: the original asserted the "type" column twice; the second check was
    // clearly intended for "payload" (records carry before_payload/after_payload).
    assertTrue(fetch.getBatch().get().select("payload").collectAsList().stream()
        .allMatch(r -> r.getString(0).startsWith(fieldPrefix)));

    // Validate DB specific meta fields
    validateMetaFields(fetch.getBatch().get());
  }

  /**
   * Builds a full Debezium envelope record for the given operation: op code, ts_ms,
   * populated before/after images, and DB-specific source meta fields.
   */
  private GenericRecord generateDebeziumEvent(Operation op) {
    Schema schema = new Schema.Parser().parse(getSchema());
    // The before/after fields are unions of [null, Value]; resolve the Value branch by name.
    String indexName = getIndexName().concat(".ghschema.gharchive.Value");
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(DebeziumConstants.INCOMING_OP_FIELD, op.op);
    rec.put(DebeziumConstants.INCOMING_TS_MS_FIELD, 100L);

    // Before image: field values prefixed "before_" so the test can detect which image won.
    Schema.Field beforeField = schema.getField(DebeziumConstants.INCOMING_BEFORE_FIELD);
    Schema beforeSchema = beforeField.schema().getTypes().get(beforeField.schema().getIndexNamed(indexName));
    GenericRecord beforeRecord = new GenericData.Record(beforeSchema);

    beforeRecord.put("id", 1);
    beforeRecord.put("date", "1/1/2020");
    beforeRecord.put("type", "before_type");
    beforeRecord.put("payload", "before_payload");
    beforeRecord.put("timestamp", 1000L);
    rec.put(DebeziumConstants.INCOMING_BEFORE_FIELD, beforeRecord);

    // After image: field values prefixed "after_".
    Schema.Field afterField = schema.getField(DebeziumConstants.INCOMING_AFTER_FIELD);
    Schema afterSchema = afterField.schema().getTypes().get(afterField.schema().getIndexNamed(indexName));
    GenericRecord afterRecord = new GenericData.Record(afterSchema);

    afterRecord.put("id", 1);
    afterRecord.put("date", "1/1/2021");
    afterRecord.put("type", "after_type");
    afterRecord.put("payload", "after_payload");
    afterRecord.put("timestamp", 3000L);
    rec.put(DebeziumConstants.INCOMING_AFTER_FIELD, afterRecord);

    return generateMetaFields(rec);
  }

  /**
   * Schema provider that returns the subclass-supplied schema instead of hitting a
   * real schema registry (the configured registry URL is a placeholder).
   */
  private static class MockSchemaRegistryProvider extends SchemaRegistryProvider {

    private final String schema;

    public MockSchemaRegistryProvider(TypedProperties props, JavaSparkContext jssc, TestAbstractDebeziumSource source) {
      super(props, jssc);
      schema = source.getSchema();
    }

    @Override
    public String fetchSchemaFromRegistry(String registryUrl) throws IOException {
      return schema;
    }
  }

  private static Stream<Arguments> testArguments() {
    return Stream.of(
        arguments(Operation.INSERT),
        arguments(Operation.UPDATE),
        arguments(Operation.DELETE)
    );
  }

  /** Debezium CDC op codes: c = create/insert, u = update, d = delete. */
  private enum Operation {
    INSERT("c"),
    UPDATE("u"),
    DELETE("d");

    public final String op;

    Operation(String op) {
      this.op = op;
    }
  }
}

View File

@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.utilities.sources.debezium;
import org.apache.hudi.common.model.debezium.DebeziumConstants;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Tests the MySQL Debezium source against the shared harness in
 * {@link TestAbstractDebeziumSource}: supplies the MySQL envelope schema, fills in the
 * MySQL-specific {@code source} block (binlog file/pos, txId, ts_ms), and checks the
 * flattened meta columns including the binlog-derived sequence.
 */
public class TestMysqlDebeziumSource extends TestAbstractDebeziumSource {

  private static final String MYSQL_GITHUB_SCHEMA = "{\"connect.name\": \"mysql.ghschema.gharchive.Envelope\",\n"
      + " \"fields\": [{\"default\": null,\"name\": \"before\",\"type\": [\"null\",{\"connect.name\": \"mysql.ghschema.gharchive.Value\",\n"
      + " \"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"date\",\"type\": \"string\"},{\"default\": null,\"name\": \"timestamp\",\n"
      + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"type\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"payload\",\n"
      + " \"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"org\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"created_at\",\n"
      + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"public\",\"type\": [\"null\",\"boolean\"]}],\"name\": \"Value\",\"type\": \"record\"\n"
      + " }]},{\"default\": null,\"name\": \"after\",\"type\": [\"null\",\"Value\"]},{\"name\": \"source\",\"type\": {\"connect.name\": \"io.debezium.connector.mysql.Source\",\n"
      + " \"fields\": [{\"name\": \"connector\",\"type\": \"string\"},{\"name\": \"name\",\"type\": \"string\"},{\"name\": \"ts_ms\",\"type\": \"long\"},\n"
      + " {\"name\": \"db\",\"type\": \"string\"},{\"name\": \"table\",\"type\": \"string\"},{\"default\": null,\n"
      + " \"name\": \"txId\",\"type\": [\"null\",\"long\"]},{\"name\": \"file\",\"type\": \"string\"},{\"default\": null,\"name\": \"pos\",\"type\": [\"null\",\"long\"]},{\"default\": null,\n"
      + " \"name\": \"row\",\"type\": [\"null\",\"long\"]}],\"name\": \"Source\",\"namespace\": \"io.debezium.connector.mysql\",\"type\": \"record\"\n"
      + " }},{\"name\": \"op\",\"type\": \"string\"},{\"default\": null,\"name\": \"ts_ms\",\"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"transaction\",\n"
      + " \"type\": [\"null\",{\"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"total_order\",\"type\": \"long\"},{\"name\": \"data_collection_order\",\n"
      + " \"type\": \"long\"}],\"name\": \"ConnectDefault\",\"namespace\": \"io.confluent.connect.avro\",\"type\": \"record\"}]}],\"name\": \"Envelope\",\n"
      + " \"namespace\": \"mysql.ghschema.gharchive\",\"type\": \"record\"}";

  private static final String TEST_DB = "ghschema";
  private static final String TEST_TABLE = "gharchive";
  private static final long TEST_TS_MS = 12345L;
  private static final long TEST_TXID = 543L;
  private static final String TEST_FILE = "mysql-bin.00007";
  private static final long TEST_POS = 98765L;
  // Sequence derived from binlog file suffix + position.
  private static final String EXPECTED_TEST_SEQ = "00007.98765";

  @Override
  protected String getIndexName() {
    return "mysql";
  }

  @Override
  protected String getSourceClass() {
    return MysqlDebeziumSource.class.getName();
  }

  @Override
  protected String getSchema() {
    return MYSQL_GITHUB_SCHEMA;
  }

  @Override
  protected GenericRecord generateMetaFields(GenericRecord rec) {
    Schema envelopeSchema = new Schema.Parser().parse(getSchema());
    Schema sourceSchema = envelopeSchema.getField(DebeziumConstants.INCOMING_SOURCE_FIELD).schema();

    // Source fields specific to the MySQL connector: binlog coordinates plus common meta.
    GenericRecord source = new GenericData.Record(sourceSchema);
    source.put("connector", getIndexName());
    source.put("name", getIndexName());
    source.put("db", TEST_DB);
    source.put("table", TEST_TABLE);
    source.put("ts_ms", TEST_TS_MS);
    source.put("txId", TEST_TXID);
    source.put("file", TEST_FILE);
    source.put("pos", TEST_POS);

    rec.put(DebeziumConstants.INCOMING_SOURCE_FIELD, source);
    return rec;
  }

  @Override
  protected void validateMetaFields(Dataset<Row> records) {
    // Every fetched row must carry the MySQL meta values we injected above.
    boolean shardOk = records.select(DebeziumConstants.FLATTENED_SHARD_NAME).collectAsList().stream()
        .allMatch(r -> r.getString(0).equals(getIndexName()));
    assertTrue(shardOk);

    boolean tsOk = records.select(DebeziumConstants.FLATTENED_TS_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getLong(0) == TEST_TS_MS);
    assertTrue(tsOk);

    boolean txOk = records.select(DebeziumConstants.FLATTENED_TX_ID_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getLong(0) == TEST_TXID);
    assertTrue(txOk);

    boolean seqOk = records.select(DebeziumConstants.ADDED_SEQ_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getString(0).equals(EXPECTED_TEST_SEQ));
    assertTrue(seqOk);
  }
}

View File

@@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.utilities.sources.debezium;
import org.apache.hudi.common.model.debezium.DebeziumConstants;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Tests the Postgres Debezium source against the shared harness in
 * {@link TestAbstractDebeziumSource}: supplies the Postgres envelope schema, fills in the
 * Postgres-specific {@code source} block (lsn, txId, ts_ms), and checks the flattened
 * meta columns.
 */
public class TestPostgresDebeziumSource extends TestAbstractDebeziumSource {

  private static final String POSTGRES_GITHUB_SCHEMA = "{\"connect.name\": \"postgres.ghschema.gharchive.Envelope\",\n"
      + " \"fields\": [{\"default\": null,\"name\": \"before\",\"type\": [\"null\",{\"connect.name\": \"postgres.ghschema.gharchive.Value\",\n"
      + " \"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"date\",\"type\": \"string\"},{\"default\": null,\"name\": \"timestamp\",\n"
      + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"type\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"payload\",\n"
      + " \"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"org\",\"type\": [\"null\",\"string\"]},{\"default\": null,\"name\": \"created_at\",\n"
      + " \"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"public\",\"type\": [\"null\",\"boolean\"]}],\"name\": \"Value\",\"type\": \"record\"\n"
      + " }]},{\"default\": null,\"name\": \"after\",\"type\": [\"null\",\"Value\"]},{\"name\": \"source\",\"type\": {\"connect.name\": \"io.debezium.connector.postgresql.Source\",\n"
      + " \"fields\": [{\"name\": \"connector\",\"type\": \"string\"},{\"name\": \"name\",\"type\": \"string\"},{\"name\": \"ts_ms\",\"type\": \"long\"},\n"
      + " {\"name\": \"db\",\"type\": \"string\"},{\"name\": \"schema\",\"type\": \"string\"},{\"name\": \"table\",\"type\": \"string\"},{\"default\": null,\n"
      + " \"name\": \"txId\",\"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"lsn\",\"type\": [\"null\",\"long\"]},{\"default\": null,\n"
      + " \"name\": \"xmin\",\"type\": [\"null\",\"long\"]}],\"name\": \"Source\",\"namespace\": \"io.debezium.connector.postgresql\",\"type\": \"record\"\n"
      + " }},{\"name\": \"op\",\"type\": \"string\"},{\"default\": null,\"name\": \"ts_ms\",\"type\": [\"null\",\"long\"]},{\"default\": null,\"name\": \"transaction\",\n"
      + " \"type\": [\"null\",{\"fields\": [{\"name\": \"id\",\"type\": \"string\"},{\"name\": \"total_order\",\"type\": \"long\"},{\"name\": \"data_collection_order\",\n"
      + " \"type\": \"long\"}],\"name\": \"ConnectDefault\",\"namespace\": \"io.confluent.connect.avro\",\"type\": \"record\"}]}],\"name\": \"Envelope\",\n"
      + " \"namespace\": \"postgres.ghschema.gharchive\",\"type\": \"record\"}";

  private static final String TEST_DB = "postgres";
  private static final String TEST_SCHEMA = "ghschema";
  private static final String TEST_TABLE = "gharchive";
  private static final long TEST_TS_MS = 12345L;
  private static final long TEST_TXID = 543L;
  private static final long TEST_LSN = 98765L;

  @Override
  protected String getIndexName() {
    return "postgres";
  }

  @Override
  protected String getSourceClass() {
    return PostgresDebeziumSource.class.getName();
  }

  @Override
  protected String getSchema() {
    return POSTGRES_GITHUB_SCHEMA;
  }

  @Override
  protected GenericRecord generateMetaFields(GenericRecord rec) {
    Schema envelopeSchema = new Schema.Parser().parse(getSchema());
    Schema sourceSchema = envelopeSchema.getField(DebeziumConstants.INCOMING_SOURCE_FIELD).schema();

    // Source fields specific to the Postgres connector: WAL position (lsn) plus common meta.
    GenericRecord source = new GenericData.Record(sourceSchema);
    source.put("connector", getIndexName());
    source.put("name", getIndexName());
    source.put("db", TEST_DB);
    source.put("schema", TEST_SCHEMA);
    source.put("table", TEST_TABLE);
    source.put("ts_ms", TEST_TS_MS);
    source.put("txId", TEST_TXID);
    source.put("lsn", TEST_LSN);

    rec.put(DebeziumConstants.INCOMING_SOURCE_FIELD, source);
    return rec;
  }

  @Override
  protected void validateMetaFields(Dataset<Row> records) {
    // Every fetched row must carry the Postgres meta values we injected above.
    boolean tsOk = records.select(DebeziumConstants.FLATTENED_TS_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getLong(0) == TEST_TS_MS);
    assertTrue(tsOk);

    boolean txOk = records.select(DebeziumConstants.FLATTENED_TX_ID_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getLong(0) == TEST_TXID);
    assertTrue(txOk);

    boolean lsnOk = records.select(DebeziumConstants.FLATTENED_LSN_COL_NAME).collectAsList().stream()
        .allMatch(r -> r.getLong(0) == TEST_LSN);
    assertTrue(lsnOk);
  }
}