diff --git a/.gitignore b/.gitignore index fcd673b34..413e0a0c4 100644 --- a/.gitignore +++ b/.gitignore @@ -78,4 +78,4 @@ dependency-reduced-pom.xml ####################################### hudi-integ-test/compose_env node_modules -package-lock.json \ No newline at end of file +package-lock.json diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index bbe6b10a7..4df7d0deb 100644 --- a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -46,6 +46,7 @@ import org.apache.hudi.keygen.SimpleAvroKeyGenerator; import org.apache.hudi.keygen.constant.KeyGeneratorType; import org.apache.hudi.metrics.MetricsReporterType; import org.apache.hudi.metrics.datadog.DatadogHttpClient.ApiSite; +import org.apache.hudi.table.RandomFileIdPrefixProvider; import org.apache.hudi.table.action.compact.CompactionTriggerStrategy; import org.apache.hudi.table.action.compact.strategy.CompactionStrategy; @@ -413,6 +414,12 @@ public class HoodieWriteConfig extends HoodieConfig { .withDocumentation("Whether to include '_hoodie_operation' in the metadata fields. " + "Once enabled, all the changes of a record are persisted to the delta log directly without merge"); + public static final ConfigProperty FILEID_PREFIX_PROVIDER_CLASS = ConfigProperty + .key("hoodie.fileid.prefix.provider.class") + .defaultValue(RandomFileIdPrefixProvider.class.getName()) + .sinceVersion("0.10.0") + .withDocumentation("File Id Prefix provider class, that implements `org.apache.hudi.fileid.FileIdPrefixProvider`"); + private ConsistencyGuardConfig consistencyGuardConfig; // Hoodie Write Client transparently rewrites File System View config when embedded mode is enabled @@ -1748,6 +1755,10 @@ public class HoodieWriteConfig extends HoodieConfig { return getBooleanOrDefault(ALLOW_OPERATION_METADATA_FIELD); } + public String getFileIdPrefixProviderClassName() { + return getString(FILEID_PREFIX_PROVIDER_CLASS); + } + public static class Builder { protected final HoodieWriteConfig writeConfig = new HoodieWriteConfig(); @@ -2079,6 +2090,11 @@ public class HoodieWriteConfig extends HoodieConfig { return this; } + public Builder withFileIdPrefixProviderClassName(String fileIdPrefixProviderClassName) { + writeConfig.setValue(FILEID_PREFIX_PROVIDER_CLASS, fileIdPrefixProviderClassName); + return this; + } + public Builder withProperties(Properties properties) { this.writeConfig.getProps().putAll(properties); return this; diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/FileIdPrefixProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/FileIdPrefixProvider.java new file mode 100644 index 000000000..d06da9b0d --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/FileIdPrefixProvider.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table; + +import java.util.Properties; + +public abstract class FileIdPrefixProvider { + + private final Properties props; + + public FileIdPrefixProvider(Properties props) { + this.props = props; + } + + public Properties getProps() { + return props; + } + + public abstract String createFilePrefix(String partitionPath); +} diff --git a/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/RandomFileIdPrefixProvider.java b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/RandomFileIdPrefixProvider.java new file mode 100644 index 000000000..89d993460 --- /dev/null +++ b/hudi-client/hudi-client-common/src/main/java/org/apache/hudi/table/RandomFileIdPrefixProvider.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.table; + +import org.apache.hudi.common.fs.FSUtils; + +import java.util.Properties; + +public class RandomFileIdPrefixProvider extends FileIdPrefixProvider { + + public RandomFileIdPrefixProvider(Properties props) { + super(props); + } + + @Override + public String createFilePrefix(String partitionPath) { + return FSUtils.createNewFileIdPfx(); + } +} diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java index 8b7cb198b..57b6306ce 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/client/HoodieJavaWriteClient.java @@ -18,8 +18,6 @@ package org.apache.hudi.client; -import com.codahale.metrics.Timer; -import org.apache.hadoop.conf.Configuration; import org.apache.hudi.client.common.HoodieJavaEngineContext; import org.apache.hudi.client.embedded.EmbeddedTimelineService; import org.apache.hudi.common.engine.HoodieEngineContext; @@ -30,6 +28,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.model.WriteOperationType; import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; @@ -41,6 +40,9 @@ import org.apache.hudi.table.HoodieJavaTable; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; +import com.codahale.metrics.Timer; +import org.apache.hadoop.conf.Configuration; + import java.io.IOException; import java.util.List; import java.util.Map; @@ -153,11 +155,23 @@ public class HoodieJavaWriteClient extends throw new HoodieNotSupportedException("BulkInsert is not supported in HoodieJavaClient"); } + public void transitionInflight(String instantTime) { + HoodieTableMetaClient metaClient = createMetaClient(true); + metaClient.getActiveTimeline().transitionRequestedToInflight( + new HoodieInstant(HoodieInstant.State.REQUESTED, metaClient.getCommitActionType(), instantTime), + Option.empty(), config.shouldAllowMultiWriteOnSameInstant()); + } + @Override public List bulkInsertPreppedRecords(List> preppedRecords, String instantTime, Option>>> bulkInsertPartitioner) { - throw new HoodieNotSupportedException("BulkInsertPreppedRecords is not supported in HoodieJavaClient"); + HoodieTable>, List, List> table = + getTableAndInitCtx(WriteOperationType.BULK_INSERT_PREPPED, instantTime); + table.validateInsertSchema(); + preWrite(instantTime, WriteOperationType.BULK_INSERT_PREPPED, table.getMetaClient()); + HoodieWriteMetadata> result = table.bulkInsertPrepped(context, instantTime, preppedRecords, bulkInsertPartitioner); + return postWrite(result, instantTime, table); } @Override diff --git a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java index 9142569f9..b7ea916aa 100644 --- a/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java +++ b/hudi-client/hudi-java-client/src/main/java/org/apache/hudi/table/action/commit/JavaBulkInsertHelper.java @@ -19,17 +19,18 
@@ package org.apache.hudi.table.action.commit; import org.apache.hudi.client.WriteStatus; -import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.model.HoodieKey; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.execution.JavaLazyInsertIterable; import org.apache.hudi.execution.bulkinsert.JavaBulkInsertInternalPartitionerFactory; import org.apache.hudi.io.CreateHandleFactory; import org.apache.hudi.table.BulkInsertPartitioner; +import org.apache.hudi.table.FileIdPrefixProvider; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.action.HoodieWriteMetadata; @@ -66,10 +67,14 @@ public class JavaBulkInsertHelper extends Abst final Option> userDefinedBulkInsertPartitioner) { HoodieWriteMetadata result = new HoodieWriteMetadata(); - //transition bulk_insert state to inflight - table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(HoodieInstant.State.REQUESTED, - table.getMetaClient().getCommitActionType(), instantTime), Option.empty(), - config.shouldAllowMultiWriteOnSameInstant()); + // It's possible the transition to inflight could have already happened. + if (!table.getActiveTimeline().filterInflights().containsInstant(instantTime)) { + table.getActiveTimeline().transitionRequestedToInflight( + new HoodieInstant(HoodieInstant.State.REQUESTED, table.getMetaClient().getCommitActionType(), instantTime), + Option.empty(), + config.shouldAllowMultiWriteOnSameInstant()); + } + // write new files List writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, userDefinedBulkInsertPartitioner, false, config.getBulkInsertShuffleParallelism(), false); //update index @@ -102,12 +107,16 @@ public class JavaBulkInsertHelper extends Abst : JavaBulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode()); repartitionedRecords = (List>) partitioner.repartitionRecords(dedupedRecords, parallelism); - String idPfx = FSUtils.createNewFileIdPfx(); + FileIdPrefixProvider fileIdPrefixProvider = (FileIdPrefixProvider) ReflectionUtils.loadClass( + config.getFileIdPrefixProviderClassName(), + config.getProps()); List writeStatuses = new ArrayList<>(); - new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true, config, instantTime, table, idPfx, - table.getTaskContextSupplier(), new CreateHandleFactory<>()).forEachRemaining(writeStatuses::addAll); + new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true, + config, instantTime, table, + fileIdPrefixProvider.createFilePrefix(""), table.getTaskContextSupplier(), + new CreateHandleFactory<>()).forEachRemaining(writeStatuses::addAll); return writeStatuses; } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java index 4e53cca36..18b7de2fd 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/config/ConfigGroups.java @@ -30,7 +30,8 @@ public class ConfigGroups { FLINK_SQL("Flink Sql Configs"), WRITE_CLIENT("Write Client Configs"), METRICS("Metrics Configs"), - RECORD_PAYLOAD("Record Payload Config"); + RECORD_PAYLOAD("Record Payload Config"), + KAFKA_CONNECT("Kafka Connect Configs"); 
public final String name;
@@ -72,6 +73,9 @@ public class ConfigGroups {
 description = "These set of configs are used to enable monitoring and reporting of key" + "Hudi stats and metrics.";
 break;
+ case KAFKA_CONNECT:
+ description = "This set of configs is used by the Kafka Connect Sink Connector for writing Hudi tables.";
+ break;
 default:
 description = "Please fill in the description for Config Group Name: " + names.name;
 break;
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/CommitUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/CommitUtils.java
index b571efa06..dee91b282 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/util/CommitUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/util/CommitUtils.java
@@ -26,6 +26,7 @@
 import org.apache.hudi.common.model.WriteOperationType;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.exception.HoodieException;
+
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java
index cde87d467..6016008e4 100644
--- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/SchemaTestUtil.java
@@ -71,17 +71,20 @@ public final class SchemaTestUtil {
 return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit);
 }
+ public static List generateTestJsonRecords(int from, int limit) throws IOException, URISyntaxException {
+ Path dataPath = initializeSampleDataPath();
+
+ try (Stream stream = Files.lines(dataPath)) {
+ return stream.skip(from).limit(limit).collect(Collectors.toList());
+ } catch (IOException e) {
+ throw new HoodieIOException("Could not read data from " + RESOURCE_SAMPLE_DATA, e);
+ }
+ }
+
 private static List toRecords(Schema writerSchema, Schema readerSchema, int from, int limit) throws IOException, URISyntaxException {
 GenericDatumReader reader = new GenericDatumReader<>(writerSchema, readerSchema);
- // Required to register the necessary JAR:// file system
- URI resource = SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI();
- Path dataPath;
- if (resource.toString().contains("!")) {
- dataPath = uriToPath(resource);
- } else {
- dataPath = Paths.get(SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI());
- }
+ Path dataPath = initializeSampleDataPath();
 try (Stream stream = Files.lines(dataPath)) {
 return stream.skip(from).limit(limit).map(s -> {
@@ -96,6 +99,21 @@ public final class SchemaTestUtil {
 }
 }
+ /**
+ * Required to register the necessary JAR:// file system.
+ * @return Path to the sample data in the resource file.
+ * @throws IOException
+ * @throws URISyntaxException
+ */
+ private static Path initializeSampleDataPath() throws IOException, URISyntaxException {
+ URI resource = SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI();
+ if (resource.toString().contains("!")) {
+ return uriToPath(resource);
+ } else {
+ return Paths.get(SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI());
+ }
+ }
+
 public static Path uriToPath(URI uri) throws IOException {
 final Map env = new HashMap<>();
 final String[] array = uri.toString().split("!");
diff --git a/hudi-kafka-connect/README.md b/hudi-kafka-connect/README.md
new file mode 100644
index 000000000..fd0a5d010
--- /dev/null
+++ b/hudi-kafka-connect/README.md
@@ -0,0 +1,94 @@
+
+
+# Quick Start guide for Kafka Connect Sink for Hudi
+
+This directory contains the Kafka Connect Sink Connector for Hudi, which writes records from Kafka topics into a Hudi table.
+
+## Building the connector
+
+The first thing you need to do to start using this connector is to build it. In order to do that, you need to install the following dependencies:
+
+- [Java 1.8+](https://openjdk.java.net/)
+- [Apache Maven](https://maven.apache.org/)
+
+After installing these dependencies, execute the following command:
+
+```bash
+cd $HUDI_DIR
+mvn clean package
+```
+
+## Incremental Builds
+
+```bash
+mvn clean -pl hudi-kafka-connect install -DskipTests
+mvn clean -pl packaging/hudi-kafka-connect-bundle install
+```
+
+## Put the Hudi connector in the Kafka Connect classpath
+
+```bash
+cp $HUDI_DIR/packaging/hudi-kafka-connect-bundle/target/hudi-kafka-connect-bundle-0.10.0-SNAPSHOT.jar /usr/local/share/java/hudi-kafka-connect/
+```
+
+## Trying the connector
+
+After building the package, we need to install Apache Kafka.
+
+### 1 - Starting the environment
+
+Start Zookeeper and Kafka from the Kafka installation directory:
+
+```bash
+./bin/zookeeper-server-start.sh ./config/zookeeper.properties
+./bin/kafka-server-start.sh ./config/server.properties
+```
+
+Wait until the Kafka cluster is up and running.
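+
+If you want to confirm the broker is reachable before moving on, a quick sanity check (assuming the default listener on `localhost:9092`) is:
+
+```bash
+# Lists the existing topics; it only succeeds once the broker is up
+./bin/kafka-topics.sh --list --bootstrap-server localhost:9092
+```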
+
+### 2 - Create the Hudi Control Topic for Coordination of the transactions
+
+The control topic should have only `1` partition.
+
+```bash
+./bin/kafka-topics.sh --delete --topic hudi-control-topic --bootstrap-server localhost:9092
+./bin/kafka-topics.sh --create --topic hudi-control-topic --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092
+```
+
+### 3 - Create the Hudi Topic for the Sink and insert data into the topic
+
+Open a terminal to execute the following command, specifying the total number of records to generate:
+
+```bash
+bash runKafkaTrafficGenerator.sh <total_kafka_messages>
+```
+
+### 4 - Run the Sink connector worker (multiple workers can be run)
+
+Open a terminal to execute the following command:
+
+```bash
+./bin/connect-distributed.sh ../hudi-kafka-connect/configs/connect-distributed.properties
+```
+
+### 5 - Add the Hudi Sink to the Connector (delete it first if you want to re-configure)
+
+```bash
+curl -X DELETE http://localhost:8083/connectors/hudi-sink
+curl -X POST -H "Content-Type:application/json" -d @$HUDI_DIR/hudi-kafka-connect/configs/config-sink.json http://localhost:8083/connectors
+```
diff --git a/hudi-kafka-connect/configs/config-sink.json b/hudi-kafka-connect/configs/config-sink.json
new file mode 100644
index 000000000..4e94bf541
--- /dev/null
+++ b/hudi-kafka-connect/configs/config-sink.json
@@ -0,0 +1,19 @@
+{
+ "name": "hudi-sink",
+ "config": {
+ "bootstrap.servers": "localhost:9092",
+ "connector.class": "org.apache.hudi.connect.HoodieSinkConnector",
+ "tasks.max": "4",
+ "key.converter": "org.apache.kafka.connect.storage.StringConverter",
+ "value.converter": "org.apache.kafka.connect.storage.StringConverter",
+ "value.converter.schemas.enable": "false",
+ "topics": "hudi-test-topic",
+ "hoodie.table.name": "hudi-test-topic",
+ "hoodie.base.path": "file:///tmp/hoodie/sample-table",
+ "hoodie.datasource.write.recordkey.field": "volume",
+ "hoodie.datasource.write.partitionpath.field": "year",
+ "hoodie.schemaprovider.class": "org.apache.hudi.schema.FilebasedSchemaProvider",
+ "hoodie.deltastreamer.schemaprovider.source.schema.file": "file:///tmp/hoodie/schema.avsc",
+ "hoodie.deltastreamer.schemaprovider.target.schema.file": "file:///tmp/hoodie/schema.avsc"
+ }
+}
diff --git a/hudi-kafka-connect/configs/connect-distributed.properties b/hudi-kafka-connect/configs/connect-distributed.properties
new file mode 100644
index 000000000..d7d453c69
--- /dev/null
+++ b/hudi-kafka-connect/configs/connect-distributed.properties
@@ -0,0 +1,33 @@
+##
+ # Licensed to the Apache Software Foundation (ASF) under one or more
+ # contributor license agreements. See the NOTICE file distributed with
+ # this work for additional information regarding copyright ownership.
+ # The ASF licenses this file to You under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with
+ # the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+## + +bootstrap.servers=localhost:9092 +group.id=hudi-connect-cluster +key.converter=org.apache.kafka.connect.json.JsonConverter +value.converter=org.apache.kafka.connect.json.JsonConverter +key.converter.schemas.enable=true +value.converter.schemas.enable=true +offset.storage.topic=connect-offsets +offset.storage.replication.factor=1 +config.storage.topic=connect-configs +config.storage.replication.factor=1 +status.storage.topic=connect-status +status.storage.replication.factor=1 + +offset.flush.interval.ms=60000 +listeners=HTTP://:8083 +plugin.path=/usr/local/share/java,/usr/local/share/kafka/plugins,/opt/connectors, diff --git a/hudi-kafka-connect/pom.xml b/hudi-kafka-connect/pom.xml new file mode 100644 index 000000000..7742f3b31 --- /dev/null +++ b/hudi-kafka-connect/pom.xml @@ -0,0 +1,231 @@ + + + + + hudi + org.apache.hudi + 0.10.0-SNAPSHOT + + 4.0.0 + + hudi-kafka-connect + Kafka Connect Sink Connector for Hudi + 0.10.0-SNAPSHOT + jar + + + ${project.parent.basedir} + 2.5.0 + + + + + + org.jacoco + jacoco-maven-plugin + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.8 + 1.8 + + + + org.apache.maven.plugins + maven-jar-plugin + 3.1.2 + + + + test-jar + + + + + + org.apache.rat + apache-rat-plugin + + + + + + src/main/resources + + + src/test/resources + + + + + + + + org.apache.kafka + connect-api + ${connect.api.version} + provided + + + org.apache.kafka + connect-json + ${connect.api.version} + provided + + + + + org.apache.hudi + hudi-java-client + ${project.version} + + + org.apache.hudi + hudi-utilities_${scala.binary.version} + ${project.version} + + + org.apache.hudi + hudi-client-common + ${project.version} + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + + org.apache.hudi + hudi-flink_${scala.binary.version} + ${project.version} + + + org.apache.flink + flink-core + 1.12.1 + + + com.esotericsoftware.kryo + * + + + + + + + log4j + log4j + + + + + com.fasterxml.jackson.module + jackson-module-scala_${scala.binary.version} + + + com.fasterxml.jackson.dataformat + jackson-dataformat-csv + ${fasterxml.version} + + + + + org.apache.avro + avro + ${avro.version} + + + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + + + org.apache.hudi + hudi-common + ${project.version} + tests + test-jar + test + + + + + org.junit.jupiter + junit-jupiter-api + test + + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.junit.vintage + junit-vintage-engine + test + + + + org.junit.jupiter + junit-jupiter-params + test + + + + org.mockito + mockito-junit-jupiter + test + + + + org.junit.platform + junit-platform-runner + test + + + + org.junit.platform + junit-platform-suite-api + test + + + + org.junit.platform + junit-platform-commons + test + + + diff --git a/hudi-kafka-connect/scripts/raw.json b/hudi-kafka-connect/scripts/raw.json new file mode 100644 index 000000000..aa2cc7037 --- /dev/null +++ b/hudi-kafka-connect/scripts/raw.json @@ -0,0 +1,5 @@ +{"volume": 0, "symbol": "TPNL", "ts": "2017-08-31 09:30:00", "month": "08", "high": 6.37, "low": 1.37, "key": "TPNL_2017-08-31 09", "year": 2017, "date": "2017/08/31", "close": 4.44, "open": 1.37, "day": "31"} +{"volume": 0, "symbol": "SPOT", "ts": "2018-08-31 09:30:00", "month": "08", "high": 1.87, "low": 0.37, "key": "TPNL_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 1.44, "open": 1.77, "day": "31"} +{"volume": 0, "symbol": "GOOG", "ts": "2019-08-31 09:30:00", "month": "08", "high": 2.1, "low": 1.7, "key": "TPNL_2019-08-31 09", "year": 2019, "date": 
"2019/08/31", "close": 1.94, "open": 2.0, "day": "31"} +{"volume": 0, "symbol": "MSFT", "ts": "2020-08-31 09:30:00", "month": "08", "high": 3.33, "low": 0.87, "key": "TPNL_2020-08-31 09", "year": 2020, "date": "2020/08/31", "close": 3.33, "open": 3.1, "day": "31"} +{"volume": 0, "symbol": "APPL", "ts": "2021-08-31 09:30:00", "month": "08", "high": 3.17, "low": 2.37, "key": "TPNL_2021-08-31 09", "year": 2021, "date": "2021/08/31", "close": 2.66, "open": 3.1, "day": "31"} diff --git a/hudi-kafka-connect/scripts/runKafkaTrafficGenerator.sh b/hudi-kafka-connect/scripts/runKafkaTrafficGenerator.sh new file mode 100644 index 000000000..cff414070 --- /dev/null +++ b/hudi-kafka-connect/scripts/runKafkaTrafficGenerator.sh @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash + +# First delete the existing topic +$KAFKA_HOME/bin/kafka-topics.sh --delete --topic hudi-test-topic --bootstrap-server localhost:9092 + +# Create the topic with 4 partitions +$KAFKA_HOME/bin/kafka-topics.sh --create --topic hudi-test-topic --partitions 4 --replication-factor 1 --bootstrap-server localhost:9092 + +# Generate kafka messages from raw records +inputFile="raw.json" +# Generate the records with unique keys +for ((recordKey=0; recordKey<=$1; )) +do + while IFS= read line + do + echo $line | jq --argjson recordKey $recordKey -c '.volume = $recordKey' | kcat -P -b localhost:9092 -t hudi-test-topic + ((recordKey++)) + if [ $(( $recordKey % 1000 )) -eq 0 ] + then sleep 1 + fi + done < "$inputFile" +done diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkConnector.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkConnector.java new file mode 100644 index 000000000..2d8cc47aa --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkConnector.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect; + +import org.apache.kafka.common.config.ConfigDef; +import org.apache.kafka.connect.connector.Task; +import org.apache.kafka.connect.sink.SinkConnector; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * HudiSinkConnector is a Kafka Connect Connector implementation + * that ingest data from Kafka to Hudi. + */ +public class HoodieSinkConnector extends SinkConnector { + + public static final String VERSION = "0.1.0"; + private static final Logger LOG = LogManager.getLogger(HoodieSinkConnector.class); + private Map configProps; + + /** + * No-arg constructor. It is instantiated by Connect framework. + */ + public HoodieSinkConnector() { + } + + @Override + public String version() { + return VERSION; + } + + @Override + public void start(Map props) { + configProps = new HashMap<>(props); + } + + @Override + public Class taskClass() { + return HoodieSinkTask.class; + } + + @Override + public List> taskConfigs(int maxTasks) { + Map taskProps = new HashMap<>(configProps); + List> taskConfigs = new ArrayList<>(maxTasks); + for (int i = 0; i < maxTasks; ++i) { + taskConfigs.add(taskProps); + } + return taskConfigs; + } + + @Override + public void stop() { + LOG.info(String.format("Shutting down Hudi Sink connector %s", configProps.get("name"))); + } + + @Override + public ConfigDef config() { + // we use Hudi configs instead + return new ConfigDef(); + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkTask.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkTask.java new file mode 100644 index 000000000..c7dde9a2e --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/HoodieSinkTask.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect; + +import org.apache.hudi.connect.kafka.KafkaConnectControlAgent; +import org.apache.hudi.connect.transaction.ConnectTransactionCoordinator; +import org.apache.hudi.connect.transaction.ConnectTransactionParticipant; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionParticipant; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.exception.HoodieException; + +import org.apache.kafka.clients.consumer.OffsetAndMetadata; +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.kafka.connect.sink.SinkTask; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; + +/** + * Implementation of the {@link SinkTask} interface provided by + * Kafka Connect. Implements methods to receive the Kafka records + * from the assigned partitions and commit the Kafka offsets. + * Also, handles re-assignments of partitions. + */ +public class HoodieSinkTask extends SinkTask { + + public static final String TASK_ID_CONFIG_NAME = "task.id"; + private static final Logger LOG = LogManager.getLogger(HoodieSinkTask.class); + private static final int COORDINATOR_KAFKA_PARTITION = 0; + + private final Map transactionCoordinators; + private final Map hudiTransactionParticipants; + private KafkaConnectControlAgent controlKafkaClient; + private KafkaConnectConfigs connectConfigs; + + private String taskId; + private String connectorName; + + public HoodieSinkTask() { + transactionCoordinators = new HashMap(); + hudiTransactionParticipants = new HashMap<>(); + } + + @Override + public String version() { + return HoodieSinkConnector.VERSION; + } + + @Override + public void start(Map props) { + connectorName = props.get("name"); + taskId = props.get(TASK_ID_CONFIG_NAME); + LOG.info(String.format("Starting Hudi Sink Task for %s connector %s with id %s with assignments %s", + props, connectorName, taskId, context.assignment())); + try { + connectConfigs = KafkaConnectConfigs.newBuilder().withProperties(props).build(); + controlKafkaClient = KafkaConnectControlAgent.createKafkaControlManager( + connectConfigs.getBootstrapServers(), + connectConfigs.getControlTopicName()); + bootstrap(context.assignment()); + } catch (ConfigException e) { + throw new ConnectException("Couldn't start HdfsSinkConnector due to configuration error.", e); + } catch (ConnectException e) { + LOG.error("Couldn't start HudiSinkConnector:", e); + LOG.info("Shutting down HudiSinkConnector."); + cleanup(); + // Always throw the original exception that prevent us from starting + throw e; + } + } + + @Override + public void put(Collection records) { + for (SinkRecord record : records) { + String topic = record.topic(); + int partition = record.kafkaPartition(); + TopicPartition tp = new TopicPartition(topic, partition); + hudiTransactionParticipants.get(tp).buffer(record); + } + + for (TopicPartition partition : context.assignment()) { + hudiTransactionParticipants.get(partition).processRecords(); + } + } + + @Override + public void stop() { + cleanup(); + } + + @Override + public void flush(Map currentOffsets) { + // No-op. The connector is managing the offsets. 
+ } + + @Override + public Map preCommit(Map currentOffsets) { + // Although the connector manages offsets via commit files in Hudi, we still want to have Connect + // commit the consumer offsets for records this task has consumed from its topic partitions and + // committed to Hudi. + Map result = new HashMap<>(); + for (TopicPartition partition : context.assignment()) { + TransactionParticipant worker = hudiTransactionParticipants.get(partition); + if (worker != null) { + worker.processRecords(); + if (worker.getLastKafkaCommittedOffset() >= 0) { + result.put(partition, new OffsetAndMetadata(worker.getLastKafkaCommittedOffset())); + } + } + } + return result; + } + + @Override + public void open(Collection partitions) { + LOG.info("New partitions added " + partitions.toString()); + bootstrap(partitions); + } + + @Override + public void close(Collection partitions) { + LOG.info("Existing partitions deleted " + partitions.toString()); + // Close any writers we have. We may get assigned the same partitions and end up duplicating + // some effort since we'll have to reprocess those messages. It may be possible to hold on to + // the TopicPartitionWriter and continue to use the temp file, but this can get significantly + // more complex due to potential failures and network partitions. For example, we may get + // this close, then miss a few generations of group membership, during which + // data may have continued to be processed and we'd have to restart from the recovery stage, + // make sure we apply the WAL, and only reuse the temp file if the starting offset is still + // valid. For now, we prefer the simpler solution that may result in a bit of wasted effort. + for (TopicPartition partition : partitions) { + if (partition.partition() == COORDINATOR_KAFKA_PARTITION) { + if (transactionCoordinators.containsKey(partition)) { + transactionCoordinators.get(partition).stop(); + transactionCoordinators.remove(partition); + } + } + TransactionParticipant worker = hudiTransactionParticipants.remove(partition); + if (worker != null) { + try { + LOG.debug("Closing data writer due to task start failure."); + worker.stop(); + } catch (Throwable t) { + LOG.debug(String.format("Error closing and stopping data writer: %s", t.getMessage()), t); + } + } + } + } + + private void bootstrap(Collection partitions) { + LOG.info(String.format("Bootstrap task for connector %s with id %s with assignments %s part %s", + connectorName, taskId, context.assignment(), partitions)); + for (TopicPartition partition : partitions) { + try { + // If the partition is 0, instantiate the Leader + if (partition.partition() == COORDINATOR_KAFKA_PARTITION) { + ConnectTransactionCoordinator coordinator = new ConnectTransactionCoordinator( + connectConfigs, + partition, + controlKafkaClient); + coordinator.start(); + transactionCoordinators.put(partition, coordinator); + } + ConnectTransactionParticipant worker = new ConnectTransactionParticipant(connectConfigs, partition, controlKafkaClient, context); + hudiTransactionParticipants.put(partition, worker); + worker.start(); + } catch (HoodieException exception) { + LOG.error(String.format("Fatal error initializing task %s for partition %s", taskId, partition.partition()), exception); + } + } + } + + private void cleanup() { + for (TopicPartition partition : context.assignment()) { + TransactionParticipant worker = hudiTransactionParticipants.get(partition); + if (worker != null) { + try { + LOG.debug("Closing data writer due to task start failure."); + worker.stop(); + } catch 
(Throwable t) { + LOG.debug("Error closing and stopping data writer", t); + } + } + } + hudiTransactionParticipants.clear(); + transactionCoordinators.forEach((topic, transactionCoordinator) -> transactionCoordinator.stop()); + transactionCoordinators.clear(); + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/KafkaConnectFileIdPrefixProvider.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/KafkaConnectFileIdPrefixProvider.java new file mode 100644 index 000000000..536ad4a80 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/KafkaConnectFileIdPrefixProvider.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect; + +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.table.FileIdPrefixProvider; + +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Objects; +import java.util.Properties; + +public class KafkaConnectFileIdPrefixProvider extends FileIdPrefixProvider { + + public static final String KAFKA_CONNECT_PARTITION_ID = "hudi.kafka.connect.partition"; + private static final Logger LOG = LogManager.getLogger(KafkaConnectFileIdPrefixProvider.class); + + private final String kafkaPartition; + + public KafkaConnectFileIdPrefixProvider(Properties props) { + super(props); + if (!props.containsKey(KAFKA_CONNECT_PARTITION_ID)) { + LOG.error("Fatal error due to Kafka Connect Partition Id is not set"); + throw new HoodieException("Kafka Connect Partition Key " + KAFKA_CONNECT_PARTITION_ID + " not provided"); + } + this.kafkaPartition = props.getProperty(KAFKA_CONNECT_PARTITION_ID); + } + + @Override + public String createFilePrefix(String partitionPath) { + // We use a combination of kafka partition and partition path as the file id, and then hash it + // to generate a fixed sized hash. 
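+ // Note: unlike the default RandomFileIdPrefixProvider, this prefix is deterministic, so the same Kafka
+ // partition and Hudi partition path always map to the same file id prefix.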
+ String rawFileIdPrefix = kafkaPartition + partitionPath; + MessageDigest md; + try { + md = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + LOG.error("Fatal error selecting hash algorithm", e); + throw new HoodieException(e); + } + + byte[] digest = Objects.requireNonNull(md).digest(rawFileIdPrefix.getBytes(StandardCharsets.UTF_8)); + + LOG.info("CreateFileId for Kafka Partition " + kafkaPartition + " : " + partitionPath + " = " + rawFileIdPrefix + + " === " + StringUtils.toHexString(digest).toUpperCase()); + return StringUtils.toHexString(digest).toUpperCase(); + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaConnectControlAgent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaConnectControlAgent.java new file mode 100644 index 000000000..a115147ae --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaConnectControlAgent.java @@ -0,0 +1,230 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.kafka; + +import org.apache.hudi.connect.transaction.ControlEvent; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionParticipant; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.kafka.clients.consumer.CommitFailedException; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.clients.consumer.ConsumerRecords; +import org.apache.kafka.clients.consumer.KafkaConsumer; +import org.apache.kafka.common.serialization.Deserializer; +import org.apache.kafka.common.serialization.StringDeserializer; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.time.Duration; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; +import java.util.UUID; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +/** + * Class that manages the Kafka consumer and producer for + * the Kafka Control Topic that ensures coordination across the + * {@link TransactionCoordinator} and {@link TransactionParticipant}s. + * Use a single instance per worker (single-threaded), + * and register multiple tasks that can receive the control messages. 
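+ * The agent is created as a process-wide singleton (via createKafkaControlManager), so the coordinator and
+ * all participants running in the same worker share one control-topic consumer and producer.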
+ */ +public class KafkaConnectControlAgent implements KafkaControlAgent { + + private static final Logger LOG = LogManager.getLogger(KafkaConnectControlAgent.class); + private static final Object LOCK = new Object(); + private static final long KAFKA_POLL_TIMEOUT_MS = 100; + private static final int EXEC_SHUTDOWN_TIMEOUT_MS = 5000; + + private static KafkaConnectControlAgent agent; + private final String bootstrapServers; + private final String controlTopicName; + private final ExecutorService executorService; + private final Map topicCoordinators; + // List of TransactionParticipants per Kafka Topic + private final Map> partitionWorkers; + private final KafkaControlProducer producer; + private KafkaConsumer consumer; + + public KafkaConnectControlAgent(String bootstrapServers, + String controlTopicName) { + this.bootstrapServers = bootstrapServers; + this.controlTopicName = controlTopicName; + this.executorService = Executors.newSingleThreadExecutor(); + this.topicCoordinators = new HashMap<>(); + this.partitionWorkers = new HashMap<>(); + this.producer = new KafkaControlProducer(bootstrapServers, controlTopicName); + start(); + } + + public static KafkaConnectControlAgent createKafkaControlManager(String bootstrapServers, + String controlTopicName) { + if (agent == null) { + synchronized (LOCK) { + if (agent == null) { + agent = new KafkaConnectControlAgent(bootstrapServers, controlTopicName); + } + } + } + return agent; + } + + @Override + public void registerTransactionParticipant(TransactionParticipant worker) { + if (!partitionWorkers.containsKey(worker.getPartition().topic())) { + partitionWorkers.put(worker.getPartition().topic(), new ConcurrentLinkedQueue<>()); + } + partitionWorkers.get(worker.getPartition().topic()).add(worker); + } + + @Override + public void deregisterTransactionParticipant(TransactionParticipant worker) { + if (partitionWorkers.containsKey(worker.getPartition().topic())) { + partitionWorkers.get(worker.getPartition().topic()).remove(worker); + } + } + + @Override + public void registerTransactionCoordinator(TransactionCoordinator coordinator) { + if (!topicCoordinators.containsKey(coordinator.getPartition().topic())) { + topicCoordinators.put(coordinator.getPartition().topic(), coordinator); + } + } + + public void deregisterTransactionCoordinator(TransactionCoordinator coordinator) { + topicCoordinators.remove(coordinator.getPartition().topic()); + } + + @Override + public void publishMessage(ControlEvent message) { + producer.publishMessage(message); + } + + private void start() { + Properties props = new Properties(); + props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + // Todo fetch the worker id or name instead of a uuid. 
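+ // A unique group id per agent instance means every worker consumes all control messages (broadcast),
+ // rather than sharing them across a single consumer group.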
+ props.put(ConsumerConfig.GROUP_ID_CONFIG, "hudi-control-group" + UUID.randomUUID().toString()); + props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); + props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, KafkaJsonDeserializer.class); + + // Since we are using Kafka Control Topic as a RPC like interface, + // we want consumers to only process messages that are sent after they come online + props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest"); + + consumer = new KafkaConsumer<>(props, new StringDeserializer(), + new KafkaJsonDeserializer<>(ControlEvent.class)); + + consumer.subscribe(Collections.singletonList(controlTopicName)); + + executorService.submit(() -> { + while (true) { + ConsumerRecords records; + records = consumer.poll(Duration.ofMillis(KAFKA_POLL_TIMEOUT_MS)); + for (ConsumerRecord record : records) { + try { + LOG.debug(String.format("Kafka consumerGroupId = %s topic = %s, partition = %s, offset = %s, customer = %s, country = %s", + "", record.topic(), record.partition(), record.offset(), record.key(), record.value())); + ControlEvent message = record.value(); + String senderTopic = message.senderPartition().topic(); + if (message.getSenderType().equals(ControlEvent.SenderType.COORDINATOR)) { + if (partitionWorkers.containsKey(senderTopic)) { + for (TransactionParticipant partitionWorker : partitionWorkers.get(senderTopic)) { + partitionWorker.processControlEvent(message); + } + } else { + LOG.warn(String.format("Failed to send message for unregistered participants for topic %s", senderTopic)); + } + } else if (message.getSenderType().equals(ControlEvent.SenderType.PARTICIPANT)) { + if (topicCoordinators.containsKey(senderTopic)) { + topicCoordinators.get(senderTopic).processControlEvent(message); + } else { + LOG.warn(String.format("Failed to send message for unregistered coordinator for topic %s", senderTopic)); + } + } else { + LOG.warn(String.format("Sender type of Control Message unknown %s", message.getSenderType().name())); + } + } catch (Exception e) { + LOG.error(String.format("Fatal error while consuming a kafka record for topic = %s partition = %s", record.topic(), record.partition()), e); + } + } + try { + consumer.commitSync(); + } catch (CommitFailedException exception) { + LOG.error("Fatal error while committing kafka control topic"); + } + } + }); + } + + public void stop() { + producer.stop(); + consumer.close(); + if (executorService != null) { + boolean terminated = false; + try { + LOG.info("Shutting down executor service."); + executorService.shutdown(); + LOG.info("Awaiting termination."); + terminated = executorService.awaitTermination(EXEC_SHUTDOWN_TIMEOUT_MS, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // ignored + } + + if (!terminated) { + LOG.warn( + "Unclean Kafka Control Manager executor service shutdown "); + executorService.shutdownNow(); + } + } + } + + /** + * Deserializes the incoming Kafka records for the Control Topic. + * + * @param represents the object that is sent over the Control Topic. 
+ */ + public static class KafkaJsonDeserializer implements Deserializer { + + private static final Logger LOG = LogManager.getLogger(KafkaJsonDeserializer.class); + private final Class type; + + KafkaJsonDeserializer(Class type) { + this.type = type; + } + + @Override + public T deserialize(String s, byte[] bytes) { + ObjectMapper mapper = new ObjectMapper(); + T obj = null; + try { + obj = mapper.readValue(bytes, type); + } catch (Exception e) { + LOG.error(e.getMessage()); + } + return obj; + } + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlAgent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlAgent.java new file mode 100644 index 000000000..ea5177eb5 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlAgent.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.kafka; + +import org.apache.hudi.connect.transaction.ControlEvent; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionParticipant; + +/** + * Manages the Kafka consumer and producer for + * the Kafka Control Topic that ensures coordination across the + * {@link TransactionCoordinator} and {@link TransactionParticipant}s. + */ +public interface KafkaControlAgent { + + void registerTransactionParticipant(TransactionParticipant worker); + + void deregisterTransactionParticipant(TransactionParticipant worker); + + void registerTransactionCoordinator(TransactionCoordinator coordinator); + + void deregisterTransactionCoordinator(TransactionCoordinator coordinator); + + void publishMessage(ControlEvent message); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlProducer.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlProducer.java new file mode 100644 index 000000000..a23251e35 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/kafka/KafkaControlProducer.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.kafka; + +import org.apache.hudi.connect.transaction.ControlEvent; + +import com.fasterxml.jackson.annotation.JsonAutoDetect; +import com.fasterxml.jackson.annotation.PropertyAccessor; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.Producer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.Serializer; +import org.apache.kafka.common.serialization.StringSerializer; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Properties; + +/** + * Kafka producer to send events to the + * Control Topic that coordinates transactions + * across Participants. + */ +public class KafkaControlProducer { + + private static final Logger LOG = LogManager.getLogger(KafkaControlProducer.class); + + private final String bootstrapServers; + private final String controlTopicName; + private Producer producer; + + public KafkaControlProducer(String bootstrapServers, String controlTopicName) { + this.bootstrapServers = bootstrapServers; + this.controlTopicName = controlTopicName; + start(); + } + + private void start() { + Properties props = new Properties(); + props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class); + props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, KafkaJsonSerializer.class); + + producer = new KafkaProducer<>( + props, + new StringSerializer(), + new KafkaJsonSerializer() + ); + } + + public void stop() { + producer.close(); + } + + public void publishMessage(ControlEvent message) { + ProducerRecord record + = new ProducerRecord<>(controlTopicName, message.key(), message); + producer.send(record); + } + + public static class KafkaJsonSerializer implements Serializer { + + private static final Logger LOG = LogManager.getLogger(KafkaJsonSerializer.class); + + @Override + public byte[] serialize(String topic, ControlEvent data) { + byte[] retVal = null; + ObjectMapper objectMapper = new ObjectMapper(); + objectMapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY); + + try { + retVal = objectMapper.writeValueAsBytes(data); + } catch (Exception e) { + LOG.error("Fatal error during serialization of Kafka Control Message ", e); + } + return retVal; + } + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionCoordinator.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionCoordinator.java new file mode 100644 index 000000000..13291c827 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionCoordinator.java @@ -0,0 +1,399 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.transaction; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.connect.kafka.KafkaControlAgent; +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import org.apache.hudi.connect.writers.ConnectTransactionServices; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.connect.writers.KafkaConnectTransactionServices; +import org.apache.hudi.exception.HoodieException; + +import org.apache.kafka.common.TopicPartition; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; + +/** + * Implementation of the Coordinator that + * coordinates the Hudi write transactions + * across all the Kafka partitions for a single Kafka Topic. 
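+ * The coordinator is instantiated only by the task that owns partition 0 of the topic (see HoodieSinkTask),
+ * publishes control events such as START_COMMIT on the control topic, and collects the WRITE_STATUS
+ * responses from the participants.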
+ */ +public class ConnectTransactionCoordinator implements TransactionCoordinator, Runnable { + + private static final Logger LOG = LogManager.getLogger(ConnectTransactionCoordinator.class); + private static final String BOOTSTRAP_SERVERS_CFG = "bootstrap.servers"; + private static final String KAFKA_OFFSET_KEY = "kafka.commit.offsets"; + private static final String KAFKA_OFFSET_DELIMITER = ","; + private static final String KAFKA_OFFSET_KV_DELIMITER = "="; + private static final Long START_COMMIT_INIT_DELAY_MS = 100L; + private static final Long RESTART_COMMIT_DELAY_MS = 500L; + private static final int COORDINATOR_EVENT_LOOP_TIMEOUT_MS = 1000; + + private final KafkaConnectConfigs configs; + private final TopicPartition partition; + private final KafkaControlAgent kafkaControlClient; + private final ConnectTransactionServices transactionServices; + private final KafkaPartitionProvider partitionProvider; + private final Map> partitionsWriteStatusReceived; + private final Map currentConsumedKafkaOffsets; + private final AtomicBoolean hasStarted = new AtomicBoolean(false); + private final BlockingQueue events; + private final ExecutorService executorService; + private final ScheduledExecutorService scheduler; + + private String currentCommitTime; + private Map globalCommittedKafkaOffsets; + private State currentState; + private int numPartitions; + + public ConnectTransactionCoordinator(KafkaConnectConfigs configs, + TopicPartition partition, + KafkaControlAgent kafkaControlClient) throws HoodieException { + this(configs, + partition, + kafkaControlClient, + new KafkaConnectTransactionServices(configs), + KafkaConnectUtils::getLatestNumPartitions); + } + + public ConnectTransactionCoordinator(KafkaConnectConfigs configs, + TopicPartition partition, + KafkaControlAgent kafkaControlClient, + ConnectTransactionServices transactionServices, + KafkaPartitionProvider partitionProvider) { + this.configs = configs; + this.partition = partition; + this.kafkaControlClient = kafkaControlClient; + this.transactionServices = transactionServices; + this.partitionProvider = partitionProvider; + this.events = new LinkedBlockingQueue<>(); + scheduler = Executors.newSingleThreadScheduledExecutor(); + executorService = Executors.newSingleThreadExecutor(); + + + this.currentCommitTime = StringUtils.EMPTY_STRING; + this.partitionsWriteStatusReceived = new HashMap<>(); + this.globalCommittedKafkaOffsets = new HashMap<>(); + this.currentConsumedKafkaOffsets = new HashMap<>(); + this.currentState = State.INIT; + } + + @Override + public void start() { + if (hasStarted.compareAndSet(false, true)) { + executorService.submit(this); + } + kafkaControlClient.registerTransactionCoordinator(this); + LOG.info(String.format("Start Transaction Coordinator for topic %s partition %s", + partition.topic(), partition.partition())); + + initializeGlobalCommittedKafkaOffsets(); + // Submit the first start commit + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT, + partition.topic(), + StringUtils.EMPTY_STRING), + START_COMMIT_INIT_DELAY_MS, TimeUnit.MILLISECONDS); + } + + @Override + public void stop() { + kafkaControlClient.deregisterTransactionCoordinator(this); + hasStarted.set(false); + if (executorService != null) { + boolean terminated = false; + try { + LOG.info("Shutting down executor service."); + executorService.shutdown(); + LOG.info("Awaiting termination."); + terminated = executorService.awaitTermination(100, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // 
ignored + } + + if (!terminated) { + LOG.warn( + "Unclean Kafka Control Manager executor service shutdown "); + executorService.shutdownNow(); + } + } + } + + @Override + public TopicPartition getPartition() { + return partition; + } + + @Override + public void processControlEvent(ControlEvent message) { + CoordinatorEvent.CoordinatorEventType type; + if (message.getMsgType().equals(ControlEvent.MsgType.WRITE_STATUS)) { + type = CoordinatorEvent.CoordinatorEventType.WRITE_STATUS; + } else { + LOG.warn(String.format("The Coordinator should not be receiving messages of type %s", message.getMsgType().name())); + return; + } + + CoordinatorEvent event = new CoordinatorEvent(type, + message.senderPartition().topic(), + message.getCommitTime()); + event.setMessage(message); + submitEvent(event); + } + + @Override + public void run() { + while (true) { + try { + CoordinatorEvent event = events.poll(COORDINATOR_EVENT_LOOP_TIMEOUT_MS, TimeUnit.MILLISECONDS); + if (event != null) { + processCoordinatorEvent(event); + } + } catch (InterruptedException exception) { + LOG.warn("Error received while polling the event loop in Partition Coordinator", exception); + } + } + } + + private void submitEvent(CoordinatorEvent event) { + this.submitEvent(event, 0, TimeUnit.SECONDS); + } + + private void submitEvent(CoordinatorEvent event, long delay, TimeUnit unit) { + scheduler.schedule(() -> { + events.add(event); + }, delay, unit); + } + + private void processCoordinatorEvent(CoordinatorEvent event) { + try { + // Ignore NULL and STALE events, unless its one to start a new COMMIT + if (event == null + || (!event.getEventType().equals(CoordinatorEvent.CoordinatorEventType.START_COMMIT) + && (!event.getCommitTime().equals(currentCommitTime)))) { + return; + } + + switch (event.getEventType()) { + case START_COMMIT: + startNewCommit(); + break; + case END_COMMIT: + endExistingCommit(); + break; + case WRITE_STATUS: + // Ignore stale write_status messages sent after + if (event.getMessage() != null + && currentState.equals(State.ENDED_COMMIT)) { + onReceiveWriteStatus(event.getMessage()); + } else { + LOG.warn("Could not process WRITE_STATUS due to missing message"); + } + break; + case ACK_COMMIT: + submitAckCommit(); + break; + case WRITE_STATUS_TIMEOUT: + handleWriteStatusTimeout(); + break; + default: + throw new IllegalStateException("Partition Coordinator has received an illegal event type " + event.getEventType().name()); + } + } catch (Exception exception) { + LOG.warn("Error received while polling the event loop in Partition Coordinator", exception); + } + } + + private void startNewCommit() { + numPartitions = partitionProvider.getLatestNumPartitions(configs.getString(BOOTSTRAP_SERVERS_CFG), partition.topic()); + partitionsWriteStatusReceived.clear(); + try { + currentCommitTime = transactionServices.startCommit(); + ControlEvent message = new ControlEvent.Builder( + ControlEvent.MsgType.START_COMMIT, + ControlEvent.SenderType.COORDINATOR, + currentCommitTime, + partition) + .setCoordinatorInfo( + new ControlEvent.CoordinatorInfo(globalCommittedKafkaOffsets)) + .build(); + kafkaControlClient.publishMessage(message); + currentState = State.STARTED_COMMIT; + // schedule a timeout for ending the current commit + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.END_COMMIT, + partition.topic(), + currentCommitTime), + configs.getCommitIntervalSecs(), TimeUnit.SECONDS); + } catch (Exception exception) { + LOG.error(String.format("Failed to start a new commit %s, will retry", 
currentCommitTime), exception); + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT, + partition.topic(), + StringUtils.EMPTY_STRING), + RESTART_COMMIT_DELAY_MS, TimeUnit.MILLISECONDS); + } + } + + private void endExistingCommit() { + try { + ControlEvent message = new ControlEvent.Builder( + ControlEvent.MsgType.END_COMMIT, + ControlEvent.SenderType.COORDINATOR, + currentCommitTime, + partition) + .setCoordinatorInfo(new ControlEvent.CoordinatorInfo(globalCommittedKafkaOffsets)) + .build(); + kafkaControlClient.publishMessage(message); + } catch (Exception exception) { + LOG.warn(String.format("Could not send END_COMMIT message for partition %s and commitTime %s", partition, currentCommitTime), exception); + } + currentConsumedKafkaOffsets.clear(); + currentState = State.ENDED_COMMIT; + + // schedule a timeout for receiving all write statuses + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.WRITE_STATUS_TIMEOUT, + partition.topic(), + currentCommitTime), + configs.getCoordinatorWriteTimeoutSecs(), TimeUnit.SECONDS); + } + + private void onReceiveWriteStatus(ControlEvent message) { + ControlEvent.ParticipantInfo participantInfo = message.getParticipantInfo(); + if (participantInfo.getOutcomeType().equals(ControlEvent.OutcomeType.WRITE_SUCCESS)) { + int partition = message.senderPartition().partition(); + partitionsWriteStatusReceived.put(partition, participantInfo.writeStatuses()); + currentConsumedKafkaOffsets.put(partition, participantInfo.getKafkaCommitOffset()); + } + if (partitionsWriteStatusReceived.size() >= numPartitions + && currentState.equals(State.ENDED_COMMIT)) { + // Commit the kafka offsets to the commit file + try { + List allWriteStatuses = new ArrayList<>(); + partitionsWriteStatusReceived.forEach((key, value) -> allWriteStatuses.addAll(value)); + // Commit the last write in Hudi, along with the latest kafka offset + if (!allWriteStatuses.isEmpty()) { + transactionServices.endCommit(currentCommitTime, + allWriteStatuses, + transformKafkaOffsets(currentConsumedKafkaOffsets)); + } + currentState = State.WRITE_STATUS_RCVD; + globalCommittedKafkaOffsets.putAll(currentConsumedKafkaOffsets); + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.ACK_COMMIT, + partition.topic(), + currentCommitTime)); + } catch (Exception exception) { + LOG.error("Fatal error while committing file", exception); + } + } + } + + private void handleWriteStatusTimeout() { + // If we are still stuck in ENDED_STATE + if (currentState.equals(State.ENDED_COMMIT)) { + currentState = State.WRITE_STATUS_TIMEDOUT; + LOG.warn("Did not receive the Write Status from all partitions"); + // Submit the next start commit + submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT, + partition.topic(), + StringUtils.EMPTY_STRING), + RESTART_COMMIT_DELAY_MS, TimeUnit.MILLISECONDS); + } + } + + private void submitAckCommit() { + try { + ControlEvent message = new ControlEvent.Builder( + ControlEvent.MsgType.ACK_COMMIT, + ControlEvent.SenderType.COORDINATOR, + currentCommitTime, + partition) + .setCoordinatorInfo( + new ControlEvent.CoordinatorInfo(globalCommittedKafkaOffsets)) + .build(); + kafkaControlClient.publishMessage(message); + } catch (Exception exception) { + LOG.warn(String.format("Could not send ACK_COMMIT message for partition %s and commitTime %s", partition, currentCommitTime), exception); + } + currentState = State.ACKED_COMMIT; + + // Submit the next start commit + submitEvent(new 
CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT, + partition.topic(), + StringUtils.EMPTY_STRING), + START_COMMIT_INIT_DELAY_MS, TimeUnit.MILLISECONDS); + } + + private void initializeGlobalCommittedKafkaOffsets() { + try { + Map commitMetadata = transactionServices.fetchLatestExtraCommitMetadata(); + String latestKafkaOffsets = commitMetadata.get(KAFKA_OFFSET_KEY); + if (!StringUtils.isNullOrEmpty(latestKafkaOffsets)) { + LOG.info("Retrieved Raw Kafka offsets from Hudi Commit File " + latestKafkaOffsets); + globalCommittedKafkaOffsets = Arrays.stream(latestKafkaOffsets.split(KAFKA_OFFSET_DELIMITER)) + .map(entry -> entry.split(KAFKA_OFFSET_KV_DELIMITER)) + .collect(Collectors.toMap(entry -> Integer.parseInt(entry[0]), entry -> Long.parseLong(entry[1]))); + LOG.info("Initialized the kafka offset commits " + globalCommittedKafkaOffsets); + } + } catch (Exception exception) { + throw new HoodieException("Could not deserialize the kafka commit offsets", exception); + } + } + + private Map transformKafkaOffsets(Map kafkaOffsets) { + try { + String kafkaOffsetValue = kafkaOffsets.keySet().stream() + .map(key -> key + KAFKA_OFFSET_KV_DELIMITER + kafkaOffsets.get(key)) + .collect(Collectors.joining(KAFKA_OFFSET_DELIMITER)); + return Collections.singletonMap(KAFKA_OFFSET_KEY, kafkaOffsetValue); + } catch (Exception exception) { + throw new HoodieException("Could not serialize the kafka commit offsets", exception); + } + } + + private enum State { + INIT, + STARTED_COMMIT, + ENDED_COMMIT, + WRITE_STATUS_RCVD, + WRITE_STATUS_TIMEDOUT, + ACKED_COMMIT, + } + + /** + * Provides the current partitions of a Kafka Topic dynamically. + */ + public interface KafkaPartitionProvider { + int getLatestNumPartitions(String bootstrapServers, String topicName); + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionParticipant.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionParticipant.java new file mode 100644 index 000000000..fe1996e65 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ConnectTransactionParticipant.java @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect.transaction; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.connect.kafka.KafkaControlAgent; +import org.apache.hudi.connect.writers.ConnectWriterProvider; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.connect.writers.KafkaConnectWriterProvider; +import org.apache.hudi.exception.HoodieException; + +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.kafka.connect.sink.SinkTaskContext; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; + +/** + * Implementation of the {@link TransactionParticipant} that coordinates the Hudi write transactions + * based on events from the {@link TransactionCoordinator} and manages the Hudi Writes for a specific Kafka Partition. + */ +public class ConnectTransactionParticipant implements TransactionParticipant { + + private static final Logger LOG = LogManager.getLogger(ConnectTransactionParticipant.class); + + private final LinkedList buffer; + private final BlockingQueue controlEvents; + private final TopicPartition partition; + private final SinkTaskContext context; + private final KafkaControlAgent kafkaControlAgent; + private final ConnectWriterProvider writerProvider; + + private TransactionInfo ongoingTransactionInfo; + private long committedKafkaOffset; + + public ConnectTransactionParticipant(KafkaConnectConfigs configs, + TopicPartition partition, + KafkaControlAgent kafkaControlAgent, + SinkTaskContext context) throws HoodieException { + this(partition, kafkaControlAgent, context, new KafkaConnectWriterProvider(configs, partition)); + } + + public ConnectTransactionParticipant(TopicPartition partition, + KafkaControlAgent kafkaControlAgent, + SinkTaskContext context, + ConnectWriterProvider writerProvider) throws HoodieException { + this.buffer = new LinkedList<>(); + this.controlEvents = new LinkedBlockingQueue<>(); + this.partition = partition; + this.context = context; + this.writerProvider = writerProvider; + this.kafkaControlAgent = kafkaControlAgent; + this.ongoingTransactionInfo = null; + this.committedKafkaOffset = 0; + } + + @Override + public void start() { + LOG.info("Start Hudi Transaction Participant for partition " + partition.partition()); + this.kafkaControlAgent.registerTransactionParticipant(this); + context.pause(partition); + } + + @Override + public void stop() { + this.kafkaControlAgent.deregisterTransactionParticipant(this); + cleanupOngoingTransaction(); + } + + @Override + public void buffer(SinkRecord record) { + buffer.add(record); + } + + @Override + public void processControlEvent(ControlEvent message) { + controlEvents.add(message); + } + + @Override + public long getLastKafkaCommittedOffset() { + return committedKafkaOffset; + } + + @Override + public TopicPartition getPartition() { + return partition; + } + + @Override + public void processRecords() { + while (!controlEvents.isEmpty()) { + ControlEvent message = controlEvents.poll(); + switch (message.getMsgType()) { + case START_COMMIT: + handleStartCommit(message); + break; + case END_COMMIT: + handleEndCommit(message); + break; + case ACK_COMMIT: + handleAckCommit(message); + break; + case WRITE_STATUS: + // ignore write status since its only processed by leader + break; + 
default: + throw new IllegalStateException("HudiTransactionParticipant received incorrect state " + message.getMsgType()); + } + } + + writeRecords(); + } + + private void handleStartCommit(ControlEvent message) { + // If there is an existing/ongoing transaction locally + // but it failed globally since we received another START_COMMIT instead of an END_COMMIT or ACK_COMMIT, + // so close it and start new transaction + cleanupOngoingTransaction(); + // Resync the last committed Kafka offset from the leader + syncKafkaOffsetWithLeader(message); + context.resume(partition); + String currentCommitTime = message.getCommitTime(); + LOG.info("Started a new transaction after receiving START_COMMIT for commit " + currentCommitTime); + try { + ongoingTransactionInfo = new TransactionInfo<>(currentCommitTime, writerProvider.getWriter(currentCommitTime)); + ongoingTransactionInfo.setLastWrittenKafkaOffset(committedKafkaOffset); + } catch (Exception exception) { + LOG.warn("Error received while starting a new transaction", exception); + } + } + + private void handleEndCommit(ControlEvent message) { + if (ongoingTransactionInfo == null) { + LOG.warn(String.format("END_COMMIT %s is received while we were NOT in active transaction", message.getCommitTime())); + return; + } else if (!ongoingTransactionInfo.getCommitTime().equals(message.getCommitTime())) { + LOG.error(String.format("Fatal error received END_COMMIT with commit time %s while local transaction commit time %s", + message.getCommitTime(), ongoingTransactionInfo.getCommitTime())); + // Recovery: A new END_COMMIT from leader caused interruption to an existing transaction, + // explicitly reset Kafka commit offset to ensure no data loss + cleanupOngoingTransaction(); + syncKafkaOffsetWithLeader(message); + return; + } + + // send Writer Status Message and wait for ACK_COMMIT in async fashion + try { + context.pause(partition); + ongoingTransactionInfo.commitInitiated(); + //sendWriterStatus + List writeStatuses = new ArrayList<>(); + try { + writeStatuses = ongoingTransactionInfo.getWriter().close(); + } catch (IOException exception) { + LOG.warn("Error closing the Hudi Writer", exception); + } + + ControlEvent writeStatus = new ControlEvent.Builder(ControlEvent.MsgType.WRITE_STATUS, + ControlEvent.SenderType.PARTICIPANT, ongoingTransactionInfo.getCommitTime(), partition) + .setParticipantInfo(new ControlEvent.ParticipantInfo( + writeStatuses, + ongoingTransactionInfo.getLastWrittenKafkaOffset(), + ControlEvent.OutcomeType.WRITE_SUCCESS)) + .build(); + kafkaControlAgent.publishMessage(writeStatus); + } catch (Exception exception) { + LOG.warn(String.format("Error ending commit %s for partition %s", message.getCommitTime(), partition.partition()), exception); + } + } + + private void handleAckCommit(ControlEvent message) { + // Update lastKafkCommitedOffset locally. 
+ if (ongoingTransactionInfo != null && committedKafkaOffset < ongoingTransactionInfo.getLastWrittenKafkaOffset()) { + committedKafkaOffset = ongoingTransactionInfo.getLastWrittenKafkaOffset(); + } + syncKafkaOffsetWithLeader(message); + cleanupOngoingTransaction(); + } + + private void writeRecords() { + if (ongoingTransactionInfo != null && !ongoingTransactionInfo.isCommitInitiated()) { + while (!buffer.isEmpty()) { + try { + SinkRecord record = buffer.peek(); + if (record != null + && record.kafkaOffset() >= ongoingTransactionInfo.getLastWrittenKafkaOffset()) { + ongoingTransactionInfo.getWriter().writeRecord(record); + ongoingTransactionInfo.setLastWrittenKafkaOffset(record.kafkaOffset() + 1); + } else if (record != null && record.kafkaOffset() < committedKafkaOffset) { + LOG.warn(String.format("Received a kafka record with offset %s prior to last committed offset %s for partition %s", + record.kafkaOffset(), ongoingTransactionInfo.getLastWrittenKafkaOffset(), + partition)); + } + buffer.poll(); + } catch (Exception exception) { + LOG.warn(String.format("Error received while writing records for transaction %s in partition %s", + ongoingTransactionInfo.getCommitTime(), partition.partition()), + exception); + } + } + } + } + + private void cleanupOngoingTransaction() { + if (ongoingTransactionInfo != null) { + try { + ongoingTransactionInfo.getWriter().close(); + ongoingTransactionInfo = null; + } catch (IOException exception) { + LOG.warn("Error received while trying to cleanup existing transaction", exception); + } + } + } + + private void syncKafkaOffsetWithLeader(ControlEvent message) { + if (message.getCoordinatorInfo() != null) { + Long coordinatorCommittedKafkaOffset = message.getCoordinatorInfo().getGlobalKafkaCommitOffsets().get(partition.partition()); + // Recover kafka committed offsets, treating the commit offset from the coordinator + // as the source of truth + if (coordinatorCommittedKafkaOffset != null && coordinatorCommittedKafkaOffset >= 0) { + if (coordinatorCommittedKafkaOffset != committedKafkaOffset) { + LOG.warn(String.format("Recovering the kafka offset for partition %s to offset %s instead of local offset %s", + partition.partition(), coordinatorCommittedKafkaOffset, committedKafkaOffset)); + context.offset(partition, coordinatorCommittedKafkaOffset); + } + committedKafkaOffset = coordinatorCommittedKafkaOffset; + } + } + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ControlEvent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ControlEvent.java new file mode 100644 index 000000000..093064881 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/ControlEvent.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.transaction; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.util.SerializationUtils; + +import org.apache.kafka.common.TopicPartition; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +/** + * The events sent over the Kafka Control Topic between the + * coordinator and the followers, in order to ensure + * coordination across all the writes. + */ +@SuppressWarnings("checkstyle:VisibilityModifier") +public class ControlEvent implements Serializable { + + private static final Logger LOG = LogManager.getLogger(ControlEvent.class); + private static final int CURRENT_VERSION = 0; + + private final int version = CURRENT_VERSION; + private MsgType msgType; + private SenderType senderType; + private String commitTime; + private byte[] senderPartition; + private CoordinatorInfo coordinatorInfo; + private ParticipantInfo participantInfo; + + public ControlEvent() { + } + + public ControlEvent(MsgType msgType, + SenderType senderType, + String commitTime, + byte[] senderPartition, + CoordinatorInfo coordinatorInfo, + ParticipantInfo participantInfo) { + this.msgType = msgType; + this.senderType = senderType; + this.commitTime = commitTime; + this.senderPartition = senderPartition; + this.coordinatorInfo = coordinatorInfo; + this.participantInfo = participantInfo; + } + + public String key() { + return msgType.name().toLowerCase(Locale.ROOT); + } + + public MsgType getMsgType() { + return msgType; + } + + public SenderType getSenderType() { + return senderType; + } + + public String getCommitTime() { + return commitTime; + } + + public byte[] getSenderPartition() { + return senderPartition; + } + + public TopicPartition senderPartition() { + return SerializationUtils.deserialize(senderPartition); + } + + public CoordinatorInfo getCoordinatorInfo() { + return coordinatorInfo; + } + + public ParticipantInfo getParticipantInfo() { + return participantInfo; + } + + public int getVersion() { + return version; + } + + @Override + public String toString() { + return String.format("%s %s %s %s %s %s", version, msgType.name(), commitTime, + Arrays.toString(senderPartition), coordinatorInfo.toString(), participantInfo.toString()); + } + + /** + * Builder that helps build {@link ControlEvent}. 
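+ *
+ * <p>For example, the coordinator in this patch builds its START_COMMIT event roughly as
+ * follows (note the Builder constructor serializes the sender partition and may throw an
+ * {@code IOException}):
+ * <pre>{@code
+ *   ControlEvent message = new ControlEvent.Builder(
+ *           ControlEvent.MsgType.START_COMMIT,
+ *           ControlEvent.SenderType.COORDINATOR,
+ *           currentCommitTime,
+ *           partition)
+ *       .setCoordinatorInfo(new ControlEvent.CoordinatorInfo(globalCommittedKafkaOffsets))
+ *       .build();
+ * }</pre>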
+ */ + public static class Builder { + + private final MsgType msgType; + private SenderType senderType; + private final String commitTime; + private final byte[] senderPartition; + private CoordinatorInfo coordinatorInfo; + private ParticipantInfo participantInfo; + + public Builder(MsgType msgType, SenderType senderType, String commitTime, TopicPartition senderPartition) throws IOException { + this.msgType = msgType; + this.senderType = senderType; + this.commitTime = commitTime; + this.senderPartition = SerializationUtils.serialize(senderPartition); + } + + public Builder setCoordinatorInfo(CoordinatorInfo coordinatorInfo) { + this.coordinatorInfo = coordinatorInfo; + return this; + } + + public Builder setParticipantInfo(ParticipantInfo participantInfo) { + this.participantInfo = participantInfo; + return this; + } + + public ControlEvent build() { + return new ControlEvent(msgType, senderType, commitTime, senderPartition, coordinatorInfo, participantInfo); + } + } + + /** + * The info sent by the {@link TransactionCoordinator} to one or more + * {@link TransactionParticipant}s. + */ + public static class CoordinatorInfo implements Serializable { + + private Map globalKafkaCommitOffsets; + + public CoordinatorInfo() { + } + + public CoordinatorInfo(Map globalKafkaCommitOffsets) { + this.globalKafkaCommitOffsets = globalKafkaCommitOffsets; + } + + public Map getGlobalKafkaCommitOffsets() { + return (globalKafkaCommitOffsets == null) ? new HashMap<>() : globalKafkaCommitOffsets; + } + } + + /** + * The info sent by a {@link TransactionParticipant} instances to the + * {@link TransactionCoordinator}. + */ + public static class ParticipantInfo implements Serializable { + + private byte[] writeStatusList; + private long kafkaCommitOffset; + private OutcomeType outcomeType; + + public ParticipantInfo() { + } + + public ParticipantInfo(List writeStatuses, long kafkaCommitOffset, OutcomeType outcomeType) throws IOException { + this.writeStatusList = SerializationUtils.serialize(writeStatuses); + this.kafkaCommitOffset = kafkaCommitOffset; + this.outcomeType = outcomeType; + } + + public byte[] getWriteStatusList() { + return writeStatusList; + } + + public List writeStatuses() { + return SerializationUtils.deserialize(writeStatusList); + } + + public long getKafkaCommitOffset() { + return kafkaCommitOffset; + } + + public OutcomeType getOutcomeType() { + return outcomeType; + } + } + + /** + * Type of Control Event. + */ + public enum MsgType { + START_COMMIT, + END_COMMIT, + ACK_COMMIT, + WRITE_STATUS, + } + + public enum SenderType { + COORDINATOR, + PARTICIPANT + } + + public enum OutcomeType { + WRITE_SUCCESS, + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/CoordinatorEvent.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/CoordinatorEvent.java new file mode 100644 index 000000000..a0e2654cd --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/CoordinatorEvent.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.transaction; + +/** + * The events within the Coordinator that trigger + * the state changes in the state machine of + * the Coordinator. + */ +public class CoordinatorEvent { + + private final CoordinatorEventType eventType; + private final String topicName; + private final String commitTime; + private ControlEvent message; + + public CoordinatorEvent(CoordinatorEventType eventType, + String topicName, + String commitTime) { + this.eventType = eventType; + this.topicName = topicName; + this.commitTime = commitTime; + } + + public CoordinatorEventType getEventType() { + return eventType; + } + + public String getTopicName() { + return topicName; + } + + public String getCommitTime() { + return commitTime; + } + + public ControlEvent getMessage() { + return message; + } + + public void setMessage(ControlEvent message) { + this.message = message; + } + + /** + * The type of Coordinator Event. + */ + public enum CoordinatorEventType { + START_COMMIT, + END_COMMIT, + WRITE_STATUS, + ACK_COMMIT, + WRITE_STATUS_TIMEOUT + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionCoordinator.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionCoordinator.java new file mode 100644 index 000000000..04f8a2e3c --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionCoordinator.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.transaction; + +import org.apache.kafka.common.TopicPartition; + +/** + * The Base Coordinator that + * coordinates the write transactions + * across all the Kafka partitions, that + * are managed by the {@link TransactionParticipant}. 
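+ *
+ * <p>As implemented by {@link ConnectTransactionCoordinator} in this patch, the expected call
+ * sequence is: {@link #start()} once when the leader task comes up, {@link #processControlEvent(ControlEvent)}
+ * for every WRITE_STATUS message read back from the control topic, and {@link #stop()} on shutdown.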
+ */ +public interface TransactionCoordinator { + + void start(); + + void stop(); + + /* Kafka Topic that this Coordinator belongs to */ + TopicPartition getPartition(); + + /* Called when a control event is received from the Kafka control topic */ + void processControlEvent(ControlEvent message); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionInfo.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionInfo.java new file mode 100644 index 000000000..9c7bbf1e8 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionInfo.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.transaction; + +import org.apache.hudi.connect.writers.ConnectWriter; + +/** + * Stores all the state for the current Transaction within a + * {@link TransactionParticipant}. + * @param The type of status returned by the underlying writer. + */ +public class TransactionInfo { + + private final String commitTime; + private final ConnectWriter writer; + private long lastWrittenKafkaOffset; + private boolean commitInitiated; + + public TransactionInfo(String commitTime, ConnectWriter writer) { + this.commitTime = commitTime; + this.writer = writer; + this.lastWrittenKafkaOffset = 0; + this.commitInitiated = false; + } + + public String getCommitTime() { + return commitTime; + } + + public ConnectWriter getWriter() { + return writer; + } + + public long getLastWrittenKafkaOffset() { + return lastWrittenKafkaOffset; + } + + public boolean isCommitInitiated() { + return commitInitiated; + } + + public void setLastWrittenKafkaOffset(long lastWrittenKafkaOffset) { + this.lastWrittenKafkaOffset = lastWrittenKafkaOffset; + } + + public void commitInitiated() { + this.commitInitiated = true; + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionParticipant.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionParticipant.java new file mode 100644 index 000000000..0179f3b71 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/transaction/TransactionParticipant.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.transaction; + +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.sink.SinkRecord; + +/** + * Interface for the Participant that + * manages Writes for a + * single Kafka partition, based on + * coordination signals from the {@link TransactionCoordinator}. + */ +public interface TransactionParticipant { + + void start(); + + void stop(); + + void buffer(SinkRecord record); + + void processRecords(); + + TopicPartition getPartition(); + + void processControlEvent(ControlEvent message); + + long getLastKafkaCommittedOffset(); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java new file mode 100644 index 000000000..593cfb124 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/utils/KafkaConnectUtils.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.utils; + +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.keygen.BaseKeyGenerator; +import org.apache.hudi.keygen.CustomAvroKeyGenerator; +import org.apache.hudi.keygen.CustomKeyGenerator; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.constant.KeyGeneratorOptions; + +import org.apache.hadoop.conf.Configuration; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.DescribeTopicsResult; +import org.apache.kafka.clients.admin.TopicDescription; +import org.apache.kafka.common.KafkaFuture; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Arrays; +import java.util.Map; +import java.util.Properties; +import java.util.stream.Collectors; + +/** + * Helper methods for Kafka. 
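+ *
+ * <p>For example (illustrative broker address and topic name):
+ * <pre>{@code
+ *   int numPartitions = KafkaConnectUtils.getLatestNumPartitions("localhost:9092", "impressions");
+ * }</pre>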
+ */
+public class KafkaConnectUtils {
+
+  private static final Logger LOG = LogManager.getLogger(KafkaConnectUtils.class);
+
+  public static int getLatestNumPartitions(String bootstrapServers, String topicName) {
+    Properties props = new Properties();
+    props.put("bootstrap.servers", bootstrapServers);
+    try {
+      AdminClient client = AdminClient.create(props);
+      DescribeTopicsResult result = client.describeTopics(Arrays.asList(topicName));
+      Map<String, KafkaFuture<TopicDescription>> values = result.values();
+      KafkaFuture<TopicDescription> topicDescription = values.get(topicName);
+      int numPartitions = topicDescription.get().partitions().size();
+      LOG.info(String.format("Latest number of partitions for topic %s is %s", topicName, numPartitions));
+      return numPartitions;
+    } catch (Exception exception) {
+      throw new HoodieException("Fatal error fetching the latest partitions of kafka topic " + topicName, exception);
+    }
+  }
+
+  /**
+   * Returns the default Hadoop Configuration.
+   *
+   * @return the default Hadoop {@link Configuration} with the local file system configured.
+   */
+  public static Configuration getDefaultHadoopConf() {
+    Configuration hadoopConf = new Configuration();
+    hadoopConf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+    return hadoopConf;
+  }
+
+  /**
+   * Extract the record key fields.
+   *
+   * @param keyGenerator instance of the key generator.
+   * @return the record key columns separated by commas.
+   */
+  public static String getRecordKeyColumns(KeyGenerator keyGenerator) {
+    return String.join(",", keyGenerator.getRecordKeyFieldNames());
+  }
+
+  /**
+   * Extract partition columns directly if an instance of class {@link BaseKeyGenerator},
+   * else extract partition columns from the properties.
+   *
+   * @param keyGenerator instance of the key generator.
+   * @param typedProperties properties from the config.
+   * @return the partition columns separated by commas.
+   */
+  public static String getPartitionColumns(KeyGenerator keyGenerator, TypedProperties typedProperties) {
+    if (keyGenerator instanceof CustomKeyGenerator || keyGenerator instanceof CustomAvroKeyGenerator) {
+      return ((BaseKeyGenerator) keyGenerator).getPartitionPathFields().stream().map(
+          pathField -> Arrays.stream(pathField.split(CustomAvroKeyGenerator.SPLIT_REGEX))
+              .findFirst().orElse("Illegal partition path field format: '$pathField' for ${c.getClass.getSimpleName}"))
+          .collect(Collectors.joining(","));
+    }
+
+    if (keyGenerator instanceof BaseKeyGenerator) {
+      return String.join(",", ((BaseKeyGenerator) keyGenerator).getPartitionPathFields());
+    }
+
+    return typedProperties.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key());
+  }
+
+  /**
+   * Get the metadata from the latest commit file.
+   *
+   * @param metaClient The {@link HoodieTableMetaClient} to get access to the meta data.
+   * @return An Optional {@link HoodieCommitMetadata} containing the meta data from the latest commit file.
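+ *
+ * <p>One way a caller might consume the result (illustrative sketch; {@code getExtraMetadata}
+ * is the accessor on {@link HoodieCommitMetadata} for the extra metadata map):
+ * <pre>{@code
+ *   Option<HoodieCommitMetadata> metadata = KafkaConnectUtils.getCommitMetadataForLatestInstant(metaClient);
+ *   Map<String, String> extraMetadata = metadata.isPresent()
+ *       ? metadata.get().getExtraMetadata()
+ *       : Collections.emptyMap();
+ * }</pre>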
+ */ + public static Option getCommitMetadataForLatestInstant(HoodieTableMetaClient metaClient) { + HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline() + .filterCompletedInstants() + .filter(instant -> (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE && instant.getAction().equals(HoodieActiveTimeline.COMMIT_ACTION)) + || (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ && instant.getAction().equals(HoodieActiveTimeline.DELTA_COMMIT_ACTION)) + ); + Option latestInstant = timeline.lastInstant(); + if (latestInstant.isPresent()) { + try { + byte[] data = timeline.getInstantDetails(latestInstant.get()).get(); + return Option.of(HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class)); + } catch (Exception e) { + throw new HoodieException("Failed to read schema from commit metadata", e); + } + } else { + return Option.empty(); + } + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java new file mode 100644 index 000000000..c958b2b48 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/AbstractConnectWriter.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.schema.SchemaProvider; +import org.apache.hudi.utilities.sources.helpers.AvroConvertor; + +import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.List; + +/** + * Base Hudi Writer that manages reading the raw Kafka records and + * converting them to {@link HoodieRecord}s that can be written to Hudi by + * the derived implementations of this class. 
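+ *
+ * <p>A minimal sketch of a derived writer (hypothetical class, for illustration only; the
+ * concrete implementation in this patch is {@code BufferedConnectWriter}):
+ * <pre>{@code
+ *   public class InMemoryConnectWriter extends AbstractConnectWriter {
+ *     private final List<HoodieRecord> buffered = new ArrayList<>();
+ *
+ *     public InMemoryConnectWriter(KafkaConnectConfigs configs,
+ *                                  KeyGenerator keyGenerator,
+ *                                  SchemaProvider schemaProvider) {
+ *       super(configs, keyGenerator, schemaProvider);
+ *     }
+ *
+ *     @Override
+ *     protected void writeHudiRecord(HoodieRecord record) {
+ *       buffered.add(record); // buffer until the transaction is closed
+ *     }
+ *
+ *     @Override
+ *     protected List<WriteStatus> flushHudiRecords() {
+ *       // hand the buffered records to a write client and return its statuses
+ *       return Collections.emptyList();
+ *     }
+ *   }
+ * }</pre>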
+ */ +public abstract class AbstractConnectWriter implements ConnectWriter { + + public static final String KAFKA_AVRO_CONVERTER = "io.confluent.connect.avro.AvroConverter"; + public static final String KAFKA_JSON_CONVERTER = "org.apache.kafka.connect.json.JsonConverter"; + public static final String KAFKA_STRING_CONVERTER = "org.apache.kafka.connect.storage.StringConverter"; + private static final Logger LOG = LogManager.getLogger(AbstractConnectWriter.class); + + private final KafkaConnectConfigs connectConfigs; + private final KeyGenerator keyGenerator; + private final SchemaProvider schemaProvider; + + public AbstractConnectWriter(KafkaConnectConfigs connectConfigs, + KeyGenerator keyGenerator, + SchemaProvider schemaProvider) { + this.connectConfigs = connectConfigs; + this.keyGenerator = keyGenerator; + this.schemaProvider = schemaProvider; + } + + @Override + public void writeRecord(SinkRecord record) throws IOException { + AvroConvertor convertor = new AvroConvertor(schemaProvider.getSourceSchema()); + Option avroRecord; + switch (connectConfigs.getKafkaValueConverter()) { + case KAFKA_AVRO_CONVERTER: + avroRecord = Option.of((GenericRecord) record.value()); + break; + case KAFKA_STRING_CONVERTER: + avroRecord = Option.of(convertor.fromJson((String) record.value())); + break; + case KAFKA_JSON_CONVERTER: + throw new UnsupportedEncodingException("Currently JSON objects are not supported"); + default: + throw new IOException("Unsupported Kafka Format type (" + connectConfigs.getKafkaValueConverter() + ")"); + } + + HoodieRecord hoodieRecord = new HoodieRecord<>(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord)); + writeHudiRecord(hoodieRecord); + } + + @Override + public List close() { + return flushHudiRecords(); + } + + protected abstract void writeHudiRecord(HoodieRecord record); + + protected abstract List flushHudiRecords(); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/BufferedConnectWriter.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/BufferedConnectWriter.java new file mode 100644 index 000000000..3319604b5 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/BufferedConnectWriter.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.DefaultSizeEstimator; +import org.apache.hudi.common.util.HoodieRecordSizeEstimator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.collection.ExternalSpillableMap; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.IOUtils; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.schema.SchemaProvider; + +import org.apache.avro.Schema; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +/** + * Specific implementation of a Hudi Writer that buffers all incoming records, + * and writes them to Hudi files on the end of a transaction using Bulk Insert. + */ +public class BufferedConnectWriter extends AbstractConnectWriter { + + private static final Logger LOG = LogManager.getLogger(BufferedConnectWriter.class); + + private final HoodieEngineContext context; + private final HoodieJavaWriteClient writeClient; + private final String instantTime; + private final HoodieWriteConfig config; + private ExternalSpillableMap> bufferedRecords; + + public BufferedConnectWriter(HoodieEngineContext context, + HoodieJavaWriteClient writeClient, + String instantTime, + KafkaConnectConfigs connectConfigs, + HoodieWriteConfig config, + KeyGenerator keyGenerator, + SchemaProvider schemaProvider) { + super(connectConfigs, keyGenerator, schemaProvider); + this.context = context; + this.writeClient = writeClient; + this.instantTime = instantTime; + this.config = config; + init(); + } + + private void init() { + try { + // Load and batch all incoming records in a map + long memoryForMerge = IOUtils.getMaxMemoryPerPartitionMerge(context.getTaskContextSupplier(), config); + LOG.info("MaxMemoryPerPartitionMerge => " + memoryForMerge); + this.bufferedRecords = new ExternalSpillableMap<>(memoryForMerge, + config.getSpillableMapBasePath(), + new DefaultSizeEstimator(), + new HoodieRecordSizeEstimator(new Schema.Parser().parse(config.getSchema())), + config.getCommonConfig().getSpillableDiskMapType(), + config.getCommonConfig().isBitCaskDiskMapCompressionEnabled()); + } catch (IOException io) { + throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io); + } + } + + @Override + public void writeHudiRecord(HoodieRecord record) { + bufferedRecords.put(record.getRecordKey(), record); + } + + @Override + public List flushHudiRecords() { + try { + LOG.info("Number of entries in MemoryBasedMap => " + + bufferedRecords.getInMemoryMapNumEntries() + + "Total size in bytes of MemoryBasedMap => " + + bufferedRecords.getCurrentInMemoryMapSize() + "Number of entries in BitCaskDiskMap => " + + bufferedRecords.getDiskBasedMapNumEntries() + "Size of file spilled to disk => " + + bufferedRecords.getSizeOfFileOnDiskInBytes()); + List writeStatuses = new ArrayList<>(); + // Write out all records if non-empty + if (!bufferedRecords.isEmpty()) { + writeStatuses = writeClient.bulkInsertPreppedRecords( + bufferedRecords.values().stream().collect(Collectors.toList()), + 
instantTime, Option.empty()); + } + bufferedRecords.close(); + LOG.info("Flushed hudi records and got writeStatuses: " + + writeStatuses); + return writeStatuses; + } catch (Exception e) { + throw new HoodieException("Write records failed", e); + } + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectTransactionServices.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectTransactionServices.java new file mode 100644 index 000000000..b36e1f1c7 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectTransactionServices.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.connect.transaction.TransactionCoordinator; + +import java.util.List; +import java.util.Map; + +/** + * Transaction service APIs used by + * {@link TransactionCoordinator}. + */ +public interface ConnectTransactionServices { + + String startCommit(); + + void endCommit(String commitTime, List writeStatuses, Map extraMetadata); + + Map fetchLatestExtraCommitMetadata(); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriter.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriter.java new file mode 100644 index 000000000..a90d72a45 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriter.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect.writers; + +import org.apache.kafka.connect.sink.SinkRecord; + +import java.io.IOException; +import java.util.List; + +public interface ConnectWriter { + + void writeRecord(SinkRecord record) throws IOException; + + List close() throws IOException; +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriterProvider.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriterProvider.java new file mode 100644 index 000000000..87deedc8c --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/ConnectWriterProvider.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +public interface ConnectWriterProvider { + + ConnectWriter getWriter(String commitTime); +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java new file mode 100644 index 000000000..ae6b5d1d3 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectConfigs.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.common.config.ConfigClassProperty; +import org.apache.hudi.common.config.ConfigGroups; +import org.apache.hudi.common.config.ConfigProperty; +import org.apache.hudi.common.config.HoodieConfig; +import org.apache.hudi.hive.HiveSyncTool; +import org.apache.hudi.schema.FilebasedSchemaProvider; + +import javax.annotation.concurrent.Immutable; + +import java.util.Map; +import java.util.Properties; + +/** + * Class storing configs for the HoodieWriteClient. 
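+ *
+ * <p>Typically populated from the connector/task properties via {@code Builder#withProperties};
+ * an illustrative programmatic build (values shown are the documented defaults):
+ * <pre>{@code
+ *   KafkaConnectConfigs configs = KafkaConnectConfigs.newBuilder()
+ *       .withBootstrapServers("localhost:9092")
+ *       .withControlTopicName("hudi-control-topic")
+ *       .withCommitIntervalSecs(60L)
+ *       .withCoordinatorWriteTimeoutSecs(60L)
+ *       .build();
+ * }</pre>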
+ */
+@Immutable
+@ConfigClassProperty(name = "Kafka Sink Connect Configurations",
+    groupName = ConfigGroups.Names.KAFKA_CONNECT,
+    description = "Configurations for Kafka Connect Sink Connector for Hudi.")
+public class KafkaConnectConfigs extends HoodieConfig {
+
+  public static final String KAFKA_VALUE_CONVERTER = "value.converter";
+
+  public static final ConfigProperty KAFKA_BOOTSTRAP_SERVERS = ConfigProperty
+      .key("bootstrap.servers")
+      .defaultValue("localhost:9092")
+      .withDocumentation("The bootstrap servers for the Kafka cluster.");
+
+  public static final ConfigProperty CONTROL_TOPIC_NAME = ConfigProperty
+      .key("hoodie.kafka.control.topic")
+      .defaultValue("hudi-control-topic")
+      .withDocumentation("Kafka topic name used by the Hudi Sink Connector for "
+          + "sending and receiving control messages. Not used for data records.");
+
+  public static final ConfigProperty SCHEMA_PROVIDER_CLASS = ConfigProperty
+      .key("hoodie.schemaprovider.class")
+      .defaultValue(FilebasedSchemaProvider.class.getName())
+      .withDocumentation("Subclass of org.apache.hudi.schema.SchemaProvider "
+          + "to attach schemas to input & target table data; built-in option: "
+          + "org.apache.hudi.schema.FilebasedSchemaProvider.");
+
+  public static final ConfigProperty COMMIT_INTERVAL_SECS = ConfigProperty
+      .key("hoodie.kafka.commit.interval.secs")
+      .defaultValue("60")
+      .withDocumentation("The interval at which Hudi will commit the records written "
+          + "to the files, making them consumable on the read-side.");
+
+  public static final ConfigProperty COORDINATOR_WRITE_TIMEOUT_SECS = ConfigProperty
+      .key("hoodie.kafka.coordinator.write.timeout.secs")
+      .defaultValue("60")
+      .withDocumentation("The timeout, after sending an END_COMMIT, for which the coordinator "
+          + "waits for the write statuses from all the partitions before it ignores "
+          + "the current commit and starts a new commit.");
+
+  public static final ConfigProperty META_SYNC_ENABLE = ConfigProperty
+      .key("hoodie.meta.sync.enable")
+      .defaultValue("false")
+      .withDocumentation("Enable Meta Sync, such as Hive sync.");
+
+  public static final ConfigProperty META_SYNC_CLASSES = ConfigProperty
+      .key("hoodie.meta.sync.classes")
+      .defaultValue(HiveSyncTool.class.getName())
+      .withDocumentation("Meta sync client tool(s); use a comma to separate multiple tools.");
+
+  protected KafkaConnectConfigs() {
+    super();
+  }
+
+  protected KafkaConnectConfigs(Properties props) {
+    super(props);
+  }
+
+  public static KafkaConnectConfigs.Builder newBuilder() {
+    return new KafkaConnectConfigs.Builder();
+  }
+
+  public String getBootstrapServers() {
+    return getString(KAFKA_BOOTSTRAP_SERVERS);
+  }
+
+  public String getControlTopicName() {
+    return getString(CONTROL_TOPIC_NAME);
+  }
+
+  public String getSchemaProviderClass() {
+    return getString(SCHEMA_PROVIDER_CLASS);
+  }
+
+  public Long getCommitIntervalSecs() {
+    return getLong(COMMIT_INTERVAL_SECS);
+  }
+
+  public Long getCoordinatorWriteTimeoutSecs() {
+    return getLong(COORDINATOR_WRITE_TIMEOUT_SECS);
+  }
+
+  public String getKafkaValueConverter() {
+    return getString(KAFKA_VALUE_CONVERTER);
+  }
+
+  public Boolean isMetaSyncEnabled() {
+    return getBoolean(META_SYNC_ENABLE);
+  }
+
+  public String getMetaSyncClasses() {
+    return getString(META_SYNC_CLASSES);
+  }
+
+  public static class Builder {
+
+    protected final KafkaConnectConfigs connectConfigs = new KafkaConnectConfigs();
+
+    public Builder withBootstrapServers(String bootstrapServers) {
connectConfigs.setValue(KAFKA_BOOTSTRAP_SERVERS, bootstrapServers); + return this; + } + + public Builder withControlTopicName(String controlTopicName) { + connectConfigs.setValue(CONTROL_TOPIC_NAME, controlTopicName); + return this; + } + + public Builder withCommitIntervalSecs(Long commitIntervalSecs) { + connectConfigs.setValue(COMMIT_INTERVAL_SECS, String.valueOf(commitIntervalSecs)); + return this; + } + + public Builder withCoordinatorWriteTimeoutSecs(Long coordinatorWriteTimeoutSecs) { + connectConfigs.setValue(COORDINATOR_WRITE_TIMEOUT_SECS, String.valueOf(coordinatorWriteTimeoutSecs)); + return this; + } + + // Kafka connect task are passed with props with type Map<> + public Builder withProperties(Map properties) { + connectConfigs.getProps().putAll(properties); + return this; + } + + public Builder withProperties(Properties properties) { + connectConfigs.getProps().putAll(properties); + return this; + } + + protected void setDefaults() { + // Check for mandatory properties + connectConfigs.setDefaults(KafkaConnectConfigs.class.getName()); + } + + public KafkaConnectConfigs build() { + setDefaults(); + // Build HudiConnectConfigs at the end + return new KafkaConnectConfigs(connectConfigs.getProps()); + } + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java new file mode 100644 index 000000000..ad40ebcb7 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectTransactionServices.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieCommitMetadata; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +/** + * Implementation of Transaction service APIs used by + * {@link TransactionCoordinator} + * using {@link HoodieJavaWriteClient}. + */ +public class KafkaConnectTransactionServices implements ConnectTransactionServices { + + private static final Logger LOG = LogManager.getLogger(KafkaConnectTransactionServices.class); + private static final String TABLE_FORMAT = "PARQUET"; + + private final Option tableMetaClient; + private final Configuration hadoopConf; + private final FileSystem fs; + private final String tableBasePath; + private final String tableName; + private final HoodieEngineContext context; + + private final HoodieJavaWriteClient javaClient; + + public KafkaConnectTransactionServices( + KafkaConnectConfigs connectConfigs) throws HoodieException { + HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder() + .withProperties(connectConfigs.getProps()).build(); + + tableBasePath = writeConfig.getBasePath(); + tableName = writeConfig.getTableName(); + hadoopConf = KafkaConnectUtils.getDefaultHadoopConf(); + context = new HoodieJavaEngineContext(hadoopConf); + fs = FSUtils.getFs(tableBasePath, hadoopConf); + + try { + KeyGenerator keyGenerator = HoodieAvroKeyGeneratorFactory.createKeyGenerator( + new TypedProperties(connectConfigs.getProps())); + + String recordKeyFields = KafkaConnectUtils.getRecordKeyColumns(keyGenerator); + String partitionColumns = KafkaConnectUtils.getPartitionColumns(keyGenerator, + new TypedProperties(connectConfigs.getProps())); + + LOG.info(String.format("Setting record key %s and partitionfields %s for table %s", + recordKeyFields, + partitionColumns, + tableBasePath + tableName)); + + tableMetaClient = Option.of(HoodieTableMetaClient.withPropertyBuilder() + .setTableType(HoodieTableType.COPY_ON_WRITE.name()) + .setTableName(tableName) + .setPayloadClassName(HoodieAvroPayload.class.getName()) + .setBaseFileFormat(TABLE_FORMAT) + .setRecordKeyFields(recordKeyFields) + .setPartitionFields(partitionColumns) + .setKeyGeneratorClassProp(writeConfig.getKeyGeneratorClass()) + .initTable(hadoopConf, tableBasePath)); + + javaClient = new HoodieJavaWriteClient<>(context, writeConfig); + } catch (Exception exception) { + throw new HoodieException("Fatal error instantiating Hudi Transaction 
Services ", exception); + } + } + + public String startCommit() { + String newCommitTime = javaClient.startCommit(); + javaClient.transitionInflight(newCommitTime); + LOG.info("Starting Hudi commit " + newCommitTime); + return newCommitTime; + } + + public void endCommit(String commitTime, List writeStatuses, Map extraMetadata) { + javaClient.commit(commitTime, writeStatuses, Option.of(extraMetadata), + HoodieActiveTimeline.COMMIT_ACTION, Collections.emptyMap()); + LOG.info("Ending Hudi commit " + commitTime); + } + + public Map fetchLatestExtraCommitMetadata() { + if (tableMetaClient.isPresent()) { + Option metadata = KafkaConnectUtils.getCommitMetadataForLatestInstant(tableMetaClient.get()); + if (metadata.isPresent()) { + return metadata.get().getExtraMetadata(); + } else { + LOG.info("Hoodie Extra Metadata from latest commit is absent"); + return Collections.emptyMap(); + } + } + throw new HoodieException("Fatal error retrieving Hoodie Extra Metadata since Table Meta Client is absent"); + } +} diff --git a/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectWriterProvider.java b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectWriterProvider.java new file mode 100644 index 000000000..9d007dd09 --- /dev/null +++ b/hudi-kafka-connect/src/main/java/org/apache/hudi/connect/writers/KafkaConnectWriterProvider.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.connect.writers; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.engine.HoodieEngineContext; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.util.ReflectionUtils; +import org.apache.hudi.common.util.StringUtils; +import org.apache.hudi.config.HoodieIndexConfig; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.connect.KafkaConnectFileIdPrefixProvider; +import org.apache.hudi.connect.utils.KafkaConnectUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.index.HoodieIndex; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory; +import org.apache.hudi.schema.SchemaProvider; + +import org.apache.hadoop.conf.Configuration; +import org.apache.kafka.common.TopicPartition; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.util.Collections; + +/** + * Provides the Hudi Writer for the {@link org.apache.hudi.connect.transaction.TransactionParticipant} + * to write the incoming records to Hudi. 
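+ *
+ * <p>Rough usage sketch; the topic, partition and commit time below are assumed
+ * values used only for illustration:
+ * <pre>{@code
+ *   KafkaConnectWriterProvider provider =
+ *       new KafkaConnectWriterProvider(connectConfigs, new TopicPartition("hudi-test-topic", 0));
+ *   AbstractConnectWriter writer = provider.getWriter("20211001120000");
+ *   writer.writeRecord(sinkRecord);       // buffer an incoming Kafka record
+ *   List writeStatuses = writer.close();  // flush and collect the Hudi write statuses
+ * }</pre>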
+ */ +public class KafkaConnectWriterProvider implements ConnectWriterProvider { + + private static final Logger LOG = LogManager.getLogger(KafkaConnectWriterProvider.class); + + private final KafkaConnectConfigs connectConfigs; + private final HoodieEngineContext context; + private final HoodieWriteConfig writeConfig; + private final HoodieJavaWriteClient hudiJavaClient; + private final KeyGenerator keyGenerator; + private final SchemaProvider schemaProvider; + + public KafkaConnectWriterProvider( + KafkaConnectConfigs connectConfigs, + TopicPartition partition) throws HoodieException { + this.connectConfigs = connectConfigs; + Configuration hadoopConf = KafkaConnectUtils.getDefaultHadoopConf(); + + try { + this.schemaProvider = StringUtils.isNullOrEmpty(connectConfigs.getSchemaProviderClass()) ? null + : (SchemaProvider) ReflectionUtils.loadClass(connectConfigs.getSchemaProviderClass(), + new TypedProperties(connectConfigs.getProps())); + + this.keyGenerator = HoodieAvroKeyGeneratorFactory.createKeyGenerator( + new TypedProperties(connectConfigs.getProps())); + + // Create the write client to write some records in + writeConfig = HoodieWriteConfig.newBuilder() + .withProperties(connectConfigs.getProps()) + .withFileIdPrefixProviderClassName(KafkaConnectFileIdPrefixProvider.class.getName()) + .withProps(Collections.singletonMap( + KafkaConnectFileIdPrefixProvider.KAFKA_CONNECT_PARTITION_ID, + String.valueOf(partition))) + .withSchema(schemaProvider.getSourceSchema().toString()) + .withAutoCommit(false) + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build()) + .build(); + + context = new HoodieJavaEngineContext(hadoopConf); + + hudiJavaClient = new HoodieJavaWriteClient<>(context, writeConfig); + } catch (Throwable e) { + throw new HoodieException("Fatal error instantiating Hudi Write Provider ", e); + } + } + + public AbstractConnectWriter getWriter(String commitTime) { + return new BufferedConnectWriter( + context, + hudiJavaClient, + commitTime, + connectConfigs, + writeConfig, + keyGenerator, + schemaProvider); + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionCoordinator.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionCoordinator.java new file mode 100644 index 000000000..21940ab43 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionCoordinator.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.connect.transaction.ConnectTransactionCoordinator; +import org.apache.hudi.connect.transaction.ControlEvent; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionParticipant; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.helper.MockConnectTransactionServices; +import org.apache.hudi.helper.MockKafkaControlAgent; + +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.sink.SinkRecord; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.mock; + +public class TestConnectTransactionCoordinator { + + private static final String TOPIC_NAME = "kafka-connect-test-topic"; + private static final int NUM_PARTITIONS = 4; + private static final int MAX_COMMIT_ROUNDS = 5; + private static final int TEST_TIMEOUT_SECS = 60; + + private KafkaConnectConfigs configs; + private MockParticipant participant; + private MockKafkaControlAgent kafkaControlAgent; + private MockConnectTransactionServices transactionServices; + private CountDownLatch latch; + + @BeforeEach + public void setUp() throws Exception { + transactionServices = new MockConnectTransactionServices(); + configs = KafkaConnectConfigs.newBuilder() + .withCommitIntervalSecs(1L) + .withCoordinatorWriteTimeoutSecs(1L) + .build(); + latch = new CountDownLatch(1); + } + + @ParameterizedTest + @EnumSource(value = MockParticipant.TestScenarios.class) + public void testSingleCommitScenario(MockParticipant.TestScenarios scenario) throws InterruptedException { + kafkaControlAgent = new MockKafkaControlAgent(); + participant = new MockParticipant(kafkaControlAgent, latch, scenario, MAX_COMMIT_ROUNDS); + participant.start(); + + // Test the coordinator using the mock participant + TransactionCoordinator coordinator = new ConnectTransactionCoordinator( + configs, + new TopicPartition(TOPIC_NAME, 0), + kafkaControlAgent, + transactionServices, + (bootstrapServers, topicName) -> NUM_PARTITIONS); + coordinator.start(); + + latch.await(TEST_TIMEOUT_SECS, TimeUnit.SECONDS); + + if (latch.getCount() > 0) { + throw new HoodieException("Test timedout resulting in failure"); + } + coordinator.stop(); + participant.stop(); + } + + /** + * A mock Transaction Participant, that exercises all the test scenarios + * for the coordinator as mentioned in {@link TestScenarios}. 
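+ *
+ * <p>Per commit round the mock walks through, roughly, the following control-message
+ * flow (an illustrative sketch of the protocol as exercised here, not a full state machine):
+ * <pre>
+ *   START_COMMIT -> END_COMMIT -> WRITE_STATUS (sent back by each participant) -> ACK_COMMIT
+ * </pre>
+ * For {@code SUBSET_CONNECT_TASKS_FAILED} only half of the partitions report a
+ * WRITE_STATUS, so the coordinator is expected to skip ACK_COMMIT and start a new
+ * commit instead.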
+ */ + private static class MockParticipant implements TransactionParticipant { + + private final MockKafkaControlAgent kafkaControlAgent; + private final TopicPartition partition; + private final CountDownLatch latch; + private final TestScenarios testScenario; + private final int maxNumberCommitRounds; + private final Map kafkaOffsetsCommitted; + + private ControlEvent.MsgType expectedMsgType; + private int numberCommitRounds; + + public MockParticipant(MockKafkaControlAgent kafkaControlAgent, + CountDownLatch latch, + TestScenarios testScenario, + int maxNumberCommitRounds) { + this.kafkaControlAgent = kafkaControlAgent; + this.latch = latch; + this.testScenario = testScenario; + this.maxNumberCommitRounds = maxNumberCommitRounds; + this.partition = new TopicPartition(TOPIC_NAME, (NUM_PARTITIONS - 1)); + this.kafkaOffsetsCommitted = new HashMap<>(); + expectedMsgType = ControlEvent.MsgType.START_COMMIT; + numberCommitRounds = 0; + } + + @Override + public void start() { + kafkaControlAgent.registerTransactionParticipant(this); + } + + @Override + public void stop() { + kafkaControlAgent.deregisterTransactionParticipant(this); + } + + @Override + public void buffer(SinkRecord record) { + } + + @Override + public void processRecords() { + } + + @Override + public TopicPartition getPartition() { + return partition; + } + + @Override + public void processControlEvent(ControlEvent message) { + assertEquals(message.getSenderType(), ControlEvent.SenderType.COORDINATOR); + assertEquals(message.senderPartition().topic(), partition.topic()); + testScenarios(message); + } + + @Override + public long getLastKafkaCommittedOffset() { + return 0; + } + + private void testScenarios(ControlEvent message) { + assertEquals(expectedMsgType, message.getMsgType()); + + switch (message.getMsgType()) { + case START_COMMIT: + expectedMsgType = ControlEvent.MsgType.END_COMMIT; + break; + case END_COMMIT: + assertEquals(kafkaOffsetsCommitted, message.getCoordinatorInfo().getGlobalKafkaCommitOffsets()); + int numSuccessPartitions; + Map kafkaOffsets = new HashMap<>(); + List controlEvents = new ArrayList<>(); + // Prepare the WriteStatuses for all partitions + for (int i = 1; i <= NUM_PARTITIONS; i++) { + try { + long kafkaOffset = (long) (Math.random() * 10000); + kafkaOffsets.put(i, kafkaOffset); + ControlEvent event = successWriteStatus( + message.getCommitTime(), + new TopicPartition(TOPIC_NAME, i), + kafkaOffset); + controlEvents.add(event); + } catch (Exception exception) { + throw new HoodieException("Fatal error sending control event to Coordinator"); + } + } + + switch (testScenario) { + case ALL_CONNECT_TASKS_SUCCESS: + numSuccessPartitions = NUM_PARTITIONS; + kafkaOffsetsCommitted.putAll(kafkaOffsets); + expectedMsgType = ControlEvent.MsgType.ACK_COMMIT; + break; + case SUBSET_CONNECT_TASKS_FAILED: + numSuccessPartitions = NUM_PARTITIONS / 2; + expectedMsgType = ControlEvent.MsgType.START_COMMIT; + break; + default: + throw new HoodieException("Unknown test scenario " + testScenario); + } + + // Send events based on test scenario + for (int i = 0; i < numSuccessPartitions; i++) { + kafkaControlAgent.publishMessage(controlEvents.get(i)); + } + break; + case ACK_COMMIT: + if (numberCommitRounds >= maxNumberCommitRounds) { + latch.countDown(); + } + expectedMsgType = ControlEvent.MsgType.START_COMMIT; + break; + default: + throw new HoodieException("Illegal control message type " + message.getMsgType()); + } + + if (message.getMsgType().equals(ControlEvent.MsgType.START_COMMIT)) { + if (numberCommitRounds 
>= maxNumberCommitRounds) { + latch.countDown(); + } + numberCommitRounds++; + expectedMsgType = ControlEvent.MsgType.END_COMMIT; + } + } + + public enum TestScenarios { + SUBSET_CONNECT_TASKS_FAILED, + ALL_CONNECT_TASKS_SUCCESS + } + + private static ControlEvent successWriteStatus(String commitTime, + TopicPartition partition, + long kafkaOffset) throws Exception { + // send WS + WriteStatus writeStatus = new WriteStatus(); + WriteStatus status = new WriteStatus(false, 1.0); + for (int i = 0; i < 1000; i++) { + status.markSuccess(mock(HoodieRecord.class), Option.empty()); + } + return new ControlEvent.Builder(ControlEvent.MsgType.WRITE_STATUS, + ControlEvent.SenderType.PARTICIPANT, + commitTime, + partition) + .setParticipantInfo(new ControlEvent.ParticipantInfo( + Collections.singletonList(writeStatus), + kafkaOffset, + ControlEvent.OutcomeType.WRITE_SUCCESS)) + .build(); + } + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionParticipant.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionParticipant.java new file mode 100644 index 000000000..900ba46f7 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/connect/TestConnectTransactionParticipant.java @@ -0,0 +1,273 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.connect; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.connect.kafka.KafkaControlAgent; +import org.apache.hudi.connect.transaction.ConnectTransactionParticipant; +import org.apache.hudi.connect.transaction.ControlEvent; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.helper.MockKafkaControlAgent; +import org.apache.hudi.helper.TestHudiWriterProvider; +import org.apache.hudi.helper.TestKafkaConnect; + +import org.apache.kafka.common.TopicPartition; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.util.Collections; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestConnectTransactionParticipant { + + private static final String TOPIC_NAME = "kafka-connect-test-topic"; + private static final int PARTITION_NUMBER = 4; + + private ConnectTransactionParticipant participant; + private MockCoordinator coordinator; + private TopicPartition partition; + private KafkaConnectConfigs configs; + private KafkaControlAgent kafkaControlAgent; + private TestHudiWriterProvider testHudiWriterProvider; + private TestKafkaConnect testKafkaConnect; + + @BeforeEach + public void setUp() throws Exception { + partition = new TopicPartition(TOPIC_NAME, PARTITION_NUMBER); + kafkaControlAgent = new MockKafkaControlAgent(); + testKafkaConnect = new TestKafkaConnect(partition); + coordinator = new MockCoordinator(kafkaControlAgent); + coordinator.start(); + configs = KafkaConnectConfigs.newBuilder() + .build(); + initializeParticipant(); + } + + @ParameterizedTest + @EnumSource(value = CoordinatorFailureTestScenarios.class) + public void testAllCoordinatorFailureScenarios(CoordinatorFailureTestScenarios testScenario) { + int expectedRecordsWritten = 0; + switch (testScenario) { + case REGULAR_SCENARIO: + expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isPaused()); + break; + case COORDINATOR_FAILED_AFTER_START_COMMIT: + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + // Coordinator Failed + initializeCoordinator(); + break; + case COORDINATOR_FAILED_AFTER_END_COMMIT: + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); + expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); + // Coordinator Failed + initializeCoordinator(); + break; + default: + throw new HoodieException("Unknown test scenario " + testScenario); + } + + // Regular Case or Coordinator Recovery Case + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); + expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isResumed()); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isPaused()); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + 
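+    // The writer should have received exactly the records expected for this scenario,
+    // and the participant's last committed Kafka offset should line up with the coordinator's.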
assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); + // Ensure Coordinator and participant are in sync in the kafka offsets + assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); + + participant.stop(); + } + + @ParameterizedTest + @EnumSource(value = ParticipantFailureTestScenarios.class) + public void testAllParticipantFailureScenarios(ParticipantFailureTestScenarios testScenario) { + int expectedRecordsWritten = 0; + switch (testScenario) { + case FAILURE_BEFORE_START_COMMIT: + testKafkaConnect.putRecordsToParticipant(); + // Participant fails + initializeParticipant(); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); + expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isResumed()); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isPaused()); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); + // Ensure Coordinator and participant are in sync in the kafka offsets + assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); + break; + case FAILURE_AFTER_START_COMMIT: + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + // Participant fails + initializeParticipant(); + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isPaused()); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); + // Ensure Coordinator and participant are in sync in the kafka offsets + assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); + break; + case FAILURE_AFTER_END_COMMIT: + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + // Participant fails + initializeParticipant(); + testKafkaConnect.putRecordsToParticipant(); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertTrue(testKafkaConnect.isPaused()); + coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT); + testKafkaConnect.putRecordsToParticipant(); + assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten); + // Ensure Coordinator and participant are in sync in the kafka offsets + assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset()); + break; + default: + throw new HoodieException("Unknown test scenario " + testScenario); + } + } + + private void initializeParticipant() { + testHudiWriterProvider = new TestHudiWriterProvider(); + participant = new ConnectTransactionParticipant( + partition, + kafkaControlAgent, + testKafkaConnect, + testHudiWriterProvider); + 
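+    // Register the participant with the mock Kafka Connect context so the test can
+    // push SinkRecords to it via putRecordsToParticipant().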
testKafkaConnect.setParticipant(participant); + participant.start(); + } + + private void initializeCoordinator() { + coordinator = new MockCoordinator(kafkaControlAgent); + coordinator.start(); + } + + private static class MockCoordinator implements TransactionCoordinator { + + private static int currentCommitTime; + + static { + currentCommitTime = 101; + } + + private final KafkaControlAgent kafkaControlAgent; + private final TopicPartition partition; + + private Option lastReceivedWriteStatusEvent; + private long committedKafkaOffset; + + public MockCoordinator(KafkaControlAgent kafkaControlAgent) { + this.kafkaControlAgent = kafkaControlAgent; + partition = new TopicPartition(TOPIC_NAME, 0); + lastReceivedWriteStatusEvent = Option.empty(); + committedKafkaOffset = 0L; + } + + public void sendEventFromCoordinator( + ControlEvent.MsgType type) { + try { + if (type.equals(ControlEvent.MsgType.START_COMMIT)) { + ++currentCommitTime; + } + kafkaControlAgent.publishMessage(new ControlEvent.Builder( + type, + ControlEvent.SenderType.COORDINATOR, + String.valueOf(currentCommitTime), + partition) + .setCoordinatorInfo(new ControlEvent.CoordinatorInfo( + Collections.singletonMap(PARTITION_NUMBER, committedKafkaOffset))) + .build()); + } catch (Exception exception) { + throw new HoodieException("Fatal error sending control event to Participant"); + } + } + + public Option getLastReceivedWriteStatusEvent() { + return lastReceivedWriteStatusEvent; + } + + public long getCommittedKafkaOffset() { + return committedKafkaOffset; + } + + @Override + public void start() { + kafkaControlAgent.registerTransactionCoordinator(this); + } + + @Override + public void stop() { + kafkaControlAgent.deregisterTransactionCoordinator(this); + } + + @Override + public TopicPartition getPartition() { + return partition; + } + + @Override + public void processControlEvent(ControlEvent message) { + if (message.getMsgType().equals(ControlEvent.MsgType.WRITE_STATUS)) { + lastReceivedWriteStatusEvent = Option.of(message); + assertTrue(message.getParticipantInfo().getKafkaCommitOffset() >= committedKafkaOffset); + committedKafkaOffset = message.getParticipantInfo().getKafkaCommitOffset(); + } + } + } + + private enum CoordinatorFailureTestScenarios { + REGULAR_SCENARIO, + COORDINATOR_FAILED_AFTER_START_COMMIT, + COORDINATOR_FAILED_AFTER_END_COMMIT, + } + + private enum ParticipantFailureTestScenarios { + FAILURE_BEFORE_START_COMMIT, + FAILURE_AFTER_START_COMMIT, + FAILURE_AFTER_END_COMMIT, + } + +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockConnectTransactionServices.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockConnectTransactionServices.java new file mode 100644 index 000000000..6994c6554 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockConnectTransactionServices.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.helper; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.connect.writers.ConnectTransactionServices; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * Helper class for {@link ConnectTransactionServices} to generate + * a unique commit time for testing purposes. + */ +public class MockConnectTransactionServices implements ConnectTransactionServices { + + private int commitTime; + + public MockConnectTransactionServices() { + commitTime = 100; + } + + @Override + public String startCommit() { + commitTime++; + return String.valueOf(commitTime); + } + + @Override + public void endCommit(String commitTime, List writeStatuses, Map extraMetadata) { + assertEquals(String.valueOf(this.commitTime), commitTime); + } + + @Override + public Map fetchLatestExtraCommitMetadata() { + return new HashMap<>(); + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaControlAgent.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaControlAgent.java new file mode 100644 index 000000000..529cd75fd --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/MockKafkaControlAgent.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.helper; + +import org.apache.hudi.connect.kafka.KafkaControlAgent; +import org.apache.hudi.connect.transaction.ControlEvent; +import org.apache.hudi.connect.transaction.TransactionCoordinator; +import org.apache.hudi.connect.transaction.TransactionParticipant; +import org.apache.hudi.exception.HoodieException; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * A mock Kafka Control Agent that supports the testing + * of a {@link TransactionCoordinator} with multiple + * instances of {@link TransactionParticipant}. 
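+ *
+ * <p>Minimal usage sketch (purely illustrative; the coordinator, participant and
+ * control event are assumed to come from the surrounding test):
+ * <pre>{@code
+ *   MockKafkaControlAgent agent = new MockKafkaControlAgent();
+ *   agent.registerTransactionCoordinator(coordinator);
+ *   agent.registerTransactionParticipant(participant);
+ *   // coordinator messages fan out to all participants registered for the topic,
+ *   // participant messages are routed back to that topic's coordinator
+ *   agent.publishMessage(controlEvent);
+ * }</pre>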
+ */ +public class MockKafkaControlAgent implements KafkaControlAgent { + + private final Map coordinators; + private final Map> participants; + + public MockKafkaControlAgent() { + coordinators = new HashMap<>(); + participants = new HashMap<>(); + } + + @Override + public void registerTransactionCoordinator(TransactionCoordinator coordinator) { + coordinators.put(coordinator.getPartition().topic(), coordinator); + } + + @Override + public void registerTransactionParticipant(TransactionParticipant participant) { + if (!participants.containsKey(participant.getPartition().topic())) { + participants.put(participant.getPartition().topic(), new ArrayList<>()); + } + participants.get(participant.getPartition().topic()).add(participant); + } + + @Override + public void deregisterTransactionCoordinator(TransactionCoordinator coordinator) { + coordinators.remove(coordinator.getPartition().topic()); + } + + @Override + public void deregisterTransactionParticipant(TransactionParticipant worker) { + if (participants.containsKey(worker.getPartition().topic())) { + participants.get(worker.getPartition().topic()).remove(worker); + } + } + + @Override + public void publishMessage(ControlEvent message) { + try { + String topic = message.senderPartition().topic(); + if (message.getSenderType().equals(ControlEvent.SenderType.COORDINATOR)) { + for (TransactionParticipant participant : participants.get(topic)) { + participant.processControlEvent(message); + } + } else { + coordinators.get(topic).processControlEvent(message); + } + } catch (Exception exception) { + throw new HoodieException("Fatal error trying to relay Kafka Control Messages for Testing."); + } + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestHudiWriterProvider.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestHudiWriterProvider.java new file mode 100644 index 000000000..45c9b0372 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestHudiWriterProvider.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.helper; + +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.connect.writers.ConnectWriter; +import org.apache.hudi.connect.writers.ConnectWriterProvider; + +import org.apache.kafka.connect.sink.SinkRecord; + +import java.util.List; + +/** + * Helper class the provides a Hudi writer and + * maintains stats that are used for test validation. + */ +public class TestHudiWriterProvider implements ConnectWriterProvider { + + private TestHudiWriter currentWriter; + + public TestHudiWriterProvider() { + } + + public int getLatestNumberWrites() { + return (currentWriter != null) ? 
currentWriter.numberRecords : 0; + } + + public boolean isClosed() { + return currentWriter == null || currentWriter.isClosed; + } + + @Override + public ConnectWriter getWriter(String commitTime) { + currentWriter = new TestHudiWriter(); + return currentWriter; + } + + private static class TestHudiWriter implements ConnectWriter { + + private int numberRecords; + private boolean isClosed; + + public TestHudiWriter() { + this.numberRecords = 0; + this.isClosed = false; + } + + public int getNumberRecords() { + return numberRecords; + } + + public boolean isClosed() { + return isClosed; + } + + @Override + public void writeRecord(SinkRecord record) { + numberRecords++; + } + + @Override + public List close() { + isClosed = false; + return null; + } + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestKafkaConnect.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestKafkaConnect.java new file mode 100644 index 000000000..953080921 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/helper/TestKafkaConnect.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.helper; + +import org.apache.hudi.connect.transaction.TransactionParticipant; + +import org.apache.kafka.common.TopicPartition; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.sink.SinkRecord; +import org.apache.kafka.connect.sink.SinkTaskContext; + +import java.util.Arrays; +import java.util.Map; +import java.util.Set; + +/** + * Helper class that emulates the Kafka Connect f/w and additionally + * implements {@link SinkTaskContext} for testing purposes. 
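+ *
+ * <p>Typical interaction in the participant tests (a sketch; the topic and partition
+ * are assumed values):
+ * <pre>{@code
+ *   TestKafkaConnect kafkaConnect = new TestKafkaConnect(new TopicPartition("hudi-test-topic", 0));
+ *   kafkaConnect.setParticipant(participant);
+ *   int numRecords = kafkaConnect.putRecordsToParticipant(); // buffers a small batch and hands it to the participant
+ *   boolean paused = kafkaConnect.isPaused();                // reflects pause()/resume() calls from the participant
+ * }</pre>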
+ */ +public class TestKafkaConnect implements SinkTaskContext { + + private static final int NUM_RECORDS_BATCH = 5; + private final TopicPartition testPartition; + + private TransactionParticipant participant; + private long currentKafkaOffset; + private boolean isPaused; + + public TestKafkaConnect(TopicPartition testPartition) { + this.testPartition = testPartition; + isPaused = false; + currentKafkaOffset = 0L; + } + + public void setParticipant(TransactionParticipant participant) { + this.participant = participant; + } + + public boolean isPaused() { + return isPaused; + } + + public boolean isResumed() { + return !isPaused; + } + + public int putRecordsToParticipant() { + for (int i = 1; i <= NUM_RECORDS_BATCH; i++) { + participant.buffer(getNextKafkaRecord()); + } + participant.processRecords(); + return NUM_RECORDS_BATCH; + } + + public SinkRecord getNextKafkaRecord() { + return new SinkRecord(testPartition.topic(), + testPartition.partition(), + Schema.OPTIONAL_BYTES_SCHEMA, + ("key-" + currentKafkaOffset).getBytes(), + Schema.OPTIONAL_BYTES_SCHEMA, + "value".getBytes(), currentKafkaOffset++); + } + + public long getCurrentKafkaOffset() { + return currentKafkaOffset; + } + + @Override + public void pause(TopicPartition... partitions) { + if (Arrays.stream(partitions).allMatch(testPartition::equals)) { + isPaused = true; + } + } + + @Override + public void resume(TopicPartition... partitions) { + if (Arrays.stream(partitions).allMatch(testPartition::equals)) { + isPaused = false; + } + } + + @Override + public void offset(Map offsets) { + for (TopicPartition tp : offsets.keySet()) { + if (tp.equals(testPartition)) { + currentKafkaOffset = offsets.get(tp); + } + } + } + + @Override + public void offset(TopicPartition tp, long offset) { + if (tp.equals(testPartition)) { + currentKafkaOffset = offset; + } + } + + @Override + public Map configs() { + return null; + } + + @Override + public void timeout(long timeoutMs) { + + } + + @Override + public Set assignment() { + return null; + } + + @Override + public void requestCommit() { + + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java new file mode 100644 index 000000000..3ca64c33d --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestAbstractConnectWriter.java @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.hudi.writers; + +import org.apache.hudi.avro.HoodieAvroUtils; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.config.TypedProperties; +import org.apache.hudi.common.model.HoodieAvroPayload; +import org.apache.hudi.common.model.HoodieKey; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.SchemaTestUtil; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.connect.writers.AbstractConnectWriter; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.keygen.KeyGenerator; +import org.apache.hudi.schema.SchemaProvider; + +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.avro.io.DecoderFactory; +import org.apache.kafka.connect.sink.SinkRecord; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class TestAbstractConnectWriter { + + private static final String TOPIC_NAME = "kafka-connect-test-topic"; + private static final int PARTITION_NUMBER = 4; + private static final int NUM_RECORDS = 10; + private static final int RECORD_KEY_INDEX = 0; + + private KafkaConnectConfigs configs; + private TestKeyGenerator keyGenerator; + private SchemaProvider schemaProvider; + private long currentKafkaOffset; + + @BeforeEach + public void setUp() throws Exception { + keyGenerator = new TestKeyGenerator(new TypedProperties()); + schemaProvider = new TestSchemaProvider(); + } + + @ParameterizedTest + @EnumSource(value = TestInputFormats.class) + public void testAbstractWriterForAllFormats(TestInputFormats inputFormats) throws Exception { + Schema schema = schemaProvider.getSourceSchema(); + List inputRecords; + List expectedRecords; + + String formatConverter; + switch (inputFormats) { + case JSON_STRING: + formatConverter = AbstractConnectWriter.KAFKA_STRING_CONVERTER; + GenericDatumReader reader = new GenericDatumReader<>(schema, schema); + inputRecords = SchemaTestUtil.generateTestJsonRecords(0, NUM_RECORDS); + expectedRecords = ((List) inputRecords).stream().map(s -> { + try { + return HoodieAvroUtils.rewriteRecord((GenericRecord) reader.read(null, DecoderFactory.get().jsonDecoder(schema, s)), + schema); + } catch (IOException exception) { + throw new HoodieException("Error converting JSON records to AVRO"); + } + }).map(p -> convertToHoodieRecords(p, p.get(RECORD_KEY_INDEX).toString(), "000/00/00")).collect(Collectors.toList()); + break; + case AVRO: + formatConverter = AbstractConnectWriter.KAFKA_AVRO_CONVERTER; + inputRecords = SchemaTestUtil.generateTestRecords(0, NUM_RECORDS); + expectedRecords = inputRecords.stream().map(s -> HoodieAvroUtils.rewriteRecord((GenericRecord) s, schema)) + .map(p -> convertToHoodieRecords(p, p.get(RECORD_KEY_INDEX).toString(), "000/00/00")).collect(Collectors.toList()); + break; + default: + throw new HoodieException("Unknown test scenario " + inputFormats); + } + + configs = KafkaConnectConfigs.newBuilder() + .withProperties( + 
Collections.singletonMap(KafkaConnectConfigs.KAFKA_VALUE_CONVERTER, formatConverter)) + .build(); + AbstractHudiConnectWriterTestWrapper writer = new AbstractHudiConnectWriterTestWrapper( + configs, + keyGenerator, + schemaProvider); + + for (int i = 0; i < NUM_RECORDS; i++) { + writer.writeRecord(getNextKafkaRecord(inputRecords.get(i))); + } + + validateRecords(writer.getWrittenRecords(), expectedRecords); + } + + private static void validateRecords(List actualRecords, List expectedRecords) { + assertEquals(actualRecords.size(), expectedRecords.size()); + + actualRecords.sort(Comparator.comparing(HoodieRecord::getRecordKey)); + expectedRecords.sort(Comparator.comparing(HoodieRecord::getRecordKey)); + + // iterate through the elements and compare them one by one using + // the provided comparator. + Iterator it1 = actualRecords.iterator(); + Iterator it2 = expectedRecords.iterator(); + while (it1.hasNext()) { + HoodieRecord t1 = it1.next(); + HoodieRecord t2 = it2.next(); + assertEquals(t1.getRecordKey(), t2.getRecordKey()); + } + } + + private SinkRecord getNextKafkaRecord(Object record) { + return new SinkRecord(TOPIC_NAME, PARTITION_NUMBER, + org.apache.kafka.connect.data.Schema.OPTIONAL_BYTES_SCHEMA, + ("key-" + currentKafkaOffset).getBytes(), + org.apache.kafka.connect.data.Schema.OPTIONAL_BYTES_SCHEMA, + record, currentKafkaOffset++); + } + + private static class AbstractHudiConnectWriterTestWrapper extends AbstractConnectWriter { + + private List writtenRecords; + + public AbstractHudiConnectWriterTestWrapper(KafkaConnectConfigs connectConfigs, KeyGenerator keyGenerator, SchemaProvider schemaProvider) { + super(connectConfigs, keyGenerator, schemaProvider); + writtenRecords = new ArrayList<>(); + } + + public List getWrittenRecords() { + return writtenRecords; + } + + @Override + protected void writeHudiRecord(HoodieRecord record) { + writtenRecords.add(record); + } + + @Override + protected List flushHudiRecords() { + return null; + } + } + + private static HoodieRecord convertToHoodieRecords(IndexedRecord iRecord, String key, String partitionPath) { + return new HoodieRecord<>(new HoodieKey(key, partitionPath), + new HoodieAvroPayload(Option.of((GenericRecord) iRecord))); + } + + private enum TestInputFormats { + AVRO, + JSON_STRING + } + + static class TestKeyGenerator extends KeyGenerator { + + protected TestKeyGenerator(TypedProperties config) { + super(config); + } + + @Override + public HoodieKey getKey(GenericRecord record) { + return new HoodieKey(record.get(RECORD_KEY_INDEX).toString(), "000/00/00"); + } + } + + static class TestSchemaProvider extends SchemaProvider { + + @Override + public Schema getSourceSchema() { + try { + return SchemaTestUtil.getSimpleSchema(); + } catch (IOException exception) { + throw new HoodieException("Fatal error parsing schema", exception); + } + } + } +} diff --git a/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestBufferedConnectWriter.java b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestBufferedConnectWriter.java new file mode 100644 index 000000000..d1813e1a6 --- /dev/null +++ b/hudi-kafka-connect/src/test/java/org/apache/hudi/writers/TestBufferedConnectWriter.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.writers; + +import org.apache.hudi.client.HoodieJavaWriteClient; +import org.apache.hudi.client.common.HoodieJavaEngineContext; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.testutils.HoodieTestDataGenerator; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.connect.writers.BufferedConnectWriter; +import org.apache.hudi.connect.writers.KafkaConnectConfigs; +import org.apache.hudi.schema.SchemaProvider; + +import org.apache.hadoop.conf.Configuration; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.ArgumentCaptor; +import org.mockito.Mockito; + +import java.util.Comparator; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.anyList; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; + +public class TestBufferedConnectWriter { + + private static final int NUM_RECORDS = 10; + private static final String COMMIT_TIME = "101"; + + private HoodieJavaWriteClient mockHoodieJavaWriteClient; + private HoodieJavaEngineContext javaEngineContext; + private KafkaConnectConfigs configs; + private HoodieWriteConfig writeConfig; + private SchemaProvider schemaProvider; + + @BeforeEach + public void setUp() throws Exception { + mockHoodieJavaWriteClient = mock(HoodieJavaWriteClient.class); + Configuration hadoopConf = new Configuration(); + javaEngineContext = new HoodieJavaEngineContext(hadoopConf); + configs = KafkaConnectConfigs.newBuilder().build(); + schemaProvider = new TestAbstractConnectWriter.TestSchemaProvider(); + writeConfig = HoodieWriteConfig.newBuilder() + .withPath("/tmp") + .withSchema(schemaProvider.getSourceSchema().toString()) + .build(); + } + + @Test + public void testSimpleWriteAndFlush() throws Exception { + String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0]; + HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] {partitionPath}); + List records = dataGen.generateInserts(COMMIT_TIME, NUM_RECORDS); + + BufferedConnectWriter writer = new BufferedConnectWriter( + javaEngineContext, + mockHoodieJavaWriteClient, + COMMIT_TIME, + configs, + writeConfig, + null, + schemaProvider); + + for (int i = 0; i < NUM_RECORDS; i++) { + writer.writeHudiRecord(records.get(i)); + } + Mockito.verify(mockHoodieJavaWriteClient, times(0)) + .bulkInsertPreppedRecords(anyList(), eq(COMMIT_TIME), eq(Option.empty())); + + writer.flushHudiRecords(); + final ArgumentCaptor> actualRecords = ArgumentCaptor.forClass(List.class); + Mockito.verify(mockHoodieJavaWriteClient, times(1)) + .bulkInsertPreppedRecords(actualRecords.capture(), eq(COMMIT_TIME), eq(Option.empty())); + + actualRecords.getValue().sort(Comparator.comparing(HoodieRecord::getRecordKey)); + 
records.sort(Comparator.comparing(HoodieRecord::getRecordKey)); + + assertEquals(records, actualRecords.getValue()); + } +} diff --git a/hudi-kafka-connect/src/test/resources/log4j-surefire.properties b/hudi-kafka-connect/src/test/resources/log4j-surefire.properties new file mode 100644 index 000000000..9ee04e1a3 --- /dev/null +++ b/hudi-kafka-connect/src/test/resources/log4j-surefire.properties @@ -0,0 +1,32 @@ +### +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +### +log4j.rootLogger=WARN, CONSOLE +log4j.logger.org.apache=INFO +log4j.logger.org.apache.hudi=DEBUG +log4j.logger.org.apache.hadoop.hbase=ERROR + +# CONSOLE is a ConsoleAppender. +log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender +# CONSOLE uses PatternLayout. +log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout +log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n +log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter +log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true +log4j.appender.CONSOLE.filter.a.LevelMin=WARN +log4j.appender.CONSOLE.filter.a.LevelMax=FATAL + diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java index 88d1d8c8a..53d68c323 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/deltastreamer/DeltaSync.java @@ -352,6 +352,7 @@ public class DeltaSync implements Serializable { } } } else { + // Initialize the table for the first time. String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props); HoodieTableMetaClient.withPropertyBuilder() .setTableType(cfg.tableType) diff --git a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaProvider.java b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaProvider.java index bcbdbf049..2410798d3 100644 --- a/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaProvider.java +++ b/hudi-utilities/src/main/java/org/apache/hudi/utilities/schema/SchemaProvider.java @@ -29,7 +29,8 @@ import org.apache.spark.api.java.JavaSparkContext; import java.io.Serializable; /** - * Class to provide schema for reading data and also writing into a Hoodie table. + * Class to provide schema for reading data and also writing into a Hoodie table, + * used by DeltaStreamer (which runs on Spark).
*/ @PublicAPIClass(maturity = ApiMaturityLevel.STABLE) public abstract class SchemaProvider implements Serializable { diff --git a/packaging/hudi-kafka-connect-bundle/pom.xml b/packaging/hudi-kafka-connect-bundle/pom.xml new file mode 100644 index 000000000..14bc4e4c8 --- /dev/null +++ b/packaging/hudi-kafka-connect-bundle/pom.xml @@ -0,0 +1,186 @@ + + + + + hudi + org.apache.hudi + 0.10.0-SNAPSHOT + ../../pom.xml + + 4.0.0 + hudi-kafka-connect-bundle + jar + + + true + ${project.parent.basedir} + + + + + + org.apache.rat + apache-rat-plugin + + + org.apache.maven.plugins + maven-shade-plugin + ${maven-shade-plugin.version} + + + package + + shade + + + ${shadeSources} + ${project.build.directory}/dependency-reduced-pom.xml + + + + + + true + + + META-INF/LICENSE + target/classes/META-INF/LICENSE + + + + + com.amazonaws.* + org.apache.zookeeper:zookeeper + com.fasterxml.jackson.core:jackson-annotations + commons-httpclient:commons-httpclient + org.apache.htrace:htrace-core + org.jamon:jamon-runtime + jdk.tools:jdk.tools + junit:junit + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + META-INF/services/javax.* + + + + + + + + + + + + src/main/resources + + + src/test/resources + + + + + + + + org.apache.hudi + hudi-kafka-connect + ${project.version} + + + org.apache.hudi + hudi-java-client + ${project.version} + + + org.apache.hudi + hudi-utilities_${scala.binary.version} + ${project.version} + + + org.apache.hudi + hudi-common + ${project.version} + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + + + org.apache.avro + avro + ${avro.version} + compile + + + org.apache.parquet + parquet-avro + compile + + + + + org.apache.hadoop + hadoop-common + compile + + + org.mortbay.jetty + * + + + javax.servlet.jsp + * + + + javax.servlet + * + + + + + org.apache.hadoop + hadoop-auth + ${hadoop.version} + compile + + + + diff --git a/packaging/hudi-kafka-connect-bundle/src/main/java/org/apache/hudi/kafka/connect/bundle/Main.java b/packaging/hudi-kafka-connect-bundle/src/main/java/org/apache/hudi/kafka/connect/bundle/Main.java new file mode 100644 index 000000000..3b86e5409 --- /dev/null +++ b/packaging/hudi-kafka-connect-bundle/src/main/java/org/apache/hudi/kafka/connect/bundle/Main.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.kafka.connect.bundle; + +import org.apache.hudi.common.util.ReflectionUtils; + +/** + * A simple main class to dump all classes loaded in current classpath + * + * This is a workaround for generating sources and javadoc jars for packaging modules. The maven plugins for generating + * javadoc and sources plugins do not generate corresponding jars if there are no source files. 
+ * + * This class does not have anything to do with Hudi but is there to keep mvn javadocs/source plugin happy. + */ +public class Main { + + public static void main(String[] args) { + ReflectionUtils.getTopLevelClassesInClasspath(Main.class).forEach(System.out::println); + } +} diff --git a/pom.xml b/pom.xml index 65e391ab1..a1beac06a 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,9 @@ <module>packaging/hudi-integ-test-bundle</module> <module>hudi-examples</module> <module>hudi-flink</module> + <module>hudi-kafka-connect</module> <module>packaging/hudi-flink-bundle</module> + <module>packaging/hudi-kafka-connect-bundle</module>
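Note: the TestSchemaProvider in the tests above illustrates the minimal contract of org.apache.hudi.schema.SchemaProvider used by the Kafka Connect writer, namely implementing getSourceSchema(). The sketch below follows that same pattern; the class name and the hard-coded Avro schema are illustrative assumptions only and are not part of this patch.

// Illustrative sketch, not part of the patch: a SchemaProvider that serves an
// inline Avro schema, modeled on the TestSchemaProvider shown in the tests above.
package org.apache.hudi.schema;

import org.apache.avro.Schema;

public class InlineSchemaProvider extends SchemaProvider {

  // A minimal Avro record schema with a single string field (illustrative only).
  private static final String SCHEMA_JSON =
      "{\"type\":\"record\",\"name\":\"SimpleRecord\",\"fields\":"
          + "[{\"name\":\"_row_key\",\"type\":\"string\"}]}";

  @Override
  public Schema getSourceSchema() {
    // Parse the inline schema on each call; callers may cache the result if needed.
    return new Schema.Parser().parse(SCHEMA_JSON);
  }
}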