[HUDI-2394] Implement Kafka Sink Protocol for Hudi for Ingesting Immutable Data (#3592)

- Fixing packaging, naming of classes
- Use of log4j over slf4j for uniformity
- More follow-on fixes
- Added a version to control/coordinator events
- Eliminated the config added to write config
- Fixed fetching of checkpoints based on table type
- Clean up of naming, code placement

Co-authored-by: Rajesh Mahindra <rmahindra@Rajeshs-MacBook-Pro.local>
Co-authored-by: Vinoth Chandar <vinoth@apache.org>
Author: rmahindra123
Date: 2021-09-10 18:20:26 -07:00
Committed by: GitHub
Parent: bd1d2d4952
Commit: e528dd798a
51 changed files with 4710 additions and 22 deletions

.gitignore

@@ -78,4 +78,4 @@ dependency-reduced-pom.xml
#######################################
hudi-integ-test/compose_env
node_modules
package-lock.json
package-lock.json


@@ -46,6 +46,7 @@ import org.apache.hudi.keygen.SimpleAvroKeyGenerator;
import org.apache.hudi.keygen.constant.KeyGeneratorType;
import org.apache.hudi.metrics.MetricsReporterType;
import org.apache.hudi.metrics.datadog.DatadogHttpClient.ApiSite;
import org.apache.hudi.table.RandomFileIdPrefixProvider;
import org.apache.hudi.table.action.compact.CompactionTriggerStrategy;
import org.apache.hudi.table.action.compact.strategy.CompactionStrategy;
@@ -413,6 +414,12 @@ public class HoodieWriteConfig extends HoodieConfig {
.withDocumentation("Whether to include '_hoodie_operation' in the metadata fields. "
+ "Once enabled, all the changes of a record are persisted to the delta log directly without merge");
public static final ConfigProperty<String> FILEID_PREFIX_PROVIDER_CLASS = ConfigProperty
.key("hoodie.fileid.prefix.provider.class")
.defaultValue(RandomFileIdPrefixProvider.class.getName())
.sinceVersion("0.10.0")
.withDocumentation("File Id Prefix provider class, that implements `org.apache.hudi.fileid.FileIdPrefixProvider`");
private ConsistencyGuardConfig consistencyGuardConfig;
// Hoodie Write Client transparently rewrites File System View config when embedded mode is enabled
@@ -1748,6 +1755,10 @@ public class HoodieWriteConfig extends HoodieConfig {
return getBooleanOrDefault(ALLOW_OPERATION_METADATA_FIELD);
}
public String getFileIdPrefixProviderClassName() {
return getString(FILEID_PREFIX_PROVIDER_CLASS);
}
public static class Builder {
protected final HoodieWriteConfig writeConfig = new HoodieWriteConfig();
@@ -2079,6 +2090,11 @@ public class HoodieWriteConfig extends HoodieConfig {
return this;
}
public Builder withFileIdPrefixProviderClassName(String fileIdPrefixProviderClassName) {
writeConfig.setValue(FILEID_PREFIX_PROVIDER_CLASS, fileIdPrefixProviderClassName);
return this;
}
public Builder withProperties(Properties properties) {
this.writeConfig.getProps().putAll(properties);
return this;

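For context, a minimal sketch of how the new option is wired through the builder; the path and table name below are placeholders, not part of this diff:

```java
// Hypothetical usage of the new FILEID_PREFIX_PROVIDER_CLASS option; the
// path and table name are illustrative only.
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
    .withPath("file:///tmp/hoodie/sample-table")
    .forTable("sample_table")
    .withFileIdPrefixProviderClassName(RandomFileIdPrefixProvider.class.getName())
    .build();
```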

@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table;
import java.util.Properties;
public abstract class FileIdPrefixProvider {
private final Properties props;
public FileIdPrefixProvider(Properties props) {
this.props = props;
}
public Properties getProps() {
return props;
}
public abstract String createFilePrefix(String partitionPath);
}


@@ -0,0 +1,35 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.table;
import org.apache.hudi.common.fs.FSUtils;
import java.util.Properties;
public class RandomFileIdPrefixProvider extends FileIdPrefixProvider {
public RandomFileIdPrefixProvider(Properties props) {
super(props);
}
@Override
public String createFilePrefix(String partitionPath) {
return FSUtils.createNewFileIdPfx();
}
}


@@ -18,8 +18,6 @@
package org.apache.hudi.client;
import com.codahale.metrics.Timer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.client.common.HoodieJavaEngineContext;
import org.apache.hudi.client.embedded.EmbeddedTimelineService;
import org.apache.hudi.common.engine.HoodieEngineContext;
@@ -30,6 +28,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
@@ -41,6 +40,9 @@ import org.apache.hudi.table.HoodieJavaTable;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
import com.codahale.metrics.Timer;
import org.apache.hadoop.conf.Configuration;
import java.io.IOException;
import java.util.List;
import java.util.Map;
@@ -153,11 +155,23 @@ public class HoodieJavaWriteClient<T extends HoodieRecordPayload> extends
throw new HoodieNotSupportedException("BulkInsert is not supported in HoodieJavaClient");
}
public void transitionInflight(String instantTime) {
HoodieTableMetaClient metaClient = createMetaClient(true);
metaClient.getActiveTimeline().transitionRequestedToInflight(
new HoodieInstant(HoodieInstant.State.REQUESTED, metaClient.getCommitActionType(), instantTime),
Option.empty(), config.shouldAllowMultiWriteOnSameInstant());
}
@Override
public List<WriteStatus> bulkInsertPreppedRecords(List<HoodieRecord<T>> preppedRecords,
String instantTime,
Option<BulkInsertPartitioner<List<HoodieRecord<T>>>> bulkInsertPartitioner) {
throw new HoodieNotSupportedException("BulkInsertPreppedRecords is not supported in HoodieJavaClient");
HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table =
getTableAndInitCtx(WriteOperationType.BULK_INSERT_PREPPED, instantTime);
table.validateInsertSchema();
preWrite(instantTime, WriteOperationType.BULK_INSERT_PREPPED, table.getMetaClient());
HoodieWriteMetadata<List<WriteStatus>> result = table.bulkInsertPrepped(context, instantTime, preppedRecords, bulkInsertPartitioner);
return postWrite(result, instantTime, table);
}
@Override

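A hedged sketch of the newly supported bulk-insert-prepped path on the Java client; `hadoopConf`, `writeConfig`, and `preppedRecords` are placeholders assumed to be in hand:

```java
// Sketch only: exercising the Java-client APIs added above.
HoodieJavaWriteClient<HoodieAvroPayload> client =
    new HoodieJavaWriteClient<>(new HoodieJavaEngineContext(hadoopConf), writeConfig);
String instantTime = client.startCommit();
// transitionInflight() is for callers (like the Kafka Connect writers) that
// manage the timeline explicitly; bulkInsertPrepped transitions it otherwise.
List<WriteStatus> statuses =
    client.bulkInsertPreppedRecords(preppedRecords, instantTime, Option.empty());
client.commit(instantTime, statuses);
```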

@@ -19,17 +19,18 @@
package org.apache.hudi.table.action.commit;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.execution.JavaLazyInsertIterable;
import org.apache.hudi.execution.bulkinsert.JavaBulkInsertInternalPartitionerFactory;
import org.apache.hudi.io.CreateHandleFactory;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.hudi.table.FileIdPrefixProvider;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.action.HoodieWriteMetadata;
@@ -66,10 +67,14 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Abst
final Option<BulkInsertPartitioner<T>> userDefinedBulkInsertPartitioner) {
HoodieWriteMetadata result = new HoodieWriteMetadata();
//transition bulk_insert state to inflight
table.getActiveTimeline().transitionRequestedToInflight(new HoodieInstant(HoodieInstant.State.REQUESTED,
table.getMetaClient().getCommitActionType(), instantTime), Option.empty(),
config.shouldAllowMultiWriteOnSameInstant());
// It's possible the transition to inflight could have already happened.
if (!table.getActiveTimeline().filterInflights().containsInstant(instantTime)) {
table.getActiveTimeline().transitionRequestedToInflight(
new HoodieInstant(HoodieInstant.State.REQUESTED, table.getMetaClient().getCommitActionType(), instantTime),
Option.empty(),
config.shouldAllowMultiWriteOnSameInstant());
}
// write new files
List<WriteStatus> writeStatuses = bulkInsert(inputRecords, instantTime, table, config, performDedupe, userDefinedBulkInsertPartitioner, false, config.getBulkInsertShuffleParallelism(), false);
//update index
@@ -102,12 +107,16 @@ public class JavaBulkInsertHelper<T extends HoodieRecordPayload, R> extends Abst
: JavaBulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
repartitionedRecords = (List<HoodieRecord<T>>) partitioner.repartitionRecords(dedupedRecords, parallelism);
String idPfx = FSUtils.createNewFileIdPfx();
FileIdPrefixProvider fileIdPrefixProvider = (FileIdPrefixProvider) ReflectionUtils.loadClass(
config.getFileIdPrefixProviderClassName(),
config.getProps());
List<WriteStatus> writeStatuses = new ArrayList<>();
new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true, config, instantTime, table, idPfx,
table.getTaskContextSupplier(), new CreateHandleFactory<>()).forEachRemaining(writeStatuses::addAll);
new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true,
config, instantTime, table,
fileIdPrefixProvider.createFilePrefix(""), table.getTaskContextSupplier(),
new CreateHandleFactory<>()).forEachRemaining(writeStatuses::addAll);
return writeStatuses;
}

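The helper now resolves the provider reflectively, so the prefix policy becomes a config swap rather than a code change. A sketch of that lookup, with an illustrative partition id (the deterministic-prefix claim follows from the hash-based provider added later in this commit):

```java
// Illustrative: load the Kafka Connect provider the same way the helper does.
Properties props = new Properties();
props.setProperty(KafkaConnectFileIdPrefixProvider.KAFKA_CONNECT_PARTITION_ID, "3");
FileIdPrefixProvider provider = (FileIdPrefixProvider) ReflectionUtils.loadClass(
    KafkaConnectFileIdPrefixProvider.class.getName(), props);
// The same (kafka partition, partition path) input always yields the same
// file id prefix, which the connector relies on for deterministic file naming.
String prefix = provider.createFilePrefix("2021/09/10");
```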

@@ -30,7 +30,8 @@ public class ConfigGroups {
FLINK_SQL("Flink Sql Configs"),
WRITE_CLIENT("Write Client Configs"),
METRICS("Metrics Configs"),
RECORD_PAYLOAD("Record Payload Config");
RECORD_PAYLOAD("Record Payload Config"),
KAFKA_CONNECT("Kafka Connect Configs");
public final String name;
@@ -72,6 +73,9 @@ public class ConfigGroups {
description = "These set of configs are used to enable monitoring and reporting of key"
+ "Hudi stats and metrics.";
break;
case KAFKA_CONNECT:
description = "These set of configs are used for Kakfa Connect Sink Connector for writing Hudi Tables";
break;
default:
description = "Please fill in the description for Config Group Name: " + names.name;
break;


@@ -26,6 +26,7 @@ import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.exception.HoodieException;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;


@@ -71,17 +71,20 @@ public final class SchemaTestUtil {
return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit);
}
public static List<String> generateTestJsonRecords(int from, int limit) throws IOException, URISyntaxException {
Path dataPath = initializeSampleDataPath();
try (Stream<String> stream = Files.lines(dataPath)) {
return stream.skip(from).limit(limit).collect(Collectors.toList());
} catch (IOException e) {
throw new HoodieIOException("Could not read data from " + RESOURCE_SAMPLE_DATA, e);
}
}
private static List<IndexedRecord> toRecords(Schema writerSchema, Schema readerSchema, int from, int limit)
throws IOException, URISyntaxException {
GenericDatumReader<IndexedRecord> reader = new GenericDatumReader<>(writerSchema, readerSchema);
// Required to register the necessary JAR:// file system
URI resource = SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI();
Path dataPath;
if (resource.toString().contains("!")) {
dataPath = uriToPath(resource);
} else {
dataPath = Paths.get(SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI());
}
Path dataPath = initializeSampleDataPath();
try (Stream<String> stream = Files.lines(dataPath)) {
return stream.skip(from).limit(limit).map(s -> {
@@ -96,6 +99,21 @@ public final class SchemaTestUtil {
}
}
/**
* Required to register the necessary JAR:// file system.
* @return Path to the sample data in the resource file.
* @throws IOException
* @throws URISyntaxException
*/
private static Path initializeSampleDataPath() throws IOException, URISyntaxException {
URI resource = SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI();
if (resource.toString().contains("!")) {
return uriToPath(resource);
} else {
return Paths.get(SchemaTestUtil.class.getResource(RESOURCE_SAMPLE_DATA).toURI());
}
}
public static Path uriToPath(URI uri) throws IOException {
final Map<String, String> env = new HashMap<>();
final String[] array = uri.toString().split("!");


@@ -0,0 +1,94 @@
<!--
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
-->
# Quick Start Guide for Kafka Connect Sink for Hudi
This repo contains a sample project that can be used to start off your own sink connector for Kafka Connect.
## Building the connector
The first step is to build the connector. To do that, install the following dependencies:
- [Java 1.8+](https://openjdk.java.net/)
- [Apache Maven](https://maven.apache.org/)
After installing these dependencies, execute the following command:
```bash
cd $HUDI_DIR
mvn clean package
```
## Incremental Builds
```bash
mvn clean -pl hudi-kafka-connect install -DskipTests
mvn clean -pl packaging/hudi-kafka-connect-bundle install
```
## Put the Hudi connector on the Kafka Connect classpath
```bash
cp $HUDI_DIR/packaging/hudi-kafka-connect-bundle/target/hudi-kafka-connect-bundle-0.10.0-SNAPSHOT.jar /usr/local/share/java/hudi-kafka-connect/
```
## Trying the connector
After building the package, install Apache Kafka and set up the environment as follows.
### 1 - Starting the environment
Start ZooKeeper and Kafka:
```bash
./bin/zookeeper-server-start.sh ./config/zookeeper.properties
./bin/kafka-server-start.sh ./config/server.properties
```
Wait until the Kafka cluster is up and running.
### 2 - Create the Hudi Control Topic for Coordination of the transactions
The control topic must have exactly `1` partition:
```bash
./bin/kafka-topics.sh --delete --topic hudi-control-topic --bootstrap-server localhost:9092
./bin/kafka-topics.sh --create --topic hudi-control-topic --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092
```
### 3 - Create the Hudi Topic for the Sink and insert data into the topic
Open a terminal to execute the following command:
```bash
bash runKafkaTrafficGenerator.sh <total_messages>
```
### 4 - Run the Sink connector worker (multiple workers can be run)
Open a terminal to execute the following command:
```bash
./bin/connect-distributed.sh ../hudi-kafka-connect/configs/connect-distributed.properties
```
### 5 - Add the Hudi Sink to the Connect cluster (delete it first if you want to re-configure)
```bash
curl -X DELETE http://localhost:8083/connectors/hudi-sink
curl -X POST -H "Content-Type:application/json" -d @$HUDI_DIR/hudi-kafka-connect/configs/config-sink.json http://localhost:8083/connectors
```


@@ -0,0 +1,19 @@
{
"name": "hudi-sink",
"config": {
"bootstrap.servers": "localhost:9092",
"connector.class": "org.apache.hudi.connect.HoodieSinkConnector",
"tasks.max": "4",
"key.converter": "org.apache.kafka.connect.storage.StringConverter",
"value.converter": "org.apache.kafka.connect.storage.StringConverter",
"value.converter.schemas.enable": "false",
"topics": "hudi-test-topic",
"hoodie.table.name": "hudi-test-topic",
"hoodie.base.path": "file:///tmp/hoodie/sample-table",
"hoodie.datasource.write.recordkey.field": "volume",
"hoodie.datasource.write.partitionpath.field": "year",
"hoodie.schemaprovider.class": "org.apache.hudi.schema.FilebasedSchemaProvider",
"hoodie.deltastreamer.schemaprovider.source.schema.file": "file:///tmp/hoodie/schema.avsc",
"hoodie.deltastreamer.schemaprovider.target.schema.file": "file:///tmp/hoodie/schema.avsc"
}
}


@@ -0,0 +1,33 @@
##
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##
bootstrap.servers=localhost:9092
group.id=hudi-connect-cluster
key.converter=org.apache.kafka.connect.json.JsonConverter
value.converter=org.apache.kafka.connect.json.JsonConverter
key.converter.schemas.enable=true
value.converter.schemas.enable=true
offset.storage.topic=connect-offsets
offset.storage.replication.factor=1
config.storage.topic=connect-configs
config.storage.replication.factor=1
status.storage.topic=connect-status
status.storage.replication.factor=1
offset.flush.interval.ms=60000
listeners=HTTP://:8083
plugin.path=/usr/local/share/java,/usr/local/share/kafka/plugins,/opt/connectors,

hudi-kafka-connect/pom.xml

@@ -0,0 +1,231 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hudi</artifactId>
<groupId>org.apache.hudi</groupId>
<version>0.10.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hudi-kafka-connect</artifactId>
<description>Kafka Connect Sink Connector for Hudi</description>
<version>0.10.0-SNAPSHOT</version>
<packaging>jar</packaging>
<properties>
<main.basedir>${project.parent.basedir}</main.basedir>
<connect.api.version>2.5.0</connect.api.version>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.1.2</version>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>
</plugins>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
<resource>
<directory>src/test/resources</directory>
</resource>
</resources>
</build>
<dependencies>
<!-- Kafka Connect -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>connect-api</artifactId>
<version>${connect.api.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>connect-json</artifactId>
<version>${connect.api.version}</version>
<scope>provided</scope>
</dependency>
<!-- Hudi Dependencies -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-java-client</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client-common</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- NOTE: This is temp (SchemaProvider dep) until PR3162 lands -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-flink_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-core</artifactId>
<version>1.12.1</version>
<exclusions>
<exclusion>
<groupId>com.esotericsoftware.kryo</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Logging -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</dependency>
<!-- Fasterxml -->
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.dataformat</groupId>
<artifactId>jackson-dataformat-csv</artifactId>
<version>${fasterxml.version}</version>
</dependency>
<!-- Parquet -->
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<version>${avro.version}</version>
</dependency>
<!-- Hadoop -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- Hudi - Test -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<!-- Tests Misc -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-runner</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-suite-api</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.platform</groupId>
<artifactId>junit-platform-commons</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>


@@ -0,0 +1,5 @@
{"volume": 0, "symbol": "TPNL", "ts": "2017-08-31 09:30:00", "month": "08", "high": 6.37, "low": 1.37, "key": "TPNL_2017-08-31 09", "year": 2017, "date": "2017/08/31", "close": 4.44, "open": 1.37, "day": "31"}
{"volume": 0, "symbol": "SPOT", "ts": "2018-08-31 09:30:00", "month": "08", "high": 1.87, "low": 0.37, "key": "TPNL_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 1.44, "open": 1.77, "day": "31"}
{"volume": 0, "symbol": "GOOG", "ts": "2019-08-31 09:30:00", "month": "08", "high": 2.1, "low": 1.7, "key": "TPNL_2019-08-31 09", "year": 2019, "date": "2019/08/31", "close": 1.94, "open": 2.0, "day": "31"}
{"volume": 0, "symbol": "MSFT", "ts": "2020-08-31 09:30:00", "month": "08", "high": 3.33, "low": 0.87, "key": "TPNL_2020-08-31 09", "year": 2020, "date": "2020/08/31", "close": 3.33, "open": 3.1, "day": "31"}
{"volume": 0, "symbol": "APPL", "ts": "2021-08-31 09:30:00", "month": "08", "high": 3.17, "low": 2.37, "key": "TPNL_2021-08-31 09", "year": 2021, "date": "2021/08/31", "close": 2.66, "open": 3.1, "day": "31"}


@@ -0,0 +1,38 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# First delete the existing topic
$KAFKA_HOME/bin/kafka-topics.sh --delete --topic hudi-test-topic --bootstrap-server localhost:9092
# Create the topic with 4 partitions
$KAFKA_HOME/bin/kafka-topics.sh --create --topic hudi-test-topic --partitions 4 --replication-factor 1 --bootstrap-server localhost:9092
# Generate kafka messages from raw records
inputFile="raw.json"
# Generate the records with unique keys
for ((recordKey=0; recordKey<=$1; ))
do
while IFS= read -r line
do
echo "$line" | jq --argjson recordKey $recordKey -c '.volume = $recordKey' | kcat -P -b localhost:9092 -t hudi-test-topic
((recordKey++))
if [ $(( $recordKey % 1000 )) -eq 0 ]
then sleep 1
fi
done < "$inputFile"
done


@@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect;
import org.apache.kafka.common.config.ConfigDef;
import org.apache.kafka.connect.connector.Task;
import org.apache.kafka.connect.sink.SinkConnector;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* HoodieSinkConnector is a Kafka Connect connector implementation
* that ingests data from Kafka into Hudi tables.
*/
public class HoodieSinkConnector extends SinkConnector {
public static final String VERSION = "0.1.0";
private static final Logger LOG = LogManager.getLogger(HoodieSinkConnector.class);
private Map<String, String> configProps;
/**
* No-arg constructor. It is instantiated by Connect framework.
*/
public HoodieSinkConnector() {
}
@Override
public String version() {
return VERSION;
}
@Override
public void start(Map<String, String> props) {
configProps = new HashMap<>(props);
}
@Override
public Class<? extends Task> taskClass() {
return HoodieSinkTask.class;
}
@Override
public List<Map<String, String>> taskConfigs(int maxTasks) {
Map<String, String> taskProps = new HashMap<>(configProps);
List<Map<String, String>> taskConfigs = new ArrayList<>(maxTasks);
for (int i = 0; i < maxTasks; ++i) {
taskConfigs.add(taskProps);
}
return taskConfigs;
}
@Override
public void stop() {
LOG.info(String.format("Shutting down Hudi Sink connector %s", configProps.get("name")));
}
@Override
public ConfigDef config() {
// we use Hudi configs instead
return new ConfigDef();
}
}


@@ -0,0 +1,212 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect;
import org.apache.hudi.connect.kafka.KafkaConnectControlAgent;
import org.apache.hudi.connect.transaction.ConnectTransactionCoordinator;
import org.apache.hudi.connect.transaction.ConnectTransactionParticipant;
import org.apache.hudi.connect.transaction.TransactionCoordinator;
import org.apache.hudi.connect.transaction.TransactionParticipant;
import org.apache.hudi.connect.writers.KafkaConnectConfigs;
import org.apache.hudi.exception.HoodieException;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.config.ConfigException;
import org.apache.kafka.connect.errors.ConnectException;
import org.apache.kafka.connect.sink.SinkRecord;
import org.apache.kafka.connect.sink.SinkTask;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
/**
* Implementation of the {@link SinkTask} interface provided by
* Kafka Connect. Implements methods to receive the Kafka records
* from the assigned partitions and commit the Kafka offsets.
* Also, handles re-assignments of partitions.
*/
public class HoodieSinkTask extends SinkTask {
public static final String TASK_ID_CONFIG_NAME = "task.id";
private static final Logger LOG = LogManager.getLogger(HoodieSinkTask.class);
private static final int COORDINATOR_KAFKA_PARTITION = 0;
private final Map<TopicPartition, TransactionCoordinator> transactionCoordinators;
private final Map<TopicPartition, TransactionParticipant> hudiTransactionParticipants;
private KafkaConnectControlAgent controlKafkaClient;
private KafkaConnectConfigs connectConfigs;
private String taskId;
private String connectorName;
public HoodieSinkTask() {
transactionCoordinators = new HashMap<>();
hudiTransactionParticipants = new HashMap<>();
}
@Override
public String version() {
return HoodieSinkConnector.VERSION;
}
@Override
public void start(Map<String, String> props) {
connectorName = props.get("name");
taskId = props.get(TASK_ID_CONFIG_NAME);
LOG.info(String.format("Starting Hudi Sink Task for %s connector %s with id %s with assignments %s",
props, connectorName, taskId, context.assignment()));
try {
connectConfigs = KafkaConnectConfigs.newBuilder().withProperties(props).build();
controlKafkaClient = KafkaConnectControlAgent.createKafkaControlManager(
connectConfigs.getBootstrapServers(),
connectConfigs.getControlTopicName());
bootstrap(context.assignment());
} catch (ConfigException e) {
throw new ConnectException("Couldn't start HdfsSinkConnector due to configuration error.", e);
} catch (ConnectException e) {
LOG.error("Couldn't start HudiSinkConnector:", e);
LOG.info("Shutting down HudiSinkConnector.");
cleanup();
// Always throw the original exception that prevent us from starting
throw e;
}
}
@Override
public void put(Collection<SinkRecord> records) {
for (SinkRecord record : records) {
String topic = record.topic();
int partition = record.kafkaPartition();
TopicPartition tp = new TopicPartition(topic, partition);
hudiTransactionParticipants.get(tp).buffer(record);
}
for (TopicPartition partition : context.assignment()) {
hudiTransactionParticipants.get(partition).processRecords();
}
}
@Override
public void stop() {
cleanup();
}
@Override
public void flush(Map<TopicPartition, OffsetAndMetadata> currentOffsets) {
// No-op. The connector is managing the offsets.
}
@Override
public Map<TopicPartition, OffsetAndMetadata> preCommit(Map<TopicPartition, OffsetAndMetadata> currentOffsets) {
// Although the connector manages offsets via commit files in Hudi, we still want to have Connect
// commit the consumer offsets for records this task has consumed from its topic partitions and
// committed to Hudi.
Map<TopicPartition, OffsetAndMetadata> result = new HashMap<>();
for (TopicPartition partition : context.assignment()) {
TransactionParticipant worker = hudiTransactionParticipants.get(partition);
if (worker != null) {
worker.processRecords();
if (worker.getLastKafkaCommittedOffset() >= 0) {
result.put(partition, new OffsetAndMetadata(worker.getLastKafkaCommittedOffset()));
}
}
}
return result;
}
@Override
public void open(Collection<TopicPartition> partitions) {
LOG.info("New partitions added " + partitions.toString());
bootstrap(partitions);
}
@Override
public void close(Collection<TopicPartition> partitions) {
LOG.info("Existing partitions deleted " + partitions.toString());
// Close any writers we have. We may get assigned the same partitions and end up duplicating
// some effort since we'll have to reprocess those messages. It may be possible to hold on to
// the TopicPartitionWriter and continue to use the temp file, but this can get significantly
// more complex due to potential failures and network partitions. For example, we may get
// this close, then miss a few generations of group membership, during which
// data may have continued to be processed and we'd have to restart from the recovery stage,
// make sure we apply the WAL, and only reuse the temp file if the starting offset is still
// valid. For now, we prefer the simpler solution that may result in a bit of wasted effort.
for (TopicPartition partition : partitions) {
if (partition.partition() == COORDINATOR_KAFKA_PARTITION) {
if (transactionCoordinators.containsKey(partition)) {
transactionCoordinators.get(partition).stop();
transactionCoordinators.remove(partition);
}
}
TransactionParticipant worker = hudiTransactionParticipants.remove(partition);
if (worker != null) {
try {
LOG.debug("Closing data writer due to task start failure.");
worker.stop();
} catch (Throwable t) {
LOG.debug(String.format("Error closing and stopping data writer: %s", t.getMessage()), t);
}
}
}
}
private void bootstrap(Collection<TopicPartition> partitions) {
LOG.info(String.format("Bootstrap task for connector %s with id %s with assignments %s part %s",
connectorName, taskId, context.assignment(), partitions));
for (TopicPartition partition : partitions) {
try {
// If the partition is 0, instantiate the Leader
if (partition.partition() == COORDINATOR_KAFKA_PARTITION) {
ConnectTransactionCoordinator coordinator = new ConnectTransactionCoordinator(
connectConfigs,
partition,
controlKafkaClient);
coordinator.start();
transactionCoordinators.put(partition, coordinator);
}
ConnectTransactionParticipant worker = new ConnectTransactionParticipant(connectConfigs, partition, controlKafkaClient, context);
hudiTransactionParticipants.put(partition, worker);
worker.start();
} catch (HoodieException exception) {
LOG.error(String.format("Fatal error initializing task %s for partition %s", taskId, partition.partition()), exception);
}
}
}
private void cleanup() {
for (TopicPartition partition : context.assignment()) {
TransactionParticipant worker = hudiTransactionParticipants.get(partition);
if (worker != null) {
try {
LOG.debug("Closing data writer due to task start failure.");
worker.stop();
} catch (Throwable t) {
LOG.debug("Error closing and stopping data writer", t);
}
}
}
hudiTransactionParticipants.clear();
transactionCoordinators.forEach((topic, transactionCoordinator) -> transactionCoordinator.stop());
transactionCoordinators.clear();
}
}

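One detail worth calling out from `bootstrap()`: coordinator placement is derived purely from partition assignment. A toy restatement of the rule, assuming `COORDINATOR_KAFKA_PARTITION` is `0` as above:

```java
import org.apache.kafka.common.TopicPartition;

// Toy restatement of the election rule in bootstrap(): whichever task owns
// partition 0 of a topic also runs that topic's transaction coordinator.
final class CoordinatorElection {
  static boolean runsCoordinator(TopicPartition tp) {
    return tp.partition() == 0; // COORDINATOR_KAFKA_PARTITION
  }

  public static void main(String[] args) {
    System.out.println(runsCoordinator(new TopicPartition("hudi-test-topic", 0))); // true
    System.out.println(runsCoordinator(new TopicPartition("hudi-test-topic", 3))); // false
  }
}
```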

@@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.table.FileIdPrefixProvider;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Objects;
import java.util.Properties;
public class KafkaConnectFileIdPrefixProvider extends FileIdPrefixProvider {
public static final String KAFKA_CONNECT_PARTITION_ID = "hudi.kafka.connect.partition";
private static final Logger LOG = LogManager.getLogger(KafkaConnectFileIdPrefixProvider.class);
private final String kafkaPartition;
public KafkaConnectFileIdPrefixProvider(Properties props) {
super(props);
if (!props.containsKey(KAFKA_CONNECT_PARTITION_ID)) {
LOG.error("Fatal error due to Kafka Connect Partition Id is not set");
throw new HoodieException("Kafka Connect Partition Key " + KAFKA_CONNECT_PARTITION_ID + " not provided");
}
this.kafkaPartition = props.getProperty(KAFKA_CONNECT_PARTITION_ID);
}
@Override
public String createFilePrefix(String partitionPath) {
// We use a combination of kafka partition and partition path as the file id, and then hash it
// to generate a fixed sized hash.
String rawFileIdPrefix = kafkaPartition + partitionPath;
MessageDigest md;
try {
md = MessageDigest.getInstance("MD5");
} catch (NoSuchAlgorithmException e) {
LOG.error("Fatal error selecting hash algorithm", e);
throw new HoodieException(e);
}
byte[] digest = Objects.requireNonNull(md).digest(rawFileIdPrefix.getBytes(StandardCharsets.UTF_8));
LOG.info("CreateFileId for Kafka Partition " + kafkaPartition + " : " + partitionPath + " = " + rawFileIdPrefix
+ " === " + StringUtils.toHexString(digest).toUpperCase());
return StringUtils.toHexString(digest).toUpperCase();
}
}

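A standalone sketch of the derivation above: MD5 over the concatenation of the Kafka partition id and the Hudi partition path, rendered as upper-case hex (plain JDK formatting stands in for `StringUtils.toHexString`; the inputs are illustrative):

```java
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;

// Standalone demo of the file-id prefix derivation used by the provider.
public class FileIdPrefixDemo {
  public static void main(String[] args) throws Exception {
    String rawFileIdPrefix = "3" + "2021/09/10"; // kafka partition 3, sample path
    byte[] digest = MessageDigest.getInstance("MD5")
        .digest(rawFileIdPrefix.getBytes(StandardCharsets.UTF_8));
    StringBuilder hex = new StringBuilder();
    for (byte b : digest) {
      hex.append(String.format("%02X", b)); // upper-case hex, like toHexString().toUpperCase()
    }
    System.out.println(hex); // stable for a given (partition, path) pair
  }
}
```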

@@ -0,0 +1,230 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.kafka;
import org.apache.hudi.connect.transaction.ControlEvent;
import org.apache.hudi.connect.transaction.TransactionCoordinator;
import org.apache.hudi.connect.transaction.TransactionParticipant;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.kafka.clients.consumer.CommitFailedException;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.Deserializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.time.Duration;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
/**
* Class that manages the Kafka consumer and producer for
* the Kafka Control Topic that ensures coordination across the
* {@link TransactionCoordinator} and {@link TransactionParticipant}s.
* Use a single instance per worker (single-threaded),
* and register multiple tasks that can receive the control messages.
*/
public class KafkaConnectControlAgent implements KafkaControlAgent {
private static final Logger LOG = LogManager.getLogger(KafkaConnectControlAgent.class);
private static final Object LOCK = new Object();
private static final long KAFKA_POLL_TIMEOUT_MS = 100;
private static final int EXEC_SHUTDOWN_TIMEOUT_MS = 5000;
private static volatile KafkaConnectControlAgent agent;
private final String bootstrapServers;
private final String controlTopicName;
private final ExecutorService executorService;
private final Map<String, TransactionCoordinator> topicCoordinators;
// List of TransactionParticipants per Kafka Topic
private final Map<String, ConcurrentLinkedQueue<TransactionParticipant>> partitionWorkers;
private final KafkaControlProducer producer;
private KafkaConsumer<String, ControlEvent> consumer;
public KafkaConnectControlAgent(String bootstrapServers,
String controlTopicName) {
this.bootstrapServers = bootstrapServers;
this.controlTopicName = controlTopicName;
this.executorService = Executors.newSingleThreadExecutor();
this.topicCoordinators = new HashMap<>();
this.partitionWorkers = new HashMap<>();
this.producer = new KafkaControlProducer(bootstrapServers, controlTopicName);
start();
}
public static KafkaConnectControlAgent createKafkaControlManager(String bootstrapServers,
String controlTopicName) {
if (agent == null) {
synchronized (LOCK) {
if (agent == null) {
agent = new KafkaConnectControlAgent(bootstrapServers, controlTopicName);
}
}
}
return agent;
}
@Override
public void registerTransactionParticipant(TransactionParticipant worker) {
if (!partitionWorkers.containsKey(worker.getPartition().topic())) {
partitionWorkers.put(worker.getPartition().topic(), new ConcurrentLinkedQueue<>());
}
partitionWorkers.get(worker.getPartition().topic()).add(worker);
}
@Override
public void deregisterTransactionParticipant(TransactionParticipant worker) {
if (partitionWorkers.containsKey(worker.getPartition().topic())) {
partitionWorkers.get(worker.getPartition().topic()).remove(worker);
}
}
@Override
public void registerTransactionCoordinator(TransactionCoordinator coordinator) {
if (!topicCoordinators.containsKey(coordinator.getPartition().topic())) {
topicCoordinators.put(coordinator.getPartition().topic(), coordinator);
}
}
public void deregisterTransactionCoordinator(TransactionCoordinator coordinator) {
topicCoordinators.remove(coordinator.getPartition().topic());
}
@Override
public void publishMessage(ControlEvent message) {
producer.publishMessage(message);
}
private void start() {
Properties props = new Properties();
props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
// TODO: fetch the worker id or name instead of a UUID.
props.put(ConsumerConfig.GROUP_ID_CONFIG, "hudi-control-group" + UUID.randomUUID().toString());
props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, KafkaJsonDeserializer.class);
// Since we are using Kafka Control Topic as a RPC like interface,
// we want consumers to only process messages that are sent after they come online
props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");
consumer = new KafkaConsumer<>(props, new StringDeserializer(),
new KafkaJsonDeserializer<>(ControlEvent.class));
consumer.subscribe(Collections.singletonList(controlTopicName));
executorService.submit(() -> {
while (true) {
ConsumerRecords<String, ControlEvent> records;
records = consumer.poll(Duration.ofMillis(KAFKA_POLL_TIMEOUT_MS));
for (ConsumerRecord<String, ControlEvent> record : records) {
try {
LOG.debug(String.format("Kafka consumerGroupId = %s topic = %s, partition = %s, offset = %s, customer = %s, country = %s",
"", record.topic(), record.partition(), record.offset(), record.key(), record.value()));
ControlEvent message = record.value();
String senderTopic = message.senderPartition().topic();
if (message.getSenderType().equals(ControlEvent.SenderType.COORDINATOR)) {
if (partitionWorkers.containsKey(senderTopic)) {
for (TransactionParticipant partitionWorker : partitionWorkers.get(senderTopic)) {
partitionWorker.processControlEvent(message);
}
} else {
LOG.warn(String.format("Failed to send message for unregistered participants for topic %s", senderTopic));
}
} else if (message.getSenderType().equals(ControlEvent.SenderType.PARTICIPANT)) {
if (topicCoordinators.containsKey(senderTopic)) {
topicCoordinators.get(senderTopic).processControlEvent(message);
} else {
LOG.warn(String.format("Failed to send message for unregistered coordinator for topic %s", senderTopic));
}
} else {
LOG.warn(String.format("Sender type of Control Message unknown %s", message.getSenderType().name()));
}
} catch (Exception e) {
LOG.error(String.format("Fatal error while consuming a kafka record for topic = %s partition = %s", record.topic(), record.partition()), e);
}
}
try {
consumer.commitSync();
} catch (CommitFailedException exception) {
LOG.error("Fatal error while committing kafka control topic");
}
}
});
}
public void stop() {
producer.stop();
consumer.close();
if (executorService != null) {
boolean terminated = false;
try {
LOG.info("Shutting down executor service.");
executorService.shutdown();
LOG.info("Awaiting termination.");
terminated = executorService.awaitTermination(EXEC_SHUTDOWN_TIMEOUT_MS, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
// ignored
}
if (!terminated) {
LOG.warn("Unclean Kafka Control Manager executor service shutdown");
executorService.shutdownNow();
}
}
}
/**
* Deserializes the incoming Kafka records for the Control Topic.
*
* @param <T> represents the object that is sent over the Control Topic.
*/
public static class KafkaJsonDeserializer<T> implements Deserializer<T> {
private static final Logger LOG = LogManager.getLogger(KafkaJsonDeserializer.class);
private final Class<T> type;
KafkaJsonDeserializer(Class<T> type) {
this.type = type;
}
@Override
public T deserialize(String s, byte[] bytes) {
ObjectMapper mapper = new ObjectMapper();
T obj = null;
try {
obj = mapper.readValue(bytes, type);
} catch (Exception e) {
LOG.error(e.getMessage());
}
return obj;
}
}
}

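Usage is a process-wide singleton per Connect worker: every task obtains the shared agent and registers itself against it. A hedged sketch with placeholder endpoints; `participant` and `controlEvent` are assumed to exist:

```java
// Sketch: all tasks on a worker share one control agent instance.
KafkaConnectControlAgent agent = KafkaConnectControlAgent.createKafkaControlManager(
    "localhost:9092", "hudi-control-topic");
agent.registerTransactionParticipant(participant); // a TransactionParticipant
agent.publishMessage(controlEvent);                // routed via the control topic
```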

@@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.kafka;
import org.apache.hudi.connect.transaction.ControlEvent;
import org.apache.hudi.connect.transaction.TransactionCoordinator;
import org.apache.hudi.connect.transaction.TransactionParticipant;
/**
* Manages the Kafka consumer and producer for
* the Kafka Control Topic that ensures coordination across the
* {@link TransactionCoordinator} and {@link TransactionParticipant}s.
*/
public interface KafkaControlAgent {
void registerTransactionParticipant(TransactionParticipant worker);
void deregisterTransactionParticipant(TransactionParticipant worker);
void registerTransactionCoordinator(TransactionCoordinator coordinator);
void deregisterTransactionCoordinator(TransactionCoordinator coordinator);
void publishMessage(ControlEvent message);
}

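Because the protocol is expressed against this interface, a synchronous in-memory implementation is enough to exercise coordinator/participant interplay in tests. The sketch below is illustrative and not part of this commit:

```java
package org.apache.hudi.connect.kafka;

import org.apache.hudi.connect.transaction.ControlEvent;
import org.apache.hudi.connect.transaction.TransactionCoordinator;
import org.apache.hudi.connect.transaction.TransactionParticipant;

import java.util.ArrayList;
import java.util.List;

// Hypothetical test double: delivers control events synchronously in memory
// instead of routing them through the Kafka control topic.
public class InMemoryControlAgent implements KafkaControlAgent {

  private final List<TransactionParticipant> participants = new ArrayList<>();
  private final List<TransactionCoordinator> coordinators = new ArrayList<>();

  @Override
  public void registerTransactionParticipant(TransactionParticipant worker) {
    participants.add(worker);
  }

  @Override
  public void deregisterTransactionParticipant(TransactionParticipant worker) {
    participants.remove(worker);
  }

  @Override
  public void registerTransactionCoordinator(TransactionCoordinator coordinator) {
    coordinators.add(coordinator);
  }

  @Override
  public void deregisterTransactionCoordinator(TransactionCoordinator coordinator) {
    coordinators.remove(coordinator);
  }

  @Override
  public void publishMessage(ControlEvent message) {
    // Fan out to everyone; the real agent filters by topic and sender type.
    coordinators.forEach(c -> c.processControlEvent(message));
    participants.forEach(p -> p.processControlEvent(message));
  }
}
```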

@@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.kafka;
import org.apache.hudi.connect.transaction.ControlEvent;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.Serializer;
import org.apache.kafka.common.serialization.StringSerializer;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.util.Properties;
/**
* Kafka producer to send events to the
* Control Topic that coordinates transactions
* across Participants.
*/
public class KafkaControlProducer {
private static final Logger LOG = LogManager.getLogger(KafkaControlProducer.class);
private final String bootstrapServers;
private final String controlTopicName;
private Producer<String, ControlEvent> producer;
public KafkaControlProducer(String bootstrapServers, String controlTopicName) {
this.bootstrapServers = bootstrapServers;
this.controlTopicName = controlTopicName;
start();
}
private void start() {
Properties props = new Properties();
props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, KafkaJsonSerializer.class);
producer = new KafkaProducer<>(
props,
new StringSerializer(),
new KafkaJsonSerializer()
);
}
public void stop() {
producer.close();
}
public void publishMessage(ControlEvent message) {
ProducerRecord<String, ControlEvent> record
= new ProducerRecord<>(controlTopicName, message.key(), message);
producer.send(record);
}
public static class KafkaJsonSerializer implements Serializer<ControlEvent> {
private static final Logger LOG = LogManager.getLogger(KafkaJsonSerializer.class);
@Override
public byte[] serialize(String topic, ControlEvent data) {
byte[] retVal = null;
ObjectMapper objectMapper = new ObjectMapper();
objectMapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
try {
retVal = objectMapper.writeValueAsBytes(data);
} catch (Exception e) {
LOG.error("Fatal error during serialization of Kafka Control Message ", e);
}
return retVal;
}
}
}


@@ -0,0 +1,399 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.transaction;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.connect.kafka.KafkaControlAgent;
import org.apache.hudi.connect.utils.KafkaConnectUtils;
import org.apache.hudi.connect.writers.ConnectTransactionServices;
import org.apache.hudi.connect.writers.KafkaConnectConfigs;
import org.apache.hudi.connect.writers.KafkaConnectTransactionServices;
import org.apache.hudi.exception.HoodieException;
import org.apache.kafka.common.TopicPartition;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.Collectors;
/**
* Implementation of the Coordinator that
* coordinates the Hudi write transactions
* across all the Kafka partitions for a single Kafka Topic.
*/
public class ConnectTransactionCoordinator implements TransactionCoordinator, Runnable {
private static final Logger LOG = LogManager.getLogger(ConnectTransactionCoordinator.class);
private static final String BOOTSTRAP_SERVERS_CFG = "bootstrap.servers";
private static final String KAFKA_OFFSET_KEY = "kafka.commit.offsets";
private static final String KAFKA_OFFSET_DELIMITER = ",";
private static final String KAFKA_OFFSET_KV_DELIMITER = "=";
private static final Long START_COMMIT_INIT_DELAY_MS = 100L;
private static final Long RESTART_COMMIT_DELAY_MS = 500L;
private static final int COORDINATOR_EVENT_LOOP_TIMEOUT_MS = 1000;
private final KafkaConnectConfigs configs;
private final TopicPartition partition;
private final KafkaControlAgent kafkaControlClient;
private final ConnectTransactionServices transactionServices;
private final KafkaPartitionProvider partitionProvider;
private final Map<Integer, List<WriteStatus>> partitionsWriteStatusReceived;
private final Map<Integer, Long> currentConsumedKafkaOffsets;
private final AtomicBoolean hasStarted = new AtomicBoolean(false);
private final BlockingQueue<CoordinatorEvent> events;
private final ExecutorService executorService;
private final ScheduledExecutorService scheduler;
private String currentCommitTime;
private Map<Integer, Long> globalCommittedKafkaOffsets;
private State currentState;
private int numPartitions;
public ConnectTransactionCoordinator(KafkaConnectConfigs configs,
TopicPartition partition,
KafkaControlAgent kafkaControlClient) throws HoodieException {
this(configs,
partition,
kafkaControlClient,
new KafkaConnectTransactionServices(configs),
KafkaConnectUtils::getLatestNumPartitions);
}
public ConnectTransactionCoordinator(KafkaConnectConfigs configs,
TopicPartition partition,
KafkaControlAgent kafkaControlClient,
ConnectTransactionServices transactionServices,
KafkaPartitionProvider partitionProvider) {
this.configs = configs;
this.partition = partition;
this.kafkaControlClient = kafkaControlClient;
this.transactionServices = transactionServices;
this.partitionProvider = partitionProvider;
this.events = new LinkedBlockingQueue<>();
scheduler = Executors.newSingleThreadScheduledExecutor();
executorService = Executors.newSingleThreadExecutor();
this.currentCommitTime = StringUtils.EMPTY_STRING;
this.partitionsWriteStatusReceived = new HashMap<>();
this.globalCommittedKafkaOffsets = new HashMap<>();
this.currentConsumedKafkaOffsets = new HashMap<>();
this.currentState = State.INIT;
}
@Override
public void start() {
if (hasStarted.compareAndSet(false, true)) {
executorService.submit(this);
}
kafkaControlClient.registerTransactionCoordinator(this);
LOG.info(String.format("Start Transaction Coordinator for topic %s partition %s",
partition.topic(), partition.partition()));
initializeGlobalCommittedKafkaOffsets();
// Submit the first start commit
submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT,
partition.topic(),
StringUtils.EMPTY_STRING),
START_COMMIT_INIT_DELAY_MS, TimeUnit.MILLISECONDS);
}
@Override
public void stop() {
kafkaControlClient.deregisterTransactionCoordinator(this);
hasStarted.set(false);
if (executorService != null) {
boolean terminated = false;
try {
LOG.info("Shutting down executor service.");
executorService.shutdown();
LOG.info("Awaiting termination.");
terminated = executorService.awaitTermination(100, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
// ignored
}
if (!terminated) {
LOG.warn("Unclean shutdown of the Transaction Coordinator executor service");
executorService.shutdownNow();
}
}
}
@Override
public TopicPartition getPartition() {
return partition;
}
@Override
public void processControlEvent(ControlEvent message) {
CoordinatorEvent.CoordinatorEventType type;
if (message.getMsgType().equals(ControlEvent.MsgType.WRITE_STATUS)) {
type = CoordinatorEvent.CoordinatorEventType.WRITE_STATUS;
} else {
LOG.warn(String.format("The Coordinator should not be receiving messages of type %s", message.getMsgType().name()));
return;
}
CoordinatorEvent event = new CoordinatorEvent(type,
message.senderPartition().topic(),
message.getCommitTime());
event.setMessage(message);
submitEvent(event);
}
@Override
public void run() {
while (true) {
try {
CoordinatorEvent event = events.poll(COORDINATOR_EVENT_LOOP_TIMEOUT_MS, TimeUnit.MILLISECONDS);
if (event != null) {
processCoordinatorEvent(event);
}
} catch (InterruptedException exception) {
LOG.warn("Error received while polling the event loop in Partition Coordinator", exception);
}
}
}
private void submitEvent(CoordinatorEvent event) {
this.submitEvent(event, 0, TimeUnit.SECONDS);
}
private void submitEvent(CoordinatorEvent event, long delay, TimeUnit unit) {
scheduler.schedule(() -> {
events.add(event);
}, delay, unit);
}
private void processCoordinatorEvent(CoordinatorEvent event) {
try {
// Ignore NULL and STALE events, unless it is one that starts a new COMMIT
if (event == null
|| (!event.getEventType().equals(CoordinatorEvent.CoordinatorEventType.START_COMMIT)
&& (!event.getCommitTime().equals(currentCommitTime)))) {
return;
}
switch (event.getEventType()) {
case START_COMMIT:
startNewCommit();
break;
case END_COMMIT:
endExistingCommit();
break;
case WRITE_STATUS:
// Ignore stale WRITE_STATUS messages that arrive outside the ENDED_COMMIT state
if (event.getMessage() != null
&& currentState.equals(State.ENDED_COMMIT)) {
onReceiveWriteStatus(event.getMessage());
} else {
LOG.warn("Ignoring WRITE_STATUS event due to a missing message or a stale state");
}
break;
case ACK_COMMIT:
submitAckCommit();
break;
case WRITE_STATUS_TIMEOUT:
handleWriteStatusTimeout();
break;
default:
throw new IllegalStateException("Partition Coordinator has received an illegal event type " + event.getEventType().name());
}
} catch (Exception exception) {
LOG.warn("Error received while polling the event loop in Partition Coordinator", exception);
}
}
private void startNewCommit() {
numPartitions = partitionProvider.getLatestNumPartitions(configs.getString(BOOTSTRAP_SERVERS_CFG), partition.topic());
partitionsWriteStatusReceived.clear();
try {
currentCommitTime = transactionServices.startCommit();
ControlEvent message = new ControlEvent.Builder(
ControlEvent.MsgType.START_COMMIT,
ControlEvent.SenderType.COORDINATOR,
currentCommitTime,
partition)
.setCoordinatorInfo(
new ControlEvent.CoordinatorInfo(globalCommittedKafkaOffsets))
.build();
kafkaControlClient.publishMessage(message);
currentState = State.STARTED_COMMIT;
// schedule a timeout for ending the current commit
submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.END_COMMIT,
partition.topic(),
currentCommitTime),
configs.getCommitIntervalSecs(), TimeUnit.SECONDS);
} catch (Exception exception) {
LOG.error(String.format("Failed to start a new commit %s, will retry", currentCommitTime), exception);
submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT,
partition.topic(),
StringUtils.EMPTY_STRING),
RESTART_COMMIT_DELAY_MS, TimeUnit.MILLISECONDS);
}
}
private void endExistingCommit() {
try {
ControlEvent message = new ControlEvent.Builder(
ControlEvent.MsgType.END_COMMIT,
ControlEvent.SenderType.COORDINATOR,
currentCommitTime,
partition)
.setCoordinatorInfo(new ControlEvent.CoordinatorInfo(globalCommittedKafkaOffsets))
.build();
kafkaControlClient.publishMessage(message);
} catch (Exception exception) {
LOG.warn(String.format("Could not send END_COMMIT message for partition %s and commitTime %s", partition, currentCommitTime), exception);
}
currentConsumedKafkaOffsets.clear();
currentState = State.ENDED_COMMIT;
// schedule a timeout for receiving all write statuses
submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.WRITE_STATUS_TIMEOUT,
partition.topic(),
currentCommitTime),
configs.getCoordinatorWriteTimeoutSecs(), TimeUnit.SECONDS);
}
private void onReceiveWriteStatus(ControlEvent message) {
ControlEvent.ParticipantInfo participantInfo = message.getParticipantInfo();
if (participantInfo.getOutcomeType().equals(ControlEvent.OutcomeType.WRITE_SUCCESS)) {
int partition = message.senderPartition().partition();
partitionsWriteStatusReceived.put(partition, participantInfo.writeStatuses());
currentConsumedKafkaOffsets.put(partition, participantInfo.getKafkaCommitOffset());
}
if (partitionsWriteStatusReceived.size() >= numPartitions
&& currentState.equals(State.ENDED_COMMIT)) {
// Commit the kafka offsets to the commit file
try {
List<WriteStatus> allWriteStatuses = new ArrayList<>();
partitionsWriteStatusReceived.forEach((key, value) -> allWriteStatuses.addAll(value));
// Commit the last write in Hudi, along with the latest kafka offset
if (!allWriteStatuses.isEmpty()) {
transactionServices.endCommit(currentCommitTime,
allWriteStatuses,
transformKafkaOffsets(currentConsumedKafkaOffsets));
}
currentState = State.WRITE_STATUS_RCVD;
globalCommittedKafkaOffsets.putAll(currentConsumedKafkaOffsets);
submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.ACK_COMMIT,
partition.topic(),
currentCommitTime));
} catch (Exception exception) {
LOG.error("Fatal error while committing file", exception);
}
}
}
private void handleWriteStatusTimeout() {
// If we are still stuck in the ENDED_COMMIT state
if (currentState.equals(State.ENDED_COMMIT)) {
currentState = State.WRITE_STATUS_TIMEDOUT;
LOG.warn("Did not receive the Write Status from all partitions");
// Submit the next start commit
submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT,
partition.topic(),
StringUtils.EMPTY_STRING),
RESTART_COMMIT_DELAY_MS, TimeUnit.MILLISECONDS);
}
}
private void submitAckCommit() {
try {
ControlEvent message = new ControlEvent.Builder(
ControlEvent.MsgType.ACK_COMMIT,
ControlEvent.SenderType.COORDINATOR,
currentCommitTime,
partition)
.setCoordinatorInfo(
new ControlEvent.CoordinatorInfo(globalCommittedKafkaOffsets))
.build();
kafkaControlClient.publishMessage(message);
} catch (Exception exception) {
LOG.warn(String.format("Could not send ACK_COMMIT message for partition %s and commitTime %s", partition, currentCommitTime), exception);
}
currentState = State.ACKED_COMMIT;
// Submit the next start commit
submitEvent(new CoordinatorEvent(CoordinatorEvent.CoordinatorEventType.START_COMMIT,
partition.topic(),
StringUtils.EMPTY_STRING),
START_COMMIT_INIT_DELAY_MS, TimeUnit.MILLISECONDS);
}
private void initializeGlobalCommittedKafkaOffsets() {
try {
Map<String, String> commitMetadata = transactionServices.fetchLatestExtraCommitMetadata();
String latestKafkaOffsets = commitMetadata.get(KAFKA_OFFSET_KEY);
if (!StringUtils.isNullOrEmpty(latestKafkaOffsets)) {
LOG.info("Retrieved Raw Kafka offsets from Hudi Commit File " + latestKafkaOffsets);
globalCommittedKafkaOffsets = Arrays.stream(latestKafkaOffsets.split(KAFKA_OFFSET_DELIMITER))
.map(entry -> entry.split(KAFKA_OFFSET_KV_DELIMITER))
.collect(Collectors.toMap(entry -> Integer.parseInt(entry[0]), entry -> Long.parseLong(entry[1])));
LOG.info("Initialized the kafka offset commits " + globalCommittedKafkaOffsets);
}
} catch (Exception exception) {
throw new HoodieException("Could not deserialize the kafka commit offsets", exception);
}
}
private Map<String, String> transformKafkaOffsets(Map<Integer, Long> kafkaOffsets) {
try {
String kafkaOffsetValue = kafkaOffsets.keySet().stream()
.map(key -> key + KAFKA_OFFSET_KV_DELIMITER + kafkaOffsets.get(key))
.collect(Collectors.joining(KAFKA_OFFSET_DELIMITER));
return Collections.singletonMap(KAFKA_OFFSET_KEY, kafkaOffsetValue);
} catch (Exception exception) {
throw new HoodieException("Could not serialize the kafka commit offsets", exception);
}
}
private enum State {
INIT,
STARTED_COMMIT,
ENDED_COMMIT,
WRITE_STATUS_RCVD,
WRITE_STATUS_TIMEDOUT,
ACKED_COMMIT,
}
/**
* Provides the current partitions of a Kafka Topic dynamically.
*/
public interface KafkaPartitionProvider {
int getLatestNumPartitions(String bootstrapServers, String topicName);
}
}
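
A minimal usage sketch of the coordinator lifecycle. This is an illustration, not part of the class above: `agent` is an assumed, pre-existing KafkaControlAgent implementation, and anchoring the coordinator on partition 0 of the control topic is an illustrative choice.

public static TransactionCoordinator startCoordinator(KafkaConnectConfigs configs,
                                                      KafkaControlAgent agent) {
  // The coordinator listens on a single partition of the control topic
  TopicPartition controlPartition = new TopicPartition(configs.getControlTopicName(), 0);
  TransactionCoordinator coordinator =
      new ConnectTransactionCoordinator(configs, controlPartition, agent);
  // start() registers with the control agent, recovers the committed Kafka
  // offsets from the latest Hudi commit, and schedules the first START_COMMIT
  coordinator.start();
  return coordinator;
}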

View File

@@ -0,0 +1,254 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.transaction;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.connect.kafka.KafkaControlAgent;
import org.apache.hudi.connect.writers.ConnectWriterProvider;
import org.apache.hudi.connect.writers.KafkaConnectConfigs;
import org.apache.hudi.connect.writers.KafkaConnectWriterProvider;
import org.apache.hudi.exception.HoodieException;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.connect.sink.SinkRecord;
import org.apache.kafka.connect.sink.SinkTaskContext;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
/**
* Implementation of the {@link TransactionParticipant} that manages the Hudi writes for a specific
* Kafka partition, based on coordination events from the {@link TransactionCoordinator}.
*/
public class ConnectTransactionParticipant implements TransactionParticipant {
private static final Logger LOG = LogManager.getLogger(ConnectTransactionParticipant.class);
private final LinkedList<SinkRecord> buffer;
private final BlockingQueue<ControlEvent> controlEvents;
private final TopicPartition partition;
private final SinkTaskContext context;
private final KafkaControlAgent kafkaControlAgent;
private final ConnectWriterProvider<WriteStatus> writerProvider;
private TransactionInfo<WriteStatus> ongoingTransactionInfo;
private long committedKafkaOffset;
public ConnectTransactionParticipant(KafkaConnectConfigs configs,
TopicPartition partition,
KafkaControlAgent kafkaControlAgent,
SinkTaskContext context) throws HoodieException {
this(partition, kafkaControlAgent, context, new KafkaConnectWriterProvider(configs, partition));
}
public ConnectTransactionParticipant(TopicPartition partition,
KafkaControlAgent kafkaControlAgent,
SinkTaskContext context,
ConnectWriterProvider<WriteStatus> writerProvider) throws HoodieException {
this.buffer = new LinkedList<>();
this.controlEvents = new LinkedBlockingQueue<>();
this.partition = partition;
this.context = context;
this.writerProvider = writerProvider;
this.kafkaControlAgent = kafkaControlAgent;
this.ongoingTransactionInfo = null;
this.committedKafkaOffset = 0;
}
@Override
public void start() {
LOG.info("Start Hudi Transaction Participant for partition " + partition.partition());
this.kafkaControlAgent.registerTransactionParticipant(this);
context.pause(partition);
}
@Override
public void stop() {
this.kafkaControlAgent.deregisterTransactionParticipant(this);
cleanupOngoingTransaction();
}
@Override
public void buffer(SinkRecord record) {
buffer.add(record);
}
@Override
public void processControlEvent(ControlEvent message) {
controlEvents.add(message);
}
@Override
public long getLastKafkaCommittedOffset() {
return committedKafkaOffset;
}
@Override
public TopicPartition getPartition() {
return partition;
}
@Override
public void processRecords() {
while (!controlEvents.isEmpty()) {
ControlEvent message = controlEvents.poll();
switch (message.getMsgType()) {
case START_COMMIT:
handleStartCommit(message);
break;
case END_COMMIT:
handleEndCommit(message);
break;
case ACK_COMMIT:
handleAckCommit(message);
break;
case WRITE_STATUS:
// ignore WRITE_STATUS since it is only processed by the coordinator (leader)
break;
default:
throw new IllegalStateException("HudiTransactionParticipant received incorrect state " + message.getMsgType());
}
}
writeRecords();
}
private void handleStartCommit(ControlEvent message) {
// If there is an existing/ongoing transaction locally, it must have failed globally,
// since we received another START_COMMIT instead of an END_COMMIT or ACK_COMMIT;
// close it and start a new transaction
cleanupOngoingTransaction();
// Resync the last committed Kafka offset from the leader
syncKafkaOffsetWithLeader(message);
context.resume(partition);
String currentCommitTime = message.getCommitTime();
LOG.info("Started a new transaction after receiving START_COMMIT for commit " + currentCommitTime);
try {
ongoingTransactionInfo = new TransactionInfo<>(currentCommitTime, writerProvider.getWriter(currentCommitTime));
ongoingTransactionInfo.setLastWrittenKafkaOffset(committedKafkaOffset);
} catch (Exception exception) {
LOG.warn("Error received while starting a new transaction", exception);
}
}
private void handleEndCommit(ControlEvent message) {
if (ongoingTransactionInfo == null) {
LOG.warn(String.format("END_COMMIT %s is received while we were NOT in active transaction", message.getCommitTime()));
return;
} else if (!ongoingTransactionInfo.getCommitTime().equals(message.getCommitTime())) {
LOG.error(String.format("Fatal error received END_COMMIT with commit time %s while local transaction commit time %s",
message.getCommitTime(), ongoingTransactionInfo.getCommitTime()));
// Recovery: A new END_COMMIT from leader caused interruption to an existing transaction,
// explicitly reset Kafka commit offset to ensure no data loss
cleanupOngoingTransaction();
syncKafkaOffsetWithLeader(message);
return;
}
// send Writer Status Message and wait for ACK_COMMIT in async fashion
try {
context.pause(partition);
ongoingTransactionInfo.commitInitiated();
// build and send the WRITE_STATUS message
List<WriteStatus> writeStatuses = new ArrayList<>();
try {
writeStatuses = ongoingTransactionInfo.getWriter().close();
} catch (IOException exception) {
LOG.warn("Error closing the Hudi Writer", exception);
}
ControlEvent writeStatus = new ControlEvent.Builder(ControlEvent.MsgType.WRITE_STATUS,
ControlEvent.SenderType.PARTICIPANT, ongoingTransactionInfo.getCommitTime(), partition)
.setParticipantInfo(new ControlEvent.ParticipantInfo(
writeStatuses,
ongoingTransactionInfo.getLastWrittenKafkaOffset(),
ControlEvent.OutcomeType.WRITE_SUCCESS))
.build();
kafkaControlAgent.publishMessage(writeStatus);
} catch (Exception exception) {
LOG.warn(String.format("Error ending commit %s for partition %s", message.getCommitTime(), partition.partition()), exception);
}
}
private void handleAckCommit(ControlEvent message) {
// Update the last committed Kafka offset locally.
if (ongoingTransactionInfo != null && committedKafkaOffset < ongoingTransactionInfo.getLastWrittenKafkaOffset()) {
committedKafkaOffset = ongoingTransactionInfo.getLastWrittenKafkaOffset();
}
syncKafkaOffsetWithLeader(message);
cleanupOngoingTransaction();
}
private void writeRecords() {
if (ongoingTransactionInfo != null && !ongoingTransactionInfo.isCommitInitiated()) {
while (!buffer.isEmpty()) {
try {
SinkRecord record = buffer.peek();
if (record != null
&& record.kafkaOffset() >= ongoingTransactionInfo.getLastWrittenKafkaOffset()) {
ongoingTransactionInfo.getWriter().writeRecord(record);
ongoingTransactionInfo.setLastWrittenKafkaOffset(record.kafkaOffset() + 1);
} else if (record != null && record.kafkaOffset() < committedKafkaOffset) {
LOG.warn(String.format("Received a kafka record with offset %s prior to last committed offset %s for partition %s",
record.kafkaOffset(), committedKafkaOffset,
partition));
}
buffer.poll();
} catch (Exception exception) {
LOG.warn(String.format("Error received while writing records for transaction %s in partition %s",
ongoingTransactionInfo.getCommitTime(), partition.partition()),
exception);
}
}
}
}
private void cleanupOngoingTransaction() {
if (ongoingTransactionInfo != null) {
try {
ongoingTransactionInfo.getWriter().close();
ongoingTransactionInfo = null;
} catch (IOException exception) {
LOG.warn("Error received while trying to cleanup existing transaction", exception);
}
}
}
private void syncKafkaOffsetWithLeader(ControlEvent message) {
if (message.getCoordinatorInfo() != null) {
Long coordinatorCommittedKafkaOffset = message.getCoordinatorInfo().getGlobalKafkaCommitOffsets().get(partition.partition());
// Recover kafka committed offsets, treating the commit offset from the coordinator
// as the source of truth
if (coordinatorCommittedKafkaOffset != null && coordinatorCommittedKafkaOffset >= 0) {
if (coordinatorCommittedKafkaOffset != committedKafkaOffset) {
LOG.warn(String.format("Recovering the kafka offset for partition %s to offset %s instead of local offset %s",
partition.partition(), coordinatorCommittedKafkaOffset, committedKafkaOffset));
context.offset(partition, coordinatorCommittedKafkaOffset);
}
committedKafkaOffset = coordinatorCommittedKafkaOffset;
}
}
}
}
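
A hedged sketch of how a Kafka Connect SinkTask might drive participants. The task class and its `participants` map (a Map<TopicPartition, TransactionParticipant>) are hypothetical; only the TransactionParticipant calls come from this file.

@Override
public void put(Collection<SinkRecord> records) {
  for (SinkRecord record : records) {
    TopicPartition tp = new TopicPartition(record.topic(), record.kafkaPartition());
    TransactionParticipant participant = participants.get(tp);
    if (participant != null) {
      // Records are only buffered here; the actual Hudi writes happen inside
      // processRecords(), gated by the control events from the coordinator
      participant.buffer(record);
    }
  }
  participants.values().forEach(TransactionParticipant::processRecords);
}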

View File

@@ -0,0 +1,222 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.transaction;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.SerializationUtils;
import org.apache.kafka.common.TopicPartition;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
/**
* The events sent over the Kafka Control Topic between the
* coordinator and the followers, in order to ensure
* coordination across all the writes.
*/
@SuppressWarnings("checkstyle:VisibilityModifier")
public class ControlEvent implements Serializable {
private static final Logger LOG = LogManager.getLogger(ControlEvent.class);
private static final int CURRENT_VERSION = 0;
private final int version = CURRENT_VERSION;
private MsgType msgType;
private SenderType senderType;
private String commitTime;
private byte[] senderPartition;
private CoordinatorInfo coordinatorInfo;
private ParticipantInfo participantInfo;
public ControlEvent() {
}
public ControlEvent(MsgType msgType,
SenderType senderType,
String commitTime,
byte[] senderPartition,
CoordinatorInfo coordinatorInfo,
ParticipantInfo participantInfo) {
this.msgType = msgType;
this.senderType = senderType;
this.commitTime = commitTime;
this.senderPartition = senderPartition;
this.coordinatorInfo = coordinatorInfo;
this.participantInfo = participantInfo;
}
public String key() {
return msgType.name().toLowerCase(Locale.ROOT);
}
public MsgType getMsgType() {
return msgType;
}
public SenderType getSenderType() {
return senderType;
}
public String getCommitTime() {
return commitTime;
}
public byte[] getSenderPartition() {
return senderPartition;
}
public TopicPartition senderPartition() {
return SerializationUtils.deserialize(senderPartition);
}
public CoordinatorInfo getCoordinatorInfo() {
return coordinatorInfo;
}
public ParticipantInfo getParticipantInfo() {
return participantInfo;
}
public int getVersion() {
return version;
}
@Override
public String toString() {
return String.format("%s %s %s %s %s %s", version, msgType.name(), commitTime,
Arrays.toString(senderPartition), coordinatorInfo.toString(), participantInfo.toString());
}
/**
* Builder that helps build {@link ControlEvent}.
*/
public static class Builder {
private final MsgType msgType;
private SenderType senderType;
private final String commitTime;
private final byte[] senderPartition;
private CoordinatorInfo coordinatorInfo;
private ParticipantInfo participantInfo;
public Builder(MsgType msgType, SenderType senderType, String commitTime, TopicPartition senderPartition) throws IOException {
this.msgType = msgType;
this.senderType = senderType;
this.commitTime = commitTime;
this.senderPartition = SerializationUtils.serialize(senderPartition);
}
public Builder setCoordinatorInfo(CoordinatorInfo coordinatorInfo) {
this.coordinatorInfo = coordinatorInfo;
return this;
}
public Builder setParticipantInfo(ParticipantInfo participantInfo) {
this.participantInfo = participantInfo;
return this;
}
public ControlEvent build() {
return new ControlEvent(msgType, senderType, commitTime, senderPartition, coordinatorInfo, participantInfo);
}
}
/**
* The info sent by the {@link TransactionCoordinator} to one or more
* {@link TransactionParticipant}s.
*/
public static class CoordinatorInfo implements Serializable {
private Map<Integer, Long> globalKafkaCommitOffsets;
public CoordinatorInfo() {
}
public CoordinatorInfo(Map<Integer, Long> globalKafkaCommitOffsets) {
this.globalKafkaCommitOffsets = globalKafkaCommitOffsets;
}
public Map<Integer, Long> getGlobalKafkaCommitOffsets() {
return (globalKafkaCommitOffsets == null) ? new HashMap<>() : globalKafkaCommitOffsets;
}
}
/**
* The info sent by {@link TransactionParticipant} instances to the
* {@link TransactionCoordinator}.
*/
public static class ParticipantInfo implements Serializable {
private byte[] writeStatusList;
private long kafkaCommitOffset;
private OutcomeType outcomeType;
public ParticipantInfo() {
}
public ParticipantInfo(List<WriteStatus> writeStatuses, long kafkaCommitOffset, OutcomeType outcomeType) throws IOException {
this.writeStatusList = SerializationUtils.serialize(writeStatuses);
this.kafkaCommitOffset = kafkaCommitOffset;
this.outcomeType = outcomeType;
}
public byte[] getWriteStatusList() {
return writeStatusList;
}
public List<WriteStatus> writeStatuses() {
return SerializationUtils.deserialize(writeStatusList);
}
public long getKafkaCommitOffset() {
return kafkaCommitOffset;
}
public OutcomeType getOutcomeType() {
return outcomeType;
}
}
/**
* Type of Control Event.
*/
public enum MsgType {
START_COMMIT,
END_COMMIT,
ACK_COMMIT,
WRITE_STATUS,
}
public enum SenderType {
COORDINATOR,
PARTICIPANT
}
public enum OutcomeType {
WRITE_SUCCESS,
}
}
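
For illustration, this is roughly how a participant builds a WRITE_STATUS event. The commit time, topic, and partition below are placeholders, and `writeStatuses` / `lastWrittenKafkaOffset` stand in for state the participant already holds; note that the Builder and ParticipantInfo constructors can throw IOException while serializing.

ControlEvent writeStatusEvent = new ControlEvent.Builder(
    ControlEvent.MsgType.WRITE_STATUS,
    ControlEvent.SenderType.PARTICIPANT,
    "20210910182000",                         // placeholder commit time
    new TopicPartition("hudi-test-topic", 3)) // placeholder sender partition
    .setParticipantInfo(new ControlEvent.ParticipantInfo(
        writeStatuses,                        // List<WriteStatus> from the closed writer
        lastWrittenKafkaOffset,               // next Kafka offset to consume
        ControlEvent.OutcomeType.WRITE_SUCCESS))
    .build();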

View File

@@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.transaction;
/**
* The events within the Coordinator that trigger
* the state changes in the state machine of
* the Coordinator.
*/
public class CoordinatorEvent {
private final CoordinatorEventType eventType;
private final String topicName;
private final String commitTime;
private ControlEvent message;
public CoordinatorEvent(CoordinatorEventType eventType,
String topicName,
String commitTime) {
this.eventType = eventType;
this.topicName = topicName;
this.commitTime = commitTime;
}
public CoordinatorEventType getEventType() {
return eventType;
}
public String getTopicName() {
return topicName;
}
public String getCommitTime() {
return commitTime;
}
public ControlEvent getMessage() {
return message;
}
public void setMessage(ControlEvent message) {
this.message = message;
}
/**
* The type of Coordinator Event.
*/
public enum CoordinatorEventType {
START_COMMIT,
END_COMMIT,
WRITE_STATUS,
ACK_COMMIT,
WRITE_STATUS_TIMEOUT
}
}

View File

@@ -0,0 +1,40 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.transaction;
import org.apache.kafka.common.TopicPartition;
/**
* The base Coordinator that coordinates the write transactions
* across all the Kafka partitions that are managed
* by {@link TransactionParticipant} instances.
*/
public interface TransactionCoordinator {
void start();
void stop();
/* Kafka Topic that this Coordinator belongs to */
TopicPartition getPartition();
/* Called when a control event is received from the Kafka control topic */
void processControlEvent(ControlEvent message);
}

View File

@@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.transaction;
import org.apache.hudi.connect.writers.ConnectWriter;
/**
* Stores all the state for the current Transaction within a
* {@link TransactionParticipant}.
* @param <T> The type of status returned by the underlying writer.
*/
public class TransactionInfo<T> {
private final String commitTime;
private final ConnectWriter<T> writer;
private long lastWrittenKafkaOffset;
private boolean commitInitiated;
public TransactionInfo(String commitTime, ConnectWriter<T> writer) {
this.commitTime = commitTime;
this.writer = writer;
this.lastWrittenKafkaOffset = 0;
this.commitInitiated = false;
}
public String getCommitTime() {
return commitTime;
}
public ConnectWriter<T> getWriter() {
return writer;
}
public long getLastWrittenKafkaOffset() {
return lastWrittenKafkaOffset;
}
public boolean isCommitInitiated() {
return commitInitiated;
}
public void setLastWrittenKafkaOffset(long lastWrittenKafkaOffset) {
this.lastWrittenKafkaOffset = lastWrittenKafkaOffset;
}
public void commitInitiated() {
this.commitInitiated = true;
}
}

View File

@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.transaction;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.connect.sink.SinkRecord;
/**
* Interface for the Participant that
* manages Writes for a
* single Kafka partition, based on
* coordination signals from the {@link TransactionCoordinator}.
*/
public interface TransactionParticipant {
void start();
void stop();
void buffer(SinkRecord record);
void processRecords();
TopicPartition getPartition();
void processControlEvent(ControlEvent message);
long getLastKafkaCommittedOffset();
}

View File

@@ -0,0 +1,140 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.utils;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.keygen.BaseKeyGenerator;
import org.apache.hudi.keygen.CustomAvroKeyGenerator;
import org.apache.hudi.keygen.CustomKeyGenerator;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.hadoop.conf.Configuration;
import org.apache.kafka.clients.admin.AdminClient;
import org.apache.kafka.clients.admin.DescribeTopicsResult;
import org.apache.kafka.clients.admin.TopicDescription;
import org.apache.kafka.common.KafkaFuture;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.util.Arrays;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
/**
* Helper methods for Kafka.
*/
public class KafkaConnectUtils {
private static final Logger LOG = LogManager.getLogger(KafkaConnectUtils.class);
public static int getLatestNumPartitions(String bootstrapServers, String topicName) {
Properties props = new Properties();
props.put("bootstrap.servers", bootstrapServers);
try (AdminClient client = AdminClient.create(props)) {
DescribeTopicsResult result = client.describeTopics(Arrays.asList(topicName));
Map<String, KafkaFuture<TopicDescription>> values = result.values();
KafkaFuture<TopicDescription> topicDescription = values.get(topicName);
int numPartitions = topicDescription.get().partitions().size();
LOG.info(String.format("Latest number of partitions for topic %s is %s", topicName, numPartitions));
return numPartitions;
} catch (Exception exception) {
throw new HoodieException("Fatal error fetching the latest partition of kafka topic name" + topicName, exception);
}
}
/**
* Returns the default Hadoop Configuration.
* @return the default Hadoop Configuration, with the local file system implementation configured.
*/
public static Configuration getDefaultHadoopConf() {
Configuration hadoopConf = new Configuration();
hadoopConf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
return hadoopConf;
}
/**
* Extract the record key fields.
* @param keyGenerator instance of the key generator.
* @return the record key columns separated by commas.
*/
public static String getRecordKeyColumns(KeyGenerator keyGenerator) {
return String.join(",", keyGenerator.getRecordKeyFieldNames());
}
/**
* Extract partition columns directly if an instance of class {@link BaseKeyGenerator},
* else extract partition columns from the properties.
*
* @param keyGenerator instance of the key generator.
* @param typedProperties properties from the config.
* @return the partition columns separated by commas.
*/
public static String getPartitionColumns(KeyGenerator keyGenerator, TypedProperties typedProperties) {
if (keyGenerator instanceof CustomKeyGenerator || keyGenerator instanceof CustomAvroKeyGenerator) {
return ((BaseKeyGenerator) keyGenerator).getPartitionPathFields().stream().map(
pathField -> Arrays.stream(pathField.split(CustomAvroKeyGenerator.SPLIT_REGEX))
.findFirst().orElse(String.format("Illegal partition path field format: '%s'", pathField)))
.collect(Collectors.joining(","));
}
if (keyGenerator instanceof BaseKeyGenerator) {
return String.join(",", ((BaseKeyGenerator) keyGenerator).getPartitionPathFields());
}
return typedProperties.getString(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key());
}
/**
* Get the Metadata from the latest commit file.
*
* @param metaClient The {@link HoodieTableMetaClient} to get access to the metadata.
* @return An Optional {@link HoodieCommitMetadata} containing the meta data from the latest commit file.
*/
public static Option<HoodieCommitMetadata> getCommitMetadataForLatestInstant(HoodieTableMetaClient metaClient) {
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants()
.filter(instant -> (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE && instant.getAction().equals(HoodieActiveTimeline.COMMIT_ACTION))
|| (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ && instant.getAction().equals(HoodieActiveTimeline.DELTA_COMMIT_ACTION))
);
Option<HoodieInstant> latestInstant = timeline.lastInstant();
if (latestInstant.isPresent()) {
try {
byte[] data = timeline.getInstantDetails(latestInstant.get()).get();
return Option.of(HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class));
} catch (Exception e) {
throw new HoodieException("Failed to read schema from commit metadata", e);
}
} else {
return Option.empty();
}
}
}
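
A short usage sketch; the bootstrap servers and topic name below are placeholders.

// The coordinator invokes this at the start of every commit, so topic
// repartitioning between commits is picked up automatically
int numPartitions = KafkaConnectUtils.getLatestNumPartitions("localhost:9092", "hudi-test-topic");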

View File

@@ -0,0 +1,91 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.writers;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.schema.SchemaProvider;
import org.apache.hudi.utilities.sources.helpers.AvroConvertor;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.connect.sink.SinkRecord;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.List;
/**
* Base Hudi Writer that manages reading the raw Kafka records and
* converting them to {@link HoodieRecord}s that can be written to Hudi by
* the derived implementations of this class.
*/
public abstract class AbstractConnectWriter implements ConnectWriter<WriteStatus> {
public static final String KAFKA_AVRO_CONVERTER = "io.confluent.connect.avro.AvroConverter";
public static final String KAFKA_JSON_CONVERTER = "org.apache.kafka.connect.json.JsonConverter";
public static final String KAFKA_STRING_CONVERTER = "org.apache.kafka.connect.storage.StringConverter";
private static final Logger LOG = LogManager.getLogger(AbstractConnectWriter.class);
private final KafkaConnectConfigs connectConfigs;
private final KeyGenerator keyGenerator;
private final SchemaProvider schemaProvider;
public AbstractConnectWriter(KafkaConnectConfigs connectConfigs,
KeyGenerator keyGenerator,
SchemaProvider schemaProvider) {
this.connectConfigs = connectConfigs;
this.keyGenerator = keyGenerator;
this.schemaProvider = schemaProvider;
}
@Override
public void writeRecord(SinkRecord record) throws IOException {
AvroConvertor convertor = new AvroConvertor(schemaProvider.getSourceSchema());
Option<GenericRecord> avroRecord;
switch (connectConfigs.getKafkaValueConverter()) {
case KAFKA_AVRO_CONVERTER:
avroRecord = Option.of((GenericRecord) record.value());
break;
case KAFKA_STRING_CONVERTER:
avroRecord = Option.of(convertor.fromJson((String) record.value()));
break;
case KAFKA_JSON_CONVERTER:
throw new UnsupportedEncodingException("Currently JSON objects are not supported");
default:
throw new IOException("Unsupported Kafka Format type (" + connectConfigs.getKafkaValueConverter() + ")");
}
HoodieRecord hoodieRecord = new HoodieRecord<>(keyGenerator.getKey(avroRecord.get()), new HoodieAvroPayload(avroRecord));
writeHudiRecord(hoodieRecord);
}
@Override
public List<WriteStatus> close() {
return flushHudiRecords();
}
protected abstract void writeHudiRecord(HoodieRecord<HoodieAvroPayload> record);
protected abstract List<WriteStatus> flushHudiRecords();
}
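
A minimal concrete writer, purely as a sketch of the writeHudiRecord/flushHudiRecords contract. This hypothetical class just buffers in memory and persists nothing; the spillable BufferedConnectWriter that follows is the real implementation in this patch.

public class InMemoryConnectWriter extends AbstractConnectWriter {
  private final List<HoodieRecord<HoodieAvroPayload>> records = new ArrayList<>();

  public InMemoryConnectWriter(KafkaConnectConfigs connectConfigs,
                               KeyGenerator keyGenerator,
                               SchemaProvider schemaProvider) {
    super(connectConfigs, keyGenerator, schemaProvider);
  }

  @Override
  protected void writeHudiRecord(HoodieRecord<HoodieAvroPayload> record) {
    records.add(record); // buffer only; a real writer hands off to a write client
  }

  @Override
  protected List<WriteStatus> flushHudiRecords() {
    return Collections.emptyList(); // nothing is actually persisted in this sketch
  }
}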

View File

@@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.writers;
import org.apache.hudi.client.HoodieJavaWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.DefaultSizeEstimator;
import org.apache.hudi.common.util.HoodieRecordSizeEstimator;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.io.IOUtils;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.schema.SchemaProvider;
import org.apache.avro.Schema;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
* Specific implementation of a Hudi Writer that buffers all incoming records,
* and writes them to Hudi files at the end of a transaction using Bulk Insert.
*/
public class BufferedConnectWriter extends AbstractConnectWriter {
private static final Logger LOG = LogManager.getLogger(BufferedConnectWriter.class);
private final HoodieEngineContext context;
private final HoodieJavaWriteClient writeClient;
private final String instantTime;
private final HoodieWriteConfig config;
private ExternalSpillableMap<String, HoodieRecord<HoodieAvroPayload>> bufferedRecords;
public BufferedConnectWriter(HoodieEngineContext context,
HoodieJavaWriteClient writeClient,
String instantTime,
KafkaConnectConfigs connectConfigs,
HoodieWriteConfig config,
KeyGenerator keyGenerator,
SchemaProvider schemaProvider) {
super(connectConfigs, keyGenerator, schemaProvider);
this.context = context;
this.writeClient = writeClient;
this.instantTime = instantTime;
this.config = config;
init();
}
private void init() {
try {
// Load and batch all incoming records in a map
long memoryForMerge = IOUtils.getMaxMemoryPerPartitionMerge(context.getTaskContextSupplier(), config);
LOG.info("MaxMemoryPerPartitionMerge => " + memoryForMerge);
this.bufferedRecords = new ExternalSpillableMap<>(memoryForMerge,
config.getSpillableMapBasePath(),
new DefaultSizeEstimator(),
new HoodieRecordSizeEstimator(new Schema.Parser().parse(config.getSchema())),
config.getCommonConfig().getSpillableDiskMapType(),
config.getCommonConfig().isBitCaskDiskMapCompressionEnabled());
} catch (IOException io) {
throw new HoodieIOException("Cannot instantiate an ExternalSpillableMap", io);
}
}
@Override
public void writeHudiRecord(HoodieRecord<HoodieAvroPayload> record) {
bufferedRecords.put(record.getRecordKey(), record);
}
@Override
public List<WriteStatus> flushHudiRecords() {
try {
LOG.info("Number of entries in MemoryBasedMap => "
+ bufferedRecords.getInMemoryMapNumEntries()
+ "Total size in bytes of MemoryBasedMap => "
+ bufferedRecords.getCurrentInMemoryMapSize() + "Number of entries in BitCaskDiskMap => "
+ bufferedRecords.getDiskBasedMapNumEntries() + "Size of file spilled to disk => "
+ bufferedRecords.getSizeOfFileOnDiskInBytes());
List<WriteStatus> writeStatuses = new ArrayList<>();
// Write out all records if non-empty
if (!bufferedRecords.isEmpty()) {
writeStatuses = writeClient.bulkInsertPreppedRecords(
bufferedRecords.values().stream().collect(Collectors.toList()),
instantTime, Option.empty());
}
bufferedRecords.close();
LOG.info("Flushed hudi records and got writeStatuses: "
+ writeStatuses);
return writeStatuses;
} catch (Exception e) {
throw new HoodieException("Write records failed", e);
}
}
}

View File

@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.writers;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.connect.transaction.TransactionCoordinator;
import java.util.List;
import java.util.Map;
/**
* Transaction service APIs used by
* {@link TransactionCoordinator}.
*/
public interface ConnectTransactionServices {
String startCommit();
void endCommit(String commitTime, List<WriteStatus> writeStatuses, Map<String, String> extraMetadata);
Map<String, String> fetchLatestExtraCommitMetadata();
}

View File

@@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.writers;
import org.apache.kafka.connect.sink.SinkRecord;
import java.io.IOException;
import java.util.List;
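/**
 * Interface for a writer that writes incoming Kafka {@link SinkRecord}s to Hudi
 * within a single transaction and returns the write results on close.
 */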
public interface ConnectWriter<T> {
void writeRecord(SinkRecord record) throws IOException;
List<T> close() throws IOException;
}

View File

@@ -0,0 +1,24 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.writers;
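/**
 * Provides a {@link ConnectWriter} instance for a given commit time.
 */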
public interface ConnectWriterProvider<T> {
ConnectWriter<T> getWriter(String commitTime);
}

View File

@@ -0,0 +1,177 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.writers;
import org.apache.hudi.common.config.ConfigClassProperty;
import org.apache.hudi.common.config.ConfigGroups;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hudi.schema.FilebasedSchemaProvider;
import javax.annotation.concurrent.Immutable;
import java.util.Map;
import java.util.Properties;
/**
* Class storing configs for the Hudi Kafka Connect Sink Connector.
*/
@Immutable
@ConfigClassProperty(name = "Kafka Sink Connect Configurations",
groupName = ConfigGroups.Names.KAFKA_CONNECT,
description = "Configurations for Kakfa Connect Sink Connector for Hudi.")
public class KafkaConnectConfigs extends HoodieConfig {
public static final String KAFKA_VALUE_CONVERTER = "value.converter";
public static final ConfigProperty<String> KAFKA_BOOTSTRAP_SERVERS = ConfigProperty
.key("bootstrap.servers")
.defaultValue("localhost:9092")
.withDocumentation("The bootstrap servers for the Kafka Cluster.");
public static final ConfigProperty<String> CONTROL_TOPIC_NAME = ConfigProperty
.key("hoodie.kafka.control.topic")
.defaultValue("hudi-control-topic")
.withDocumentation("Kafka topic name used by the Hudi Sink Connector for "
+ "sending and receiving control messages. Not used for data records.");
public static final ConfigProperty<String> SCHEMA_PROVIDER_CLASS = ConfigProperty
.key("hoodie.schemaprovider.class")
.defaultValue(FilebasedSchemaProvider.class.getName())
.withDocumentation("subclass of org.apache.hudi.schema.SchemaProvider "
+ "to attach schemas to input & target table data, built in options: "
+ "org.apache.hudi.schema.FilebasedSchemaProvider.");
public static final ConfigProperty<String> COMMIT_INTERVAL_SECS = ConfigProperty
.key("hoodie.kafka.commit.interval.secs")
.defaultValue("60")
.withDocumentation("The interval at which Hudi will commit the records written "
+ "to the files, making them consumable on the read-side.");
public static final ConfigProperty<String> COORDINATOR_WRITE_TIMEOUT_SECS = ConfigProperty
.key("hoodie.kafka.coordinator.write.timeout.secs")
.defaultValue("60")
.withDocumentation("The timeout after sending an END_COMMIT until when "
+ "the coordinator will wait for the write statuses from all the partitions"
+ "to ignore the current commit and start a new commit.");
public static final ConfigProperty<String> META_SYNC_ENABLE = ConfigProperty
.key("hoodie.meta.sync.enable")
.defaultValue("false")
.withDocumentation("Enable Meta Sync such as Hive");
public static final ConfigProperty<String> META_SYNC_CLASSES = ConfigProperty
.key("hoodie.meta.sync.classes")
.defaultValue(HiveSyncTool.class.getName())
.withDocumentation("Meta sync client tool, using comma to separate multi tools");
protected KafkaConnectConfigs() {
super();
}
protected KafkaConnectConfigs(Properties props) {
super(props);
}
public static KafkaConnectConfigs.Builder newBuilder() {
return new KafkaConnectConfigs.Builder();
}
public String getBootstrapServers() {
return getString(KAFKA_BOOTSTRAP_SERVERS);
}
public String getControlTopicName() {
return getString(CONTROL_TOPIC_NAME);
}
public String getSchemaProviderClass() {
return getString(SCHEMA_PROVIDER_CLASS);
}
public Long getCommitIntervalSecs() {
return getLong(COMMIT_INTERVAL_SECS);
}
public Long getCoordinatorWriteTimeoutSecs() {
return getLong(COORDINATOR_WRITE_TIMEOUT_SECS);
}
public String getKafkaValueConverter() {
return getString(KAFKA_VALUE_CONVERTER);
}
public Boolean isMetaSyncEnabled() {
return getBoolean(META_SYNC_ENABLE);
}
public String getMetaSyncClasses() {
return getString(META_SYNC_CLASSES);
}
public static class Builder {
protected final KafkaConnectConfigs connectConfigs = new KafkaConnectConfigs();
public Builder withBootstrapServers(String bootstrapServers) {
connectConfigs.setValue(KAFKA_BOOTSTRAP_SERVERS, bootstrapServers);
return this;
}
public Builder withControlTopicName(String controlTopicName) {
connectConfigs.setValue(CONTROL_TOPIC_NAME, controlTopicName);
return this;
}
public Builder withCommitIntervalSecs(Long commitIntervalSecs) {
connectConfigs.setValue(COMMIT_INTERVAL_SECS, String.valueOf(commitIntervalSecs));
return this;
}
public Builder withCoordinatorWriteTimeoutSecs(Long coordinatorWriteTimeoutSecs) {
connectConfigs.setValue(COORDINATOR_WRITE_TIMEOUT_SECS, String.valueOf(coordinatorWriteTimeoutSecs));
return this;
}
// Kafka Connect tasks are passed their props as a Map<>
public Builder withProperties(Map<?, ?> properties) {
connectConfigs.getProps().putAll(properties);
return this;
}
public Builder withProperties(Properties properties) {
connectConfigs.getProps().putAll(properties);
return this;
}
protected void setDefaults() {
// Check for mandatory properties
connectConfigs.setDefaults(KafkaConnectConfigs.class.getName());
}
public KafkaConnectConfigs build() {
setDefaults();
// Build KafkaConnectConfigs at the end
return new KafkaConnectConfigs(connectConfigs.getProps());
}
}
}
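
A hedged sketch of building configs programmatically; the values shown simply restate the defaults above.

KafkaConnectConfigs connectConfigs = KafkaConnectConfigs.newBuilder()
    .withBootstrapServers("localhost:9092")
    .withControlTopicName("hudi-control-topic")
    .withCommitIntervalSecs(60L)
    .withCoordinatorWriteTimeoutSecs(60L)
    .build();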

View File

@@ -0,0 +1,133 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.writers;
import org.apache.hudi.client.HoodieJavaWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieJavaEngineContext;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.connect.transaction.TransactionCoordinator;
import org.apache.hudi.connect.utils.KafkaConnectUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
* Implementation of Transaction service APIs used by
* {@link TransactionCoordinator}
* using {@link HoodieJavaWriteClient}.
*/
public class KafkaConnectTransactionServices implements ConnectTransactionServices {
private static final Logger LOG = LogManager.getLogger(KafkaConnectTransactionServices.class);
private static final String TABLE_FORMAT = "PARQUET";
private final Option<HoodieTableMetaClient> tableMetaClient;
private final Configuration hadoopConf;
private final FileSystem fs;
private final String tableBasePath;
private final String tableName;
private final HoodieEngineContext context;
private final HoodieJavaWriteClient<HoodieAvroPayload> javaClient;
public KafkaConnectTransactionServices(
KafkaConnectConfigs connectConfigs) throws HoodieException {
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
.withProperties(connectConfigs.getProps()).build();
tableBasePath = writeConfig.getBasePath();
tableName = writeConfig.getTableName();
hadoopConf = KafkaConnectUtils.getDefaultHadoopConf();
context = new HoodieJavaEngineContext(hadoopConf);
fs = FSUtils.getFs(tableBasePath, hadoopConf);
try {
KeyGenerator keyGenerator = HoodieAvroKeyGeneratorFactory.createKeyGenerator(
new TypedProperties(connectConfigs.getProps()));
String recordKeyFields = KafkaConnectUtils.getRecordKeyColumns(keyGenerator);
String partitionColumns = KafkaConnectUtils.getPartitionColumns(keyGenerator,
new TypedProperties(connectConfigs.getProps()));
LOG.info(String.format("Setting record key %s and partitionfields %s for table %s",
recordKeyFields,
partitionColumns,
tableBasePath + tableName));
tableMetaClient = Option.of(HoodieTableMetaClient.withPropertyBuilder()
.setTableType(HoodieTableType.COPY_ON_WRITE.name())
.setTableName(tableName)
.setPayloadClassName(HoodieAvroPayload.class.getName())
.setBaseFileFormat(TABLE_FORMAT)
.setRecordKeyFields(recordKeyFields)
.setPartitionFields(partitionColumns)
.setKeyGeneratorClassProp(writeConfig.getKeyGeneratorClass())
.initTable(hadoopConf, tableBasePath));
javaClient = new HoodieJavaWriteClient<>(context, writeConfig);
} catch (Exception exception) {
throw new HoodieException("Fatal error instantiating Hudi Transaction Services ", exception);
}
}
public String startCommit() {
String newCommitTime = javaClient.startCommit();
javaClient.transitionInflight(newCommitTime);
LOG.info("Starting Hudi commit " + newCommitTime);
return newCommitTime;
}
public void endCommit(String commitTime, List<WriteStatus> writeStatuses, Map<String, String> extraMetadata) {
javaClient.commit(commitTime, writeStatuses, Option.of(extraMetadata),
HoodieActiveTimeline.COMMIT_ACTION, Collections.emptyMap());
LOG.info("Ending Hudi commit " + commitTime);
}
public Map<String, String> fetchLatestExtraCommitMetadata() {
if (tableMetaClient.isPresent()) {
Option<HoodieCommitMetadata> metadata = KafkaConnectUtils.getCommitMetadataForLatestInstant(tableMetaClient.get());
if (metadata.isPresent()) {
return metadata.get().getExtraMetadata();
} else {
LOG.info("Hoodie Extra Metadata from latest commit is absent");
return Collections.emptyMap();
}
}
throw new HoodieException("Fatal error retrieving Hoodie Extra Metadata since Table Meta Client is absent");
}
}
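
To make the transaction flow concrete, a hedged sketch of coordinator-side usage, assuming a populated connectConfigs is in scope; gatherFromParticipants() is a hypothetical helper standing in for write statuses collected over the control topic.

// Sketch only: startCommit/endCommit bracket one transaction round.
KafkaConnectTransactionServices services = new KafkaConnectTransactionServices(connectConfigs);
String commitTime = services.startCommit();                    // open a new Hudi commit
List<WriteStatus> writeStatuses = gatherFromParticipants();    // hypothetical helper
Map<String, String> extraMetadata =
    new HashMap<>(services.fetchLatestExtraCommitMetadata());  // e.g. carried-over Kafka offsets
services.endCommit(commitTime, writeStatuses, extraMetadata);  // seal the commit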

View File

@@ -0,0 +1,105 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect.writers;
import org.apache.hudi.client.HoodieJavaWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieJavaEngineContext;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.connect.KafkaConnectFileIdPrefixProvider;
import org.apache.hudi.connect.utils.KafkaConnectUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory;
import org.apache.hudi.schema.SchemaProvider;
import org.apache.hadoop.conf.Configuration;
import org.apache.kafka.common.TopicPartition;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.util.Collections;
/**
* Provides the Hudi Writer for the {@link org.apache.hudi.connect.transaction.TransactionParticipant}
* to write the incoming records to Hudi.
*/
public class KafkaConnectWriterProvider implements ConnectWriterProvider<WriteStatus> {
private static final Logger LOG = LogManager.getLogger(KafkaConnectWriterProvider.class);
private final KafkaConnectConfigs connectConfigs;
private final HoodieEngineContext context;
private final HoodieWriteConfig writeConfig;
private final HoodieJavaWriteClient<HoodieAvroPayload> hudiJavaClient;
private final KeyGenerator keyGenerator;
private final SchemaProvider schemaProvider;
public KafkaConnectWriterProvider(
KafkaConnectConfigs connectConfigs,
TopicPartition partition) throws HoodieException {
this.connectConfigs = connectConfigs;
Configuration hadoopConf = KafkaConnectUtils.getDefaultHadoopConf();
try {
this.schemaProvider = StringUtils.isNullOrEmpty(connectConfigs.getSchemaProviderClass()) ? null
: (SchemaProvider) ReflectionUtils.loadClass(connectConfigs.getSchemaProviderClass(),
new TypedProperties(connectConfigs.getProps()));
this.keyGenerator = HoodieAvroKeyGeneratorFactory.createKeyGenerator(
new TypedProperties(connectConfigs.getProps()));
// Create the write client used to write records into the Hudi table
writeConfig = HoodieWriteConfig.newBuilder()
.withProperties(connectConfigs.getProps())
.withFileIdPrefixProviderClassName(KafkaConnectFileIdPrefixProvider.class.getName())
.withProps(Collections.singletonMap(
KafkaConnectFileIdPrefixProvider.KAFKA_CONNECT_PARTITION_ID,
String.valueOf(partition)))
.withSchema(schemaProvider.getSourceSchema().toString())
.withAutoCommit(false)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build())
.build();
context = new HoodieJavaEngineContext(hadoopConf);
hudiJavaClient = new HoodieJavaWriteClient<>(context, writeConfig);
} catch (Throwable e) {
throw new HoodieException("Fatal error instantiating Hudi Write Provider ", e);
}
}
public AbstractConnectWriter getWriter(String commitTime) {
return new BufferedConnectWriter(
context,
hudiJavaClient,
commitTime,
connectConfigs,
writeConfig,
keyGenerator,
schemaProvider);
}
}
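
A short usage sketch, assuming connectConfigs and an incoming SinkRecord are in scope; the commit time string is a placeholder that would normally arrive in a START_COMMIT control event.

// Sketch: one writer is created per assigned Kafka partition.
KafkaConnectWriterProvider provider =
    new KafkaConnectWriterProvider(connectConfigs, new TopicPartition("hudi-test-topic", 0));
AbstractConnectWriter writer = provider.getWriter("20210910182026"); // placeholder commit time
writer.writeRecord(sinkRecord);              // buffer each incoming record
List<WriteStatus> statuses = writer.close(); // flush buffered records at END_COMMIT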

View File

@@ -0,0 +1,253 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.connect.transaction.ConnectTransactionCoordinator;
import org.apache.hudi.connect.transaction.ControlEvent;
import org.apache.hudi.connect.transaction.TransactionCoordinator;
import org.apache.hudi.connect.transaction.TransactionParticipant;
import org.apache.hudi.connect.writers.KafkaConnectConfigs;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.helper.MockConnectTransactionServices;
import org.apache.hudi.helper.MockKafkaControlAgent;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.connect.sink.SinkRecord;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.mock;
public class TestConnectTransactionCoordinator {
private static final String TOPIC_NAME = "kafka-connect-test-topic";
private static final int NUM_PARTITIONS = 4;
private static final int MAX_COMMIT_ROUNDS = 5;
private static final int TEST_TIMEOUT_SECS = 60;
private KafkaConnectConfigs configs;
private MockParticipant participant;
private MockKafkaControlAgent kafkaControlAgent;
private MockConnectTransactionServices transactionServices;
private CountDownLatch latch;
@BeforeEach
public void setUp() throws Exception {
transactionServices = new MockConnectTransactionServices();
configs = KafkaConnectConfigs.newBuilder()
.withCommitIntervalSecs(1L)
.withCoordinatorWriteTimeoutSecs(1L)
.build();
latch = new CountDownLatch(1);
}
@ParameterizedTest
@EnumSource(value = MockParticipant.TestScenarios.class)
public void testSingleCommitScenario(MockParticipant.TestScenarios scenario) throws InterruptedException {
kafkaControlAgent = new MockKafkaControlAgent();
participant = new MockParticipant(kafkaControlAgent, latch, scenario, MAX_COMMIT_ROUNDS);
participant.start();
// Test the coordinator using the mock participant
TransactionCoordinator coordinator = new ConnectTransactionCoordinator(
configs,
new TopicPartition(TOPIC_NAME, 0),
kafkaControlAgent,
transactionServices,
(bootstrapServers, topicName) -> NUM_PARTITIONS);
coordinator.start();
latch.await(TEST_TIMEOUT_SECS, TimeUnit.SECONDS);
if (latch.getCount() > 0) {
throw new HoodieException("Test timedout resulting in failure");
}
coordinator.stop();
participant.stop();
}
/**
* A mock Transaction Participant that exercises all the test scenarios
* for the coordinator as mentioned in {@link TestScenarios}.
*/
private static class MockParticipant implements TransactionParticipant {
private final MockKafkaControlAgent kafkaControlAgent;
private final TopicPartition partition;
private final CountDownLatch latch;
private final TestScenarios testScenario;
private final int maxNumberCommitRounds;
private final Map<Integer, Long> kafkaOffsetsCommitted;
private ControlEvent.MsgType expectedMsgType;
private int numberCommitRounds;
public MockParticipant(MockKafkaControlAgent kafkaControlAgent,
CountDownLatch latch,
TestScenarios testScenario,
int maxNumberCommitRounds) {
this.kafkaControlAgent = kafkaControlAgent;
this.latch = latch;
this.testScenario = testScenario;
this.maxNumberCommitRounds = maxNumberCommitRounds;
this.partition = new TopicPartition(TOPIC_NAME, (NUM_PARTITIONS - 1));
this.kafkaOffsetsCommitted = new HashMap<>();
expectedMsgType = ControlEvent.MsgType.START_COMMIT;
numberCommitRounds = 0;
}
@Override
public void start() {
kafkaControlAgent.registerTransactionParticipant(this);
}
@Override
public void stop() {
kafkaControlAgent.deregisterTransactionParticipant(this);
}
@Override
public void buffer(SinkRecord record) {
}
@Override
public void processRecords() {
}
@Override
public TopicPartition getPartition() {
return partition;
}
@Override
public void processControlEvent(ControlEvent message) {
assertEquals(message.getSenderType(), ControlEvent.SenderType.COORDINATOR);
assertEquals(message.senderPartition().topic(), partition.topic());
testScenarios(message);
}
@Override
public long getLastKafkaCommittedOffset() {
return 0;
}
private void testScenarios(ControlEvent message) {
assertEquals(expectedMsgType, message.getMsgType());
switch (message.getMsgType()) {
case START_COMMIT:
expectedMsgType = ControlEvent.MsgType.END_COMMIT;
break;
case END_COMMIT:
assertEquals(kafkaOffsetsCommitted, message.getCoordinatorInfo().getGlobalKafkaCommitOffsets());
int numSuccessPartitions;
Map<Integer, Long> kafkaOffsets = new HashMap<>();
List<ControlEvent> controlEvents = new ArrayList<>();
// Prepare the WriteStatuses for all partitions
for (int i = 1; i <= NUM_PARTITIONS; i++) {
try {
long kafkaOffset = (long) (Math.random() * 10000);
kafkaOffsets.put(i, kafkaOffset);
ControlEvent event = successWriteStatus(
message.getCommitTime(),
new TopicPartition(TOPIC_NAME, i),
kafkaOffset);
controlEvents.add(event);
} catch (Exception exception) {
throw new HoodieException("Fatal error sending control event to Coordinator", exception);
}
}
switch (testScenario) {
case ALL_CONNECT_TASKS_SUCCESS:
numSuccessPartitions = NUM_PARTITIONS;
kafkaOffsetsCommitted.putAll(kafkaOffsets);
expectedMsgType = ControlEvent.MsgType.ACK_COMMIT;
break;
case SUBSET_CONNECT_TASKS_FAILED:
numSuccessPartitions = NUM_PARTITIONS / 2;
expectedMsgType = ControlEvent.MsgType.START_COMMIT;
break;
default:
throw new HoodieException("Unknown test scenario " + testScenario);
}
// Send events based on test scenario
for (int i = 0; i < numSuccessPartitions; i++) {
kafkaControlAgent.publishMessage(controlEvents.get(i));
}
break;
case ACK_COMMIT:
if (numberCommitRounds >= maxNumberCommitRounds) {
latch.countDown();
}
expectedMsgType = ControlEvent.MsgType.START_COMMIT;
break;
default:
throw new HoodieException("Illegal control message type " + message.getMsgType());
}
if (message.getMsgType().equals(ControlEvent.MsgType.START_COMMIT)) {
if (numberCommitRounds >= maxNumberCommitRounds) {
latch.countDown();
}
numberCommitRounds++;
expectedMsgType = ControlEvent.MsgType.END_COMMIT;
}
}
public enum TestScenarios {
SUBSET_CONNECT_TASKS_FAILED,
ALL_CONNECT_TASKS_SUCCESS
}
private static ControlEvent successWriteStatus(String commitTime,
TopicPartition partition,
long kafkaOffset) throws Exception {
// Build a WriteStatus marked with successful records to send to the coordinator
WriteStatus writeStatus = new WriteStatus(false, 1.0);
for (int i = 0; i < 1000; i++) {
writeStatus.markSuccess(mock(HoodieRecord.class), Option.empty());
}
return new ControlEvent.Builder(ControlEvent.MsgType.WRITE_STATUS,
ControlEvent.SenderType.PARTICIPANT,
commitTime,
partition)
.setParticipantInfo(new ControlEvent.ParticipantInfo(
Collections.singletonList(writeStatus),
kafkaOffset,
ControlEvent.OutcomeType.WRITE_SUCCESS))
.build();
}
}
}
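
The scenarios above exercise the coordinator's control loop; as a reading aid, a hedged summary of the happy-path message exchange asserted by the mock participant.

// Message flow per commit round (names from ControlEvent.MsgType):
//   COORDINATOR  -> participants : START_COMMIT  (begin buffering into a new commit)
//   COORDINATOR  -> participants : END_COMMIT    (stop and report results)
//   participants -> COORDINATOR  : WRITE_STATUS  (write statuses + committed Kafka offsets)
//   COORDINATOR  -> participants : ACK_COMMIT    (commit finalized; next round may start)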

View File

@@ -0,0 +1,273 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.connect;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.connect.kafka.KafkaControlAgent;
import org.apache.hudi.connect.transaction.ConnectTransactionParticipant;
import org.apache.hudi.connect.transaction.ControlEvent;
import org.apache.hudi.connect.transaction.TransactionCoordinator;
import org.apache.hudi.connect.writers.KafkaConnectConfigs;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.helper.MockKafkaControlAgent;
import org.apache.hudi.helper.TestHudiWriterProvider;
import org.apache.hudi.helper.TestKafkaConnect;
import org.apache.kafka.common.TopicPartition;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
import java.util.Collections;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class TestConnectTransactionParticipant {
private static final String TOPIC_NAME = "kafka-connect-test-topic";
private static final int PARTITION_NUMBER = 4;
private ConnectTransactionParticipant participant;
private MockCoordinator coordinator;
private TopicPartition partition;
private KafkaConnectConfigs configs;
private KafkaControlAgent kafkaControlAgent;
private TestHudiWriterProvider testHudiWriterProvider;
private TestKafkaConnect testKafkaConnect;
@BeforeEach
public void setUp() throws Exception {
partition = new TopicPartition(TOPIC_NAME, PARTITION_NUMBER);
kafkaControlAgent = new MockKafkaControlAgent();
testKafkaConnect = new TestKafkaConnect(partition);
coordinator = new MockCoordinator(kafkaControlAgent);
coordinator.start();
configs = KafkaConnectConfigs.newBuilder()
.build();
initializeParticipant();
}
@ParameterizedTest
@EnumSource(value = CoordinatorFailureTestScenarios.class)
public void testAllCoordinatorFailureScenarios(CoordinatorFailureTestScenarios testScenario) {
int expectedRecordsWritten = 0;
switch (testScenario) {
case REGULAR_SCENARIO:
expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant();
assertTrue(testKafkaConnect.isPaused());
break;
case COORDINATOR_FAILED_AFTER_START_COMMIT:
testKafkaConnect.putRecordsToParticipant();
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT);
testKafkaConnect.putRecordsToParticipant();
// Coordinator Failed
initializeCoordinator();
break;
case COORDINATOR_FAILED_AFTER_END_COMMIT:
testKafkaConnect.putRecordsToParticipant();
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT);
testKafkaConnect.putRecordsToParticipant();
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT);
expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant();
// Coordinator Failed
initializeCoordinator();
break;
default:
throw new HoodieException("Unknown test scenario " + testScenario);
}
// Regular Case or Coordinator Recovery Case
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT);
expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant();
assertTrue(testKafkaConnect.isResumed());
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT);
testKafkaConnect.putRecordsToParticipant();
assertTrue(testKafkaConnect.isPaused());
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT);
testKafkaConnect.putRecordsToParticipant();
assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten);
// Ensure Coordinator and participant are in sync in the kafka offsets
assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset());
participant.stop();
}
@ParameterizedTest
@EnumSource(value = ParticipantFailureTestScenarios.class)
public void testAllParticipantFailureScenarios(ParticipantFailureTestScenarios testScenario) {
int expectedRecordsWritten = 0;
switch (testScenario) {
case FAILURE_BEFORE_START_COMMIT:
testKafkaConnect.putRecordsToParticipant();
// Participant fails
initializeParticipant();
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT);
expectedRecordsWritten += testKafkaConnect.putRecordsToParticipant();
assertTrue(testKafkaConnect.isResumed());
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT);
testKafkaConnect.putRecordsToParticipant();
assertTrue(testKafkaConnect.isPaused());
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT);
testKafkaConnect.putRecordsToParticipant();
assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten);
// Ensure Coordinator and participant are in sync in the kafka offsets
assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset());
break;
case FAILURE_AFTER_START_COMMIT:
testKafkaConnect.putRecordsToParticipant();
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT);
testKafkaConnect.putRecordsToParticipant();
// Participant fails
initializeParticipant();
testKafkaConnect.putRecordsToParticipant();
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT);
testKafkaConnect.putRecordsToParticipant();
assertTrue(testKafkaConnect.isPaused());
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT);
testKafkaConnect.putRecordsToParticipant();
assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten);
// Ensure Coordinator and participant are in sync in the kafka offsets
assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset());
break;
case FAILURE_AFTER_END_COMMIT:
testKafkaConnect.putRecordsToParticipant();
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.START_COMMIT);
testKafkaConnect.putRecordsToParticipant();
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT);
testKafkaConnect.putRecordsToParticipant();
// Participant fails
initializeParticipant();
testKafkaConnect.putRecordsToParticipant();
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.END_COMMIT);
testKafkaConnect.putRecordsToParticipant();
assertTrue(testKafkaConnect.isPaused());
coordinator.sendEventFromCoordinator(ControlEvent.MsgType.ACK_COMMIT);
testKafkaConnect.putRecordsToParticipant();
assertEquals(testHudiWriterProvider.getLatestNumberWrites(), expectedRecordsWritten);
// Ensure Coordinator and participant are in sync in the kafka offsets
assertEquals(participant.getLastKafkaCommittedOffset(), coordinator.getCommittedKafkaOffset());
break;
default:
throw new HoodieException("Unknown test scenario " + testScenario);
}
}
private void initializeParticipant() {
testHudiWriterProvider = new TestHudiWriterProvider();
participant = new ConnectTransactionParticipant(
partition,
kafkaControlAgent,
testKafkaConnect,
testHudiWriterProvider);
testKafkaConnect.setParticipant(participant);
participant.start();
}
private void initializeCoordinator() {
coordinator = new MockCoordinator(kafkaControlAgent);
coordinator.start();
}
private static class MockCoordinator implements TransactionCoordinator {
private static int currentCommitTime = 101;
private final KafkaControlAgent kafkaControlAgent;
private final TopicPartition partition;
private Option<ControlEvent> lastReceivedWriteStatusEvent;
private long committedKafkaOffset;
public MockCoordinator(KafkaControlAgent kafkaControlAgent) {
this.kafkaControlAgent = kafkaControlAgent;
partition = new TopicPartition(TOPIC_NAME, 0);
lastReceivedWriteStatusEvent = Option.empty();
committedKafkaOffset = 0L;
}
public void sendEventFromCoordinator(
ControlEvent.MsgType type) {
try {
if (type.equals(ControlEvent.MsgType.START_COMMIT)) {
++currentCommitTime;
}
kafkaControlAgent.publishMessage(new ControlEvent.Builder(
type,
ControlEvent.SenderType.COORDINATOR,
String.valueOf(currentCommitTime),
partition)
.setCoordinatorInfo(new ControlEvent.CoordinatorInfo(
Collections.singletonMap(PARTITION_NUMBER, committedKafkaOffset)))
.build());
} catch (Exception exception) {
throw new HoodieException("Fatal error sending control event to Participant");
}
}
public Option<ControlEvent> getLastReceivedWriteStatusEvent() {
return lastReceivedWriteStatusEvent;
}
public long getCommittedKafkaOffset() {
return committedKafkaOffset;
}
@Override
public void start() {
kafkaControlAgent.registerTransactionCoordinator(this);
}
@Override
public void stop() {
kafkaControlAgent.deregisterTransactionCoordinator(this);
}
@Override
public TopicPartition getPartition() {
return partition;
}
@Override
public void processControlEvent(ControlEvent message) {
if (message.getMsgType().equals(ControlEvent.MsgType.WRITE_STATUS)) {
lastReceivedWriteStatusEvent = Option.of(message);
assertTrue(message.getParticipantInfo().getKafkaCommitOffset() >= committedKafkaOffset);
committedKafkaOffset = message.getParticipantInfo().getKafkaCommitOffset();
}
}
}
private enum CoordinatorFailureTestScenarios {
REGULAR_SCENARIO,
COORDINATOR_FAILED_AFTER_START_COMMIT,
COORDINATOR_FAILED_AFTER_END_COMMIT,
}
private enum ParticipantFailureTestScenarios {
FAILURE_BEFORE_START_COMMIT,
FAILURE_AFTER_START_COMMIT,
FAILURE_AFTER_END_COMMIT,
}
}

View File

@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.helper;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.connect.writers.ConnectTransactionServices;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
/**
* Helper class for {@link ConnectTransactionServices} to generate
* a unique commit time for testing purposes.
*/
public class MockConnectTransactionServices implements ConnectTransactionServices {
private int commitTime;
public MockConnectTransactionServices() {
commitTime = 100;
}
@Override
public String startCommit() {
commitTime++;
return String.valueOf(commitTime);
}
@Override
public void endCommit(String commitTime, List<WriteStatus> writeStatuses, Map<String, String> extraMetadata) {
assertEquals(String.valueOf(this.commitTime), commitTime);
}
@Override
public Map<String, String> fetchLatestExtraCommitMetadata() {
return new HashMap<>();
}
}

View File

@@ -0,0 +1,87 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.helper;
import org.apache.hudi.connect.kafka.KafkaControlAgent;
import org.apache.hudi.connect.transaction.ControlEvent;
import org.apache.hudi.connect.transaction.TransactionCoordinator;
import org.apache.hudi.connect.transaction.TransactionParticipant;
import org.apache.hudi.exception.HoodieException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* A mock Kafka Control Agent that supports the testing
* of a {@link TransactionCoordinator} with multiple
* instances of {@link TransactionParticipant}.
*/
public class MockKafkaControlAgent implements KafkaControlAgent {
private final Map<String, TransactionCoordinator> coordinators;
private final Map<String, List<TransactionParticipant>> participants;
public MockKafkaControlAgent() {
coordinators = new HashMap<>();
participants = new HashMap<>();
}
@Override
public void registerTransactionCoordinator(TransactionCoordinator coordinator) {
coordinators.put(coordinator.getPartition().topic(), coordinator);
}
@Override
public void registerTransactionParticipant(TransactionParticipant participant) {
if (!participants.containsKey(participant.getPartition().topic())) {
participants.put(participant.getPartition().topic(), new ArrayList<>());
}
participants.get(participant.getPartition().topic()).add(participant);
}
@Override
public void deregisterTransactionCoordinator(TransactionCoordinator coordinator) {
coordinators.remove(coordinator.getPartition().topic());
}
@Override
public void deregisterTransactionParticipant(TransactionParticipant worker) {
if (participants.containsKey(worker.getPartition().topic())) {
participants.get(worker.getPartition().topic()).remove(worker);
}
}
@Override
public void publishMessage(ControlEvent message) {
try {
String topic = message.senderPartition().topic();
if (message.getSenderType().equals(ControlEvent.SenderType.COORDINATOR)) {
for (TransactionParticipant participant : participants.get(topic)) {
participant.processControlEvent(message);
}
} else {
coordinators.get(topic).processControlEvent(message);
}
} catch (Exception exception) {
throw new HoodieException("Fatal error trying to relay Kafka Control Messages for Testing.");
}
}
}
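
A brief sketch of how tests wire components through this mock, assuming coordinator and participant are test objects constructed against the same agent instance.

// In-memory relay: no Kafka broker involved.
MockKafkaControlAgent agent = new MockKafkaControlAgent();
coordinator.start();   // internally calls agent.registerTransactionCoordinator(this)
participant.start();   // internally calls agent.registerTransactionParticipant(this)
agent.publishMessage(controlEvent); // delivered synchronously to the registered side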

View File

@@ -0,0 +1,83 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.helper;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.connect.writers.ConnectWriter;
import org.apache.hudi.connect.writers.ConnectWriterProvider;
import org.apache.kafka.connect.sink.SinkRecord;
import java.util.List;
/**
* Helper class that provides a Hudi writer and
* maintains stats that are used for test validation.
*/
public class TestHudiWriterProvider implements ConnectWriterProvider<WriteStatus> {
private TestHudiWriter currentWriter;
public TestHudiWriterProvider() {
}
public int getLatestNumberWrites() {
return (currentWriter != null) ? currentWriter.numberRecords : 0;
}
public boolean isClosed() {
return currentWriter == null || currentWriter.isClosed;
}
@Override
public ConnectWriter<WriteStatus> getWriter(String commitTime) {
currentWriter = new TestHudiWriter();
return currentWriter;
}
private static class TestHudiWriter implements ConnectWriter<WriteStatus> {
private int numberRecords;
private boolean isClosed;
public TestHudiWriter() {
this.numberRecords = 0;
this.isClosed = false;
}
public int getNumberRecords() {
return numberRecords;
}
public boolean isClosed() {
return isClosed;
}
@Override
public void writeRecord(SinkRecord record) {
numberRecords++;
}
@Override
public List<WriteStatus> close() {
isClosed = true;
return null;
}
}
}

View File

@@ -0,0 +1,133 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.helper;
import org.apache.hudi.connect.transaction.TransactionParticipant;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.sink.SinkRecord;
import org.apache.kafka.connect.sink.SinkTaskContext;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
/**
* Helper class that emulates the Kafka Connect framework and additionally
* implements {@link SinkTaskContext} for testing purposes.
*/
public class TestKafkaConnect implements SinkTaskContext {
private static final int NUM_RECORDS_BATCH = 5;
private final TopicPartition testPartition;
private TransactionParticipant participant;
private long currentKafkaOffset;
private boolean isPaused;
public TestKafkaConnect(TopicPartition testPartition) {
this.testPartition = testPartition;
isPaused = false;
currentKafkaOffset = 0L;
}
public void setParticipant(TransactionParticipant participant) {
this.participant = participant;
}
public boolean isPaused() {
return isPaused;
}
public boolean isResumed() {
return !isPaused;
}
public int putRecordsToParticipant() {
for (int i = 1; i <= NUM_RECORDS_BATCH; i++) {
participant.buffer(getNextKafkaRecord());
}
participant.processRecords();
return NUM_RECORDS_BATCH;
}
public SinkRecord getNextKafkaRecord() {
return new SinkRecord(testPartition.topic(),
testPartition.partition(),
Schema.OPTIONAL_BYTES_SCHEMA,
("key-" + currentKafkaOffset).getBytes(),
Schema.OPTIONAL_BYTES_SCHEMA,
"value".getBytes(), currentKafkaOffset++);
}
public long getCurrentKafkaOffset() {
return currentKafkaOffset;
}
@Override
public void pause(TopicPartition... partitions) {
if (Arrays.stream(partitions).allMatch(testPartition::equals)) {
isPaused = true;
}
}
@Override
public void resume(TopicPartition... partitions) {
if (Arrays.stream(partitions).allMatch(testPartition::equals)) {
isPaused = false;
}
}
@Override
public void offset(Map<TopicPartition, Long> offsets) {
for (TopicPartition tp : offsets.keySet()) {
if (tp.equals(testPartition)) {
currentKafkaOffset = offsets.get(tp);
}
}
}
@Override
public void offset(TopicPartition tp, long offset) {
if (tp.equals(testPartition)) {
currentKafkaOffset = offset;
}
}
@Override
public Map<String, String> configs() {
return null;
}
@Override
public void timeout(long timeoutMs) {
}
@Override
public Set<TopicPartition> assignment() {
return null;
}
@Override
public void requestCommit() {
}
}

View File

@@ -0,0 +1,203 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.writers;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.SchemaTestUtil;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.connect.writers.AbstractConnectWriter;
import org.apache.hudi.connect.writers.KafkaConnectConfigs;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.schema.SchemaProvider;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.io.DecoderFactory;
import org.apache.kafka.connect.sink.SinkRecord;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.assertEquals;
public class TestAbstractConnectWriter {
private static final String TOPIC_NAME = "kafka-connect-test-topic";
private static final int PARTITION_NUMBER = 4;
private static final int NUM_RECORDS = 10;
private static final int RECORD_KEY_INDEX = 0;
private KafkaConnectConfigs configs;
private TestKeyGenerator keyGenerator;
private SchemaProvider schemaProvider;
private long currentKafkaOffset;
@BeforeEach
public void setUp() throws Exception {
keyGenerator = new TestKeyGenerator(new TypedProperties());
schemaProvider = new TestSchemaProvider();
}
@ParameterizedTest
@EnumSource(value = TestInputFormats.class)
public void testAbstractWriterForAllFormats(TestInputFormats inputFormats) throws Exception {
Schema schema = schemaProvider.getSourceSchema();
List<?> inputRecords;
List<HoodieRecord> expectedRecords;
String formatConverter;
switch (inputFormats) {
case JSON_STRING:
formatConverter = AbstractConnectWriter.KAFKA_STRING_CONVERTER;
GenericDatumReader<IndexedRecord> reader = new GenericDatumReader<>(schema, schema);
inputRecords = SchemaTestUtil.generateTestJsonRecords(0, NUM_RECORDS);
expectedRecords = ((List<String>) inputRecords).stream().map(s -> {
try {
return HoodieAvroUtils.rewriteRecord((GenericRecord) reader.read(null, DecoderFactory.get().jsonDecoder(schema, s)),
schema);
} catch (IOException exception) {
throw new HoodieException("Error converting JSON records to AVRO");
}
}).map(p -> convertToHoodieRecords(p, p.get(RECORD_KEY_INDEX).toString(), "000/00/00")).collect(Collectors.toList());
break;
case AVRO:
formatConverter = AbstractConnectWriter.KAFKA_AVRO_CONVERTER;
inputRecords = SchemaTestUtil.generateTestRecords(0, NUM_RECORDS);
expectedRecords = inputRecords.stream().map(s -> HoodieAvroUtils.rewriteRecord((GenericRecord) s, schema))
.map(p -> convertToHoodieRecords(p, p.get(RECORD_KEY_INDEX).toString(), "000/00/00")).collect(Collectors.toList());
break;
default:
throw new HoodieException("Unknown test scenario " + inputFormats);
}
configs = KafkaConnectConfigs.newBuilder()
.withProperties(
Collections.singletonMap(KafkaConnectConfigs.KAFKA_VALUE_CONVERTER, formatConverter))
.build();
AbstractHudiConnectWriterTestWrapper writer = new AbstractHudiConnectWriterTestWrapper(
configs,
keyGenerator,
schemaProvider);
for (int i = 0; i < NUM_RECORDS; i++) {
writer.writeRecord(getNextKafkaRecord(inputRecords.get(i)));
}
validateRecords(writer.getWrittenRecords(), expectedRecords);
}
private static void validateRecords(List<HoodieRecord> actualRecords, List<HoodieRecord> expectedRecords) {
assertEquals(actualRecords.size(), expectedRecords.size());
actualRecords.sort(Comparator.comparing(HoodieRecord::getRecordKey));
expectedRecords.sort(Comparator.comparing(HoodieRecord::getRecordKey));
// Iterate through both lists and compare the record keys one by one.
Iterator<HoodieRecord> it1 = actualRecords.iterator();
Iterator<HoodieRecord> it2 = expectedRecords.iterator();
while (it1.hasNext()) {
HoodieRecord t1 = it1.next();
HoodieRecord t2 = it2.next();
assertEquals(t1.getRecordKey(), t2.getRecordKey());
}
}
private SinkRecord getNextKafkaRecord(Object record) {
return new SinkRecord(TOPIC_NAME, PARTITION_NUMBER,
org.apache.kafka.connect.data.Schema.OPTIONAL_BYTES_SCHEMA,
("key-" + currentKafkaOffset).getBytes(),
org.apache.kafka.connect.data.Schema.OPTIONAL_BYTES_SCHEMA,
record, currentKafkaOffset++);
}
private static class AbstractHudiConnectWriterTestWrapper extends AbstractConnectWriter {
private List<HoodieRecord> writtenRecords;
public AbstractHudiConnectWriterTestWrapper(KafkaConnectConfigs connectConfigs, KeyGenerator keyGenerator, SchemaProvider schemaProvider) {
super(connectConfigs, keyGenerator, schemaProvider);
writtenRecords = new ArrayList<>();
}
public List<HoodieRecord> getWrittenRecords() {
return writtenRecords;
}
@Override
protected void writeHudiRecord(HoodieRecord<HoodieAvroPayload> record) {
writtenRecords.add(record);
}
@Override
protected List<WriteStatus> flushHudiRecords() {
return null;
}
}
private static HoodieRecord convertToHoodieRecords(IndexedRecord iRecord, String key, String partitionPath) {
return new HoodieRecord<>(new HoodieKey(key, partitionPath),
new HoodieAvroPayload(Option.of((GenericRecord) iRecord)));
}
private enum TestInputFormats {
AVRO,
JSON_STRING
}
static class TestKeyGenerator extends KeyGenerator {
protected TestKeyGenerator(TypedProperties config) {
super(config);
}
@Override
public HoodieKey getKey(GenericRecord record) {
return new HoodieKey(record.get(RECORD_KEY_INDEX).toString(), "000/00/00");
}
}
static class TestSchemaProvider extends SchemaProvider {
@Override
public Schema getSourceSchema() {
try {
return SchemaTestUtil.getSimpleSchema();
} catch (IOException exception) {
throw new HoodieException("Fatal error parsing schema", exception);
}
}
}
}

View File

@@ -0,0 +1,101 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.writers;
import org.apache.hudi.client.HoodieJavaWriteClient;
import org.apache.hudi.client.common.HoodieJavaEngineContext;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.connect.writers.BufferedConnectWriter;
import org.apache.hudi.connect.writers.KafkaConnectConfigs;
import org.apache.hudi.schema.SchemaProvider;
import org.apache.hadoop.conf.Configuration;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.ArgumentCaptor;
import org.mockito.Mockito;
import java.util.Comparator;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.ArgumentMatchers.anyList;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
public class TestBufferedConnectWriter {
private static final int NUM_RECORDS = 10;
private static final String COMMIT_TIME = "101";
private HoodieJavaWriteClient mockHoodieJavaWriteClient;
private HoodieJavaEngineContext javaEngineContext;
private KafkaConnectConfigs configs;
private HoodieWriteConfig writeConfig;
private SchemaProvider schemaProvider;
@BeforeEach
public void setUp() throws Exception {
mockHoodieJavaWriteClient = mock(HoodieJavaWriteClient.class);
Configuration hadoopConf = new Configuration();
javaEngineContext = new HoodieJavaEngineContext(hadoopConf);
configs = KafkaConnectConfigs.newBuilder().build();
schemaProvider = new TestAbstractConnectWriter.TestSchemaProvider();
writeConfig = HoodieWriteConfig.newBuilder()
.withPath("/tmp")
.withSchema(schemaProvider.getSourceSchema().toString())
.build();
}
@Test
public void testSimpleWriteAndFlush() throws Exception {
String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] {partitionPath});
List<HoodieRecord> records = dataGen.generateInserts(COMMIT_TIME, NUM_RECORDS);
BufferedConnectWriter writer = new BufferedConnectWriter(
javaEngineContext,
mockHoodieJavaWriteClient,
COMMIT_TIME,
configs,
writeConfig,
null,
schemaProvider);
for (int i = 0; i < NUM_RECORDS; i++) {
writer.writeHudiRecord(records.get(i));
}
Mockito.verify(mockHoodieJavaWriteClient, times(0))
.bulkInsertPreppedRecords(anyList(), eq(COMMIT_TIME), eq(Option.empty()));
writer.flushHudiRecords();
final ArgumentCaptor<List<HoodieRecord>> actualRecords = ArgumentCaptor.forClass(List.class);
Mockito.verify(mockHoodieJavaWriteClient, times(1))
.bulkInsertPreppedRecords(actualRecords.capture(), eq(COMMIT_TIME), eq(Option.empty()));
actualRecords.getValue().sort(Comparator.comparing(HoodieRecord::getRecordKey));
records.sort(Comparator.comparing(HoodieRecord::getRecordKey));
assertEquals(records, actualRecords.getValue());
}
}

View File

@@ -0,0 +1,32 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache=INFO
log4j.logger.org.apache.hudi=DEBUG
log4j.logger.org.apache.hadoop.hbase=ERROR
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL

View File

@@ -352,6 +352,7 @@ public class DeltaSync implements Serializable {
}
}
} else {
// initialize the table for the first time.
String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props);
HoodieTableMetaClient.withPropertyBuilder()
.setTableType(cfg.tableType)

View File

@@ -29,7 +29,8 @@ import org.apache.spark.api.java.JavaSparkContext;
import java.io.Serializable;
/**
* Class to provide schema for reading data and also writing into a Hoodie table.
* Class to provide schema for reading data and also writing into a Hoodie table,
* used by deltastreamer (runs over Spark).
*/
@PublicAPIClass(maturity = ApiMaturityLevel.STABLE)
public abstract class SchemaProvider implements Serializable {

View File

@@ -0,0 +1,186 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hudi</artifactId>
<groupId>org.apache.hudi</groupId>
<version>0.10.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hudi-kafka-connect-bundle</artifactId>
<packaging>jar</packaging>
<properties>
<checkstyle.skip>true</checkstyle.skip>
<main.basedir>${project.parent.basedir}</main.basedir>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>${maven-shade-plugin.version}</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<createSourcesJar>${shadeSources}</createSourcesJar>
<dependencyReducedPomLocation>${project.build.directory}/dependency-reduced-pom.xml
</dependencyReducedPomLocation>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer">
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer">
<addHeader>true</addHeader>
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
</transformer>
</transformers>
<artifactSet>
<excludes>
<exclude>com.amazonaws.*</exclude>
<exclude>org.apache.zookeeper:zookeeper</exclude>
<exclude>com.fasterxml.jackson.core:jackson-annotations</exclude>
<exclude>commons-httpclient:commons-httpclient</exclude>
<exclude>org.apache.htrace:htrace-core</exclude>
<exclude>org.jamon:jamon-runtime</exclude>
<exclude>jdk.tools:jdk.tools</exclude>
<exclude>junit:junit</exclude>
</excludes>
</artifactSet>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/services/javax.*</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
<resource>
<directory>src/test/resources</directory>
</resource>
</resources>
</build>
<dependencies>
<!-- Hudi -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-kafka-connect</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-java-client</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Avro/ Parquet -->
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<version>${avro.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
<scope>compile</scope>
</dependency>
<!-- Hadoop -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>compile</scope>
<exclusions>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet.jsp</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-auth</artifactId>
<version>${hadoop.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
</project>

View File

@@ -0,0 +1,36 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.kafka.connect.bundle;
import org.apache.hudi.common.util.ReflectionUtils;
/**
* A simple main class that dumps all classes loaded in the current classpath.
*
* This is a workaround for generating sources and javadoc jars for packaging modules. The maven plugins for
* generating javadoc and sources do not produce the corresponding jars if there are no source files.
*
* This class does not have anything to do with Hudi but exists to keep the mvn javadoc/sources plugins happy.
*/
public class Main {
public static void main(String[] args) {
ReflectionUtils.getTopLevelClassesInClasspath(Main.class).forEach(System.out::println);
}
}

View File

@@ -54,7 +54,9 @@
<module>packaging/hudi-integ-test-bundle</module>
<module>hudi-examples</module>
<module>hudi-flink</module>
<module>hudi-kafka-connect</module>
<module>packaging/hudi-flink-bundle</module>
<module>packaging/hudi-kafka-connect-bundle</module>
</modules>
<licenses>