1
0

[HUDI-3563] Make quickstart examples covered by CI tests (#5082)

This commit is contained in:
ForwardXu
2022-03-25 16:37:17 +08:00
committed by GitHub
parent f20c9867d7
commit e5c3f9089b
38 changed files with 2980 additions and 225 deletions

View File

@@ -0,0 +1,109 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Module descriptor for hudi-examples-common: code shared by the quickstart example modules. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- groupId and version are inherited from the hudi-examples aggregator module. -->
<parent>
<artifactId>hudi-examples</artifactId>
<groupId>org.apache.hudi</groupId>
<version>0.11.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hudi-examples-common</artifactId>
<properties>
<main.basedir>${project.parent.basedir}</main.basedir>
<!-- example code is exempt from checkstyle -->
<checkstyle.skip>true</checkstyle.skip>
</properties>
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
<plugins>
<!-- Registers Scala sources as a source root and compiles them ahead of Java (process-resources phase). -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Packages a test-jar at test-compile; <skip>false</skip> makes sure the jar plugin runs for this module. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
<phase>test-compile</phase>
</execution>
</executions>
<configuration>
<skip>false</skip>
</configuration>
</plugin>
<!-- Apache RAT license-header audit. -->
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
<version>${project.version}</version>
</dependency>
<!-- Avro -->
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
</dependency>
</dependencies>
</project>

View File

@@ -43,7 +43,6 @@ import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* Class to be used to generate test data.
*/
@@ -63,7 +62,7 @@ public class HoodieExampleDataGenerator<T extends HoodieRecordPayload<T>> {
+ "{\"name\":\"fare\",\"type\": \"double\"}]}";
public static Schema avroSchema = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA);
private static Random rand = new Random(46474747);
private static final Random rand = new Random(46474747);
private final Map<Integer, KeyPartition> existingKeys;
private final String[] partitionPaths;

View File

@@ -0,0 +1,364 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Module descriptor for hudi-examples-flink: the Flink quickstart examples and their tests. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- groupId and version are inherited from the hudi-examples aggregator module. -->
<parent>
<artifactId>hudi-examples</artifactId>
<groupId>org.apache.hudi</groupId>
<version>0.11.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hudi-examples-flink</artifactId>
<properties>
<main.basedir>${project.parent.basedir}</main.basedir>
<!-- example code is exempt from checkstyle -->
<checkstyle.skip>true</checkstyle.skip>
<!-- Parquet version used by the parquet-hadoop and parquet-avro dependencies below. -->
<parquet.version>1.11.1</parquet.version>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.jacoco</groupId>
<artifactId>jacoco-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<!-- NOTE(review): plugin version 3.1.2 is pinned here while the sibling hudi-examples-common
module omits it (inheriting from plugin management) - confirm the pin is intentional. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.1.2</version>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Apache RAT license-header audit. -->
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>
</plugins>
<!-- NOTE(review): src/test/resources is declared as a *main* resource directory here,
which bundles test fixtures into the main artifact - verify this is intended. -->
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
<resource>
<directory>src/test/resources</directory>
</resource>
</resources>
</build>
<dependencies>
<!-- Hoodie -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-flink-client</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hive-sync</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-sync-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-flink</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
<!-- Flink -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<scope>compile</scope>
</dependency>
<!-- kryo/minlog are excluded here; they otherwise clash with the versions pulled in transitively. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<scope>compile</scope>
<exclusions>
<exclusion>
<groupId>com.esotericsoftware.kryo</groupId>
<artifactId>kryo</artifactId>
</exclusion>
<exclusion>
<groupId>com.esotericsoftware.minlog</groupId>
<artifactId>minlog</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>${kafka.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-hadoop-compatibility_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-parquet_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-json</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-runtime_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>${parquet.version}</version>
<exclusions>
<exclusion>
<groupId>org.xerial.snappy</groupId>
<artifactId>snappy-java</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Avro -->
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
<!-- Override the version to be same with Flink avro -->
<version>1.10.0</version>
<scope>compile</scope>
</dependency>
<!-- Hadoop -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<scope>compile</scope>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.twitter</groupId>
<artifactId>bijection-avro_${scala.binary.version}</artifactId>
<version>0.9.7</version>
</dependency>
<!-- NOTE(review): joda-time 2.5 is an old pin - presumably required by a transitive
consumer (e.g. hive-exec); verify before upgrading. -->
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.5</version>
</dependency>
<!-- Hive -->
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
<classifier>${hive.exec.classifier}</classifier>
<exclusions>
<exclusion>
<groupId>javax.mail</groupId>
<artifactId>mail</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty.aggregate</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Test dependencies -->
<!-- Junit 5 dependencies -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<scope>test</scope>
</dependency>
<!-- Hoodie dependencies -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client-common</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-flink-client</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.apache.hudi</groupId>-->
<!-- <artifactId>hudi-flink_${scala.binary.version}</artifactId>-->
<!-- <version>${project.version}</version>-->
<!-- <scope>test</scope>-->
<!-- </dependency>-->
<!-- Flink dependencies -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-test-utils_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime</artifactId>
<version>${flink.version}</version>
<scope>test</scope>
<type>test-jar</type>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>test</scope>
<type>test-jar</type>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-runtime_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>test</scope>
<type>test-jar</type>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-json</artifactId>
<version>${flink.version}</version>
<scope>test</scope>
<type>test-jar</type>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-csv</artifactId>
<version>${flink.version}</version>
<scope>test</scope>
</dependency>
<!-- Parquet Test-->
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
<version>${parquet.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@@ -0,0 +1,211 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.examples.quickstart;

import static org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations.sql;

import java.util.Collection;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.execution.JobClient;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.api.config.ExecutionConfigOptions;
import org.apache.flink.table.api.internal.TableEnvironmentImpl;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.flink.table.catalog.exceptions.TableNotExistException;
import org.apache.flink.types.Row;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory;
import org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations;
import org.jetbrains.annotations.NotNull;

/**
 * Quickstart that drives a complete Hudi-on-Flink round trip with Flink SQL:
 * create a file source, create a Hudi table, then insert, query and update data.
 */
public final class HoodieFlinkQuickstart {

  private EnvironmentSettings settings = null;
  private TableEnvironment streamTableEnv = null;
  // name of the Hudi table created by createHudiTable, used by the insert/query/update steps
  private String tableName;

  private HoodieFlinkQuickstart() {
  }

  /**
   * Returns a quickstart instance. Note: a new instance is created on every call.
   */
  public static HoodieFlinkQuickstart instance() {
    return new HoodieFlinkQuickstart();
  }

  /**
   * Entry point.
   *
   * @param args tablePath, tableName and tableType (a {@link HoodieTableType} name)
   */
  public static void main(String[] args) throws TableNotExistException, InterruptedException {
    if (args.length < 3) {
      // fixed: the usage line previously printed the wrong class name (HoodieWriteClientExample)
      System.err.println("Usage: HoodieFlinkQuickstart <tablePath> <tableName> <tableType>");
      System.exit(1);
    }
    String tablePath = args[0];
    String tableName = args[1];
    String tableType = args[2];

    HoodieFlinkQuickstart flinkQuickstart = instance();
    flinkQuickstart.initEnv();

    // create filesystem table named source
    flinkQuickstart.createFileSource();

    // create hudi table
    flinkQuickstart.createHudiTable(tablePath, tableName, HoodieTableType.valueOf(tableType));

    // insert data
    flinkQuickstart.insertData();

    // query data
    flinkQuickstart.queryData();

    // update data
    flinkQuickstart.updateData();
  }

  /**
   * Lazily creates the streaming table environment: parallelism 1, 2s checkpoint
   * interval and restarts disabled. No-op when the environment already exists.
   */
  public void initEnv() {
    if (this.streamTableEnv == null) {
      settings = EnvironmentSettings.newInstance().build();
      // use a distinct local name so the field is not shadowed
      TableEnvironment tableEnv = TableEnvironmentImpl.create(settings);
      tableEnv.getConfig().getConfiguration()
          .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1);
      Configuration execConf = tableEnv.getConfig().getConfiguration();
      execConf.setString("execution.checkpointing.interval", "2s");
      // configure not to retry after failure
      execConf.setString("restart-strategy", "fixed-delay");
      execConf.setString("restart-strategy.fixed-delay.attempts", "0");
      this.streamTableEnv = tableEnv;
    }
  }

  public TableEnvironment getStreamTableEnv() {
    return streamTableEnv;
  }

  /**
   * Builds a fresh batch-mode table environment with parallelism 1.
   */
  public TableEnvironment getBatchTableEnv() {
    Configuration conf = new Configuration();
    // for batch upsert use cases: current suggestion is to disable these 2 options,
    // from 1.14, flink runtime execution mode has switched from streaming
    // to batch for batch execution mode(before that, both streaming and batch use streaming execution mode),
    // current batch execution mode has these limitations:
    //
    // 1. the keyed stream default to always sort the inputs by key;
    // 2. the batch state-backend requires the inputs sort by state key
    //
    // For our hudi batch pipeline upsert case, we rely on the consuming sequence for index records and data records,
    // the index records must be loaded first before data records for BucketAssignFunction to keep upsert semantics correct,
    // so we suggest disabling these 2 options to use streaming state-backend for batch execution mode
    // to keep the strategy before 1.14.
    conf.setBoolean("execution.sorted-inputs.enabled", false);
    conf.setBoolean("execution.batch-state-backend.enabled", false);
    StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(conf);
    settings = EnvironmentSettings.newInstance().inBatchMode().build();
    TableEnvironment batchTableEnv = StreamTableEnvironment.create(execEnv, settings);
    batchTableEnv.getConfig().getConfiguration()
        .setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1);
    return batchTableEnv;
  }

  /**
   * Creates the Hudi table via DDL and remembers its name for later steps.
   *
   * @param tablePath base path of the Hudi table
   * @param tableName table name registered in the catalog
   * @param tableType COPY_ON_WRITE or MERGE_ON_READ
   */
  public void createHudiTable(String tablePath, String tableName,
                              HoodieTableType tableType) {
    this.tableName = tableName;
    // create hudi table (streaming read enabled so queryData can tail the commits)
    String hoodieTableDDL = sql(tableName)
        .option(FlinkOptions.PATH, tablePath)
        .option(FlinkOptions.READ_AS_STREAMING, true)
        .option(FlinkOptions.TABLE_TYPE, tableType)
        .end();
    streamTableEnv.executeSql(hoodieTableDDL);
  }

  /**
   * Registers the filesystem table named "source" that feeds the inserts.
   */
  public void createFileSource() {
    String createSource = QuickstartConfigurations.getFileSourceDDL("source");
    streamTableEnv.executeSql(createSource);
  }

  /**
   * Inserts the source data into the Hudi table and returns the rows read back.
   */
  @NotNull List<Row> insertData() throws InterruptedException, TableNotExistException {
    String insertInto = String.format("insert into %s select * from source", tableName);
    execInsertSql(streamTableEnv, insertInto);
    return queryData();
  }

  /**
   * Queries the Hudi table, reading from the latest commit instant.
   */
  List<Row> queryData() throws InterruptedException, TableNotExistException {
    return execSelectSql(streamTableEnv, String.format("select * from %s", tableName), 10);
  }

  /**
   * Re-inserts the source data (an upsert on the same keys) and returns the rows read back.
   */
  @NotNull List<Row> updateData() throws InterruptedException, TableNotExistException {
    String insertInto = String.format("insert into %s select * from source", tableName);
    execInsertSql(getStreamTableEnv(), insertInto);
    return queryData();
  }

  /**
   * Executes an INSERT statement and blocks until the job completes.
   * Execution failures are deliberately best-effort for this example;
   * an interrupt is no longer swallowed but re-flagged on the thread.
   */
  public static void execInsertSql(TableEnvironment tEnv, String insert) {
    TableResult tableResult = tEnv.executeSql(insert);
    // wait to finish
    try {
      tableResult.getJobClient().get().getJobExecutionResult().get();
    } catch (InterruptedException ex) {
      // fixed: restore the interrupt status instead of silently discarding it
      Thread.currentThread().interrupt();
    } catch (ExecutionException ex) {
      // ignored: the example proceeds and queries the table afterwards
    }
  }

  public static List<Row> execSelectSql(TableEnvironment tEnv, String select, long timeout)
      throws InterruptedException, TableNotExistException {
    return execSelectSql(tEnv, select, timeout, null);
  }

  /**
   * Runs a SELECT through a "collect" sink for {@code timeout} seconds and returns the rows.
   *
   * @param sourceTable optional table whose schema defines the sink schema; null for the default sink DDL
   */
  public static List<Row> execSelectSql(TableEnvironment tEnv, String select, long timeout, String sourceTable)
      throws InterruptedException, TableNotExistException {
    final String sinkDDL;
    if (sourceTable != null) {
      // use the source table schema as the sink schema if a source table was specified
      ObjectPath objectPath = new ObjectPath(tEnv.getCurrentDatabase(), sourceTable);
      TableSchema schema = tEnv.getCatalog(tEnv.getCurrentCatalog()).get().getTable(objectPath).getSchema();
      sinkDDL = QuickstartConfigurations.getCollectSinkDDL("sink", schema);
    } else {
      sinkDDL = QuickstartConfigurations.getCollectSinkDDL("sink");
    }
    return execSelectSql(tEnv, select, sinkDDL, timeout);
  }

  /**
   * Inserts the SELECT result into a freshly (re)created "sink" table, waits for
   * {@code timeout} seconds, cancels the job, and drains {@link CollectSinkTableFactory#RESULT}.
   */
  public static List<Row> execSelectSql(TableEnvironment tEnv, String select, String sinkDDL, long timeout)
      throws InterruptedException {
    tEnv.executeSql("DROP TABLE IF EXISTS sink");
    tEnv.executeSql(sinkDDL);
    TableResult tableResult = tEnv.executeSql("insert into sink " + select);
    // wait for the timeout then cancels the job
    TimeUnit.SECONDS.sleep(timeout);
    tableResult.getJobClient().ifPresent(JobClient::cancel);
    tEnv.executeSql("DROP TABLE IF EXISTS sink");
    return CollectSinkTableFactory.RESULT.values().stream()
        .flatMap(Collection::stream)
        .collect(Collectors.toList());
  }
}

View File

@@ -0,0 +1,178 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart.factory;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.connector.ChangelogMode;
import org.apache.flink.table.connector.sink.DynamicTableSink;
import org.apache.flink.table.connector.sink.SinkFunctionProvider;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.factories.DynamicTableSinkFactory;
import org.apache.flink.table.factories.FactoryUtil;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.utils.TypeConversions;
import org.apache.flink.types.Row;
import org.apache.flink.types.RowKind;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Factory for CollectTableSink.
*
* <p>Note: The CollectTableSink collects all the data of a table into a global collection {@code RESULT},
* so the tests should executed in single thread and the table name should be the same.
*/
public class CollectSinkTableFactory implements DynamicTableSinkFactory {
public static final String FACTORY_ID = "collect";
// global results to collect and query
// NOTE(review): plain HashMap keyed by subtask index; only initializeState() synchronizes
// its writes - safe under the single-threaded usage described in the class doc, verify otherwise.
public static final Map<Integer, List<Row>> RESULT = new HashMap<>();
// Creates the sink; each new sink starts from an empty global result set.
@Override
public DynamicTableSink createDynamicTableSink(Context context) {
FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context);
helper.validate();
TableSchema schema = context.getCatalogTable().getSchema();
// wipe results of any previous pipeline before this sink starts collecting
RESULT.clear();
return new CollectTableSink(schema, context.getObjectIdentifier().getObjectName());
}
@Override
public String factoryIdentifier() {
return FACTORY_ID;
}
// no required options: the sink is fully driven by the table schema
@Override
public Set<ConfigOption<?>> requiredOptions() {
return Collections.emptySet();
}
@Override
public Set<ConfigOption<?>> optionalOptions() {
return Collections.emptySet();
}
// --------------------------------------------------------------------------------------------
// Table sinks
// --------------------------------------------------------------------------------------------
/**
* Values {@link DynamicTableSink} for testing.
*/
private static class CollectTableSink implements DynamicTableSink {
private final TableSchema schema;
private final String tableName;
private CollectTableSink(
TableSchema schema,
String tableName) {
this.schema = schema;
this.tableName = tableName;
}
// Accepts INSERT, DELETE and UPDATE_AFTER rows (UPDATE_BEFORE is not requested).
@Override
public ChangelogMode getChangelogMode(ChangelogMode requestedMode) {
return ChangelogMode.newBuilder()
.addContainedKind(RowKind.INSERT)
.addContainedKind(RowKind.DELETE)
.addContainedKind(RowKind.UPDATE_AFTER)
.build();
}
@Override
public SinkRuntimeProvider getSinkRuntimeProvider(Context context) {
final DataType rowType = schema.toPhysicalRowDataType();
// legacy RowTypeInfo is needed for the ListStateDescriptor inside CollectSinkFunction
final RowTypeInfo rowTypeInfo = (RowTypeInfo) TypeConversions.fromDataTypeToLegacyInfo(rowType);
DataStructureConverter converter = context.createDataStructureConverter(schema.toPhysicalRowDataType());
return SinkFunctionProvider.of(new CollectSinkFunction(converter, rowTypeInfo));
}
@Override
public DynamicTableSink copy() {
return new CollectTableSink(schema, tableName);
}
@Override
public String asSummaryString() {
return "CollectSink";
}
}
// Checkpointed sink function that appends every row to RESULT under this subtask's index.
static class CollectSinkFunction extends RichSinkFunction<RowData> implements CheckpointedFunction {
private static final long serialVersionUID = 1L;
private final DynamicTableSink.DataStructureConverter converter;
private final RowTypeInfo rowTypeInfo;
// operator state backing the collected rows across checkpoints
protected transient ListState<Row> resultState;
// rows restored (or freshly started) for this subtask; registered in RESULT
protected transient List<Row> localResult;
// index of this subtask, set in initializeState
private int taskID;
protected CollectSinkFunction(DynamicTableSink.DataStructureConverter converter, RowTypeInfo rowTypeInfo) {
this.converter = converter;
this.rowTypeInfo = rowTypeInfo;
}
// Converts the internal RowData to an external Row, preserves its change kind,
// and appends it to this subtask's bucket in the global RESULT map.
@Override
public void invoke(RowData value, Context context) {
Row row = (Row) converter.toExternal(value);
// only checked when assertions are enabled (-ea)
assert row != null;
row.setKind(value.getRowKind());
RESULT.get(taskID).add(row);
}
// Restores previously checkpointed rows (if any) and registers this subtask's
// list in RESULT; the class-level lock guards concurrent map writes across subtasks.
@Override
public void initializeState(FunctionInitializationContext context) throws Exception {
this.resultState = context.getOperatorStateStore().getListState(
new ListStateDescriptor<>("sink-results", rowTypeInfo));
this.localResult = new ArrayList<>();
if (context.isRestored()) {
for (Row value : resultState.get()) {
localResult.add(value);
}
}
this.taskID = getRuntimeContext().getIndexOfThisSubtask();
synchronized (CollectSinkTableFactory.class) {
RESULT.put(taskID, localResult);
}
}
// Replaces the operator state with the rows collected so far for this subtask.
@Override
public void snapshotState(FunctionSnapshotContext context) throws Exception {
resultState.clear();
resultState.addAll(RESULT.get(taskID));
}
}
}

View File

@@ -0,0 +1,72 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.examples.quickstart.factory;

import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.table.api.ValidationException;
import org.apache.flink.table.connector.source.DynamicTableSource;
import org.apache.flink.table.factories.DynamicTableSourceFactory;
import org.apache.flink.table.factories.FactoryUtil;
import java.util.Collections;
import java.util.Set;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.examples.quickstart.source.ContinuousFileSource;

/**
 * Table-source factory that binds the {@code continuous-file-source} connector
 * identifier to {@link ContinuousFileSource}.
 */
public class ContinuousFileSourceFactory implements DynamicTableSourceFactory {
  public static final String FACTORY_ID = "continuous-file-source";

  /** How many checkpoint batches the data set is emitted as. */
  public static final ConfigOption<Integer> CHECKPOINTS = ConfigOptions
      .key("checkpoints")
      .intType()
      .defaultValue(2)
      .withDescription("Number of checkpoints to write the data set as, default 2");

  @Override
  public DynamicTableSource createDynamicTableSource(Context context) {
    final FactoryUtil.TableFactoryHelper factoryHelper = FactoryUtil.createTableFactoryHelper(this, context);
    factoryHelper.validate();

    // the path option is mandatory; fail with a validation error when absent
    final Configuration options = (Configuration) factoryHelper.getOptions();
    final String basePath = options.getOptional(FlinkOptions.PATH)
        .orElseThrow(() -> new ValidationException("Option [path] should be not empty."));
    return new ContinuousFileSource(context.getCatalogTable().getResolvedSchema(), new Path(basePath), options);
  }

  @Override
  public String factoryIdentifier() {
    return FACTORY_ID;
  }

  @Override
  public Set<ConfigOption<?>> requiredOptions() {
    return Collections.singleton(FlinkOptions.PATH);
  }

  @Override
  public Set<ConfigOption<?>> optionalOptions() {
    return Collections.singleton(CHECKPOINTS);
  }
}

View File

@@ -0,0 +1,185 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart.source;
import org.apache.flink.api.common.state.CheckpointListener;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.formats.common.TimestampFormat;
import org.apache.flink.formats.json.JsonRowDataDeserializationSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.table.catalog.ResolvedSchema;
import org.apache.flink.table.connector.ChangelogMode;
import org.apache.flink.table.connector.source.DataStreamScanProvider;
import org.apache.flink.table.connector.source.DynamicTableSource;
import org.apache.flink.table.connector.source.ScanTableSource;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
import org.apache.flink.table.types.logical.RowType;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import static org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory.CHECKPOINTS;
/**
 * A continuous file source that can trigger checkpoints continuously.
 *
 * <p>It loads the data in the specified file and splits the data into a number of checkpoint
 * batches. Say, if you want 4 checkpoints and there are 8 records in the file, the emit
 * strategy is:
 *
 * <pre>
 *   | 2 records | 2 records | 2 records | 2 records |
 *   | cp1       | cp2       | cp3       | cp4       |
 * </pre>
 *
 * <p>If all the data are flushed out, it waits for the next checkpoint to finish and tears
 * down the source.
 */
public class ContinuousFileSource implements ScanTableSource {

  // Schema of the emitted rows; drives the JSON deserializer below.
  private final ResolvedSchema tableSchema;
  // File whose lines are replayed as the stream (one JSON record per line).
  private final Path path;
  // Carries the CHECKPOINTS option: number of batches to split the file into.
  private final Configuration conf;

  public ContinuousFileSource(
      ResolvedSchema tableSchema,
      Path path,
      Configuration conf) {
    this.tableSchema = tableSchema;
    this.path = path;
    this.conf = conf;
  }

  @Override
  public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) {
    return new DataStreamScanProvider() {

      @Override
      public boolean isBounded() {
        // Declared unbounded so the job stays alive across checkpoints.
        return false;
      }

      @Override
      public DataStream<RowData> produceDataStream(StreamExecutionEnvironment execEnv) {
        final RowType rowType = (RowType) tableSchema.toSourceRowDataType().getLogicalType();
        JsonRowDataDeserializationSchema deserializationSchema = new JsonRowDataDeserializationSchema(
            rowType,
            InternalTypeInfo.of(rowType),
            false,
            true,
            TimestampFormat.ISO_8601);
        // Parallelism 1 keeps the per-checkpoint batching deterministic.
        return execEnv.addSource(new BoundedSourceFunction(path, conf.getInteger(CHECKPOINTS)))
            .name("continuous_file_source")
            .setParallelism(1)
            .map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8)),
                InternalTypeInfo.of(rowType));
      }
    };
  }

  @Override
  public ChangelogMode getChangelogMode() {
    return ChangelogMode.insertOnly();
  }

  @Override
  public DynamicTableSource copy() {
    return new ContinuousFileSource(this.tableSchema, this.path, this.conf);
  }

  @Override
  public String asSummaryString() {
    return "ContinuousFileSource";
  }

  /**
   * Source function that partitions the data into the given number of checkpoint batches.
   */
  public static class BoundedSourceFunction implements SourceFunction<String>, CheckpointListener {
    // File whose lines are buffered and replayed.
    private final Path path;
    // Lazily loaded lines of the file; null until the first run() call.
    private List<String> dataBuffer;
    // Number of batches the buffer is split into.
    private final int checkpoints;
    // Count of completed checkpoints, bumped by notifyCheckpointComplete().
    private final AtomicInteger currentCP = new AtomicInteger(0);
    // Cleared by cancel() to stop the emit loop.
    private volatile boolean isRunning = true;

    public BoundedSourceFunction(Path path, int checkpoints) {
      this.path = path;
      this.checkpoints = checkpoints;
    }

    @Override
    public void run(SourceContext<String> context) throws Exception {
      if (this.dataBuffer == null) {
        loadDataBuffer();
      }
      // oldCP tracks how many batches this loop has emitted so far.
      int oldCP = this.currentCP.get();
      boolean finish = false;
      while (isRunning) {
        int batchSize = this.dataBuffer.size() / this.checkpoints;
        int start = batchSize * oldCP;
        // Emit one batch under the checkpoint lock so records and checkpoint
        // barriers do not interleave.
        synchronized (context.getCheckpointLock()) {
          for (int i = start; i < start + batchSize; i++) {
            if (i >= this.dataBuffer.size()) {
              // all data emitted: wait for the next checkpoint and exit
              finish = true;
              break;
            }
            context.collect(this.dataBuffer.get(i));
          }
        }
        oldCP++;
        // Block until at least one more checkpoint completes before emitting
        // the next batch (or finishing).
        while (this.currentCP.get() < oldCP) {
          synchronized (context.getCheckpointLock()) {
            context.getCheckpointLock().wait(10);
          }
        }
        if (finish || !isRunning) {
          return;
        }
      }
    }

    @Override
    public void cancel() {
      this.isRunning = false;
    }

    // Reads all lines of the backing file into memory; wraps IO failures.
    private void loadDataBuffer() {
      try {
        this.dataBuffer = Files.readAllLines(Paths.get(this.path.toUri()));
      } catch (IOException e) {
        throw new RuntimeException("Read file " + this.path + " error", e);
      }
    }

    @Override
    public void notifyCheckpointComplete(long l) {
      this.currentCP.incrementAndGet();
    }
  }
}

View File

@@ -0,0 +1,317 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart.utils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.api.TableSchema;
import org.apache.flink.table.catalog.ResolvedSchema;
import org.apache.flink.table.runtime.typeutils.RowDataSerializer;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.hudi.configuration.FlinkOptions;
import org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory;
import org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory;
import org.apache.hudi.streamer.FlinkStreamerConfig;
/**
 * Configurations for the quickstart tests: shared row types/schemas and helpers
 * that build CREATE TABLE / CREATE CATALOG DDL strings.
 */
public class QuickstartConfigurations {
  private QuickstartConfigurations() {
  }

  /** Default row type: uuid is the record key, ts the precombine field. */
  public static final DataType ROW_DATA_TYPE = DataTypes.ROW(
      DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)), // record key
      DataTypes.FIELD("name", DataTypes.VARCHAR(10)),
      DataTypes.FIELD("age", DataTypes.INT()),
      DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field
      DataTypes.FIELD("partition", DataTypes.VARCHAR(10)))
      .notNull();

  public static final RowType ROW_TYPE = (RowType) ROW_DATA_TYPE.getLogicalType();

  /** Resolved schema matching {@link #ROW_DATA_TYPE}. */
  public static final ResolvedSchema TABLE_SCHEMA = SchemaBuilder.instance()
      .fields(ROW_TYPE.getFieldNames(), ROW_DATA_TYPE.getChildren())
      .build();

  // Per-field summary strings of ROW_TYPE, used as the default column list for DDL.
  private static final List<String> FIELDS = ROW_TYPE.getFields().stream()
      .map(RowType.RowField::asSummaryString).collect(Collectors.toList());

  /** Wider variant of {@link #ROW_DATA_TYPE} with an extra 'salary' column. */
  public static final DataType ROW_DATA_TYPE_WIDER = DataTypes.ROW(
      DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)), // record key
      DataTypes.FIELD("name", DataTypes.VARCHAR(10)),
      DataTypes.FIELD("age", DataTypes.INT()),
      DataTypes.FIELD("salary", DataTypes.DOUBLE()),
      DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field
      DataTypes.FIELD("partition", DataTypes.VARCHAR(10)))
      .notNull();

  public static final RowType ROW_TYPE_WIDER = (RowType) ROW_DATA_TYPE_WIDER.getLogicalType();

  /** Returns a hoodie CREATE TABLE DDL partitioned by 'partition' with the default columns. */
  public static String getCreateHoodieTableDDL(String tableName, Map<String, String> options) {
    return getCreateHoodieTableDDL(tableName, options, true, "partition");
  }

  /** Returns a hoodie CREATE TABLE DDL with the default columns and 'uuid' primary key. */
  public static String getCreateHoodieTableDDL(
      String tableName,
      Map<String, String> options,
      boolean havePartition,
      String partitionField) {
    return getCreateHoodieTableDDL(tableName, FIELDS, options, havePartition, "uuid", partitionField);
  }

  /**
   * Returns a CREATE TABLE DDL statement.
   *
   * @param tableName      Name of the table to create
   * @param fields         Column DDL snippets (name plus type)
   * @param options        Options for the WITH clause; 'hudi' is used when no
   *                       'connector' entry is present
   * @param havePartition  Whether to append a PARTITIONED BY clause
   * @param pkField        Primary key column name
   * @param partitionField Partition column name
   */
  public static String getCreateHoodieTableDDL(
      String tableName,
      List<String> fields,
      Map<String, String> options,
      boolean havePartition,
      String pkField,
      String partitionField) {
    StringBuilder builder = new StringBuilder();
    builder.append("create table ").append(tableName).append("(\n");
    for (String field : fields) {
      builder.append(" ").append(field).append(",\n");
    }
    builder.append(" PRIMARY KEY(").append(pkField).append(") NOT ENFORCED\n")
        .append(")\n");
    if (havePartition) {
      builder.append("PARTITIONED BY (`").append(partitionField).append("`)\n");
    }
    // getOrDefault instead of computeIfAbsent: do not mutate the caller's map.
    final String connector = options.getOrDefault("connector", "hudi");
    builder.append("with (\n"
        + " 'connector' = '").append(connector).append("'");
    // Skip the 'connector' key here: it is already appended above (the previous
    // computeIfAbsent-based code emitted the property twice).
    options.forEach((k, v) -> {
      if (!"connector".equals(k)) {
        builder.append(",\n")
            .append(" '").append(k).append("' = '").append(v).append("'");
      }
    });
    builder.append("\n)");
    return builder.toString();
  }

  /** Returns a CREATE CATALOG DDL for a hudi catalog rooted at {@code catalogPath}. */
  public static String getCreateHudiCatalogDDL(final String catalogName, final String catalogPath) {
    StringBuilder builder = new StringBuilder();
    builder.append("create catalog ").append(catalogName).append(" with (\n");
    builder.append(" 'type' = 'hudi',\n"
        + " 'catalog.path' = '").append(catalogPath).append("'");
    builder.append("\n)");
    return builder.toString();
  }

  /** Returns a continuous file source DDL reading the bundled 'source-file.json'. */
  public static String getFileSourceDDL(String tableName) {
    return getFileSourceDDL(tableName, "source-file.json");
  }

  /** Returns a continuous file source DDL with a custom checkpoints count. */
  public static String getFileSourceDDL(String tableName, int checkpoints) {
    return getFileSourceDDL(tableName, "source-file.json", checkpoints);
  }

  /** Returns a continuous file source DDL reading {@code fileName} with 2 checkpoints. */
  public static String getFileSourceDDL(String tableName, String fileName) {
    return getFileSourceDDL(tableName, fileName, 2);
  }

  /**
   * Returns a DDL creating a {@link ContinuousFileSourceFactory} backed table that
   * replays the classpath resource {@code fileName} over {@code checkpoints} batches.
   */
  public static String getFileSourceDDL(String tableName, String fileName, int checkpoints) {
    String sourcePath = Objects.requireNonNull(Thread.currentThread()
        .getContextClassLoader().getResource(fileName)).toString();
    return "create table " + tableName + "(\n"
        + " uuid varchar(20),\n"
        + " name varchar(10),\n"
        + " age int,\n"
        + " ts timestamp(3),\n"
        + " `partition` varchar(20)\n"
        + ") with (\n"
        + " 'connector' = '" + ContinuousFileSourceFactory.FACTORY_ID + "',\n"
        + " 'path' = '" + sourcePath + "',\n"
        + " 'checkpoints' = '" + checkpoints + "'\n"
        + ")";
  }

  /** Returns a DDL creating a {@link CollectSinkTableFactory} backed sink with the default columns. */
  public static String getCollectSinkDDL(String tableName) {
    return "create table " + tableName + "(\n"
        + " uuid varchar(20),\n"
        + " name varchar(10),\n"
        + " age int,\n"
        + " ts timestamp(3),\n"
        + " `partition` varchar(20)\n"
        + ") with (\n"
        + " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'"
        + ")";
  }

  /** Returns a collect sink DDL whose columns are derived from {@code tableSchema}. */
  public static String getCollectSinkDDL(String tableName, TableSchema tableSchema) {
    final StringBuilder builder = new StringBuilder("create table " + tableName + "(\n");
    String[] fieldNames = tableSchema.getFieldNames();
    DataType[] fieldTypes = tableSchema.getFieldDataTypes();
    for (int i = 0; i < fieldNames.length; i++) {
      builder.append(" `")
          .append(fieldNames[i])
          .append("` ")
          .append(fieldTypes[i].toString());
      if (i != fieldNames.length - 1) {
        builder.append(",");
      }
      builder.append("\n");
    }
    final String withProps = ""
        + ") with (\n"
        + " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'\n"
        + ")";
    builder.append(withProps);
    return builder.toString();
  }

  /** Returns a filesystem CSV source DDL reading the classpath resource {@code fileName}. */
  public static String getCsvSourceDDL(String tableName, String fileName) {
    String sourcePath = Objects.requireNonNull(Thread.currentThread()
        .getContextClassLoader().getResource(fileName)).toString();
    return "create table " + tableName + "(\n"
        + " uuid varchar(20),\n"
        + " name varchar(10),\n"
        + " age int,\n"
        + " ts timestamp(3),\n"
        + " `partition` varchar(20)\n"
        + ") with (\n"
        + " 'connector' = 'filesystem',\n"
        + " 'path' = '" + sourcePath + "',\n"
        + " 'format' = 'csv'\n"
        + ")";
  }

  /** Serializer for rows of {@link #ROW_TYPE}. */
  public static final RowDataSerializer SERIALIZER = new RowDataSerializer(ROW_TYPE);

  /**
   * Returns a default Flink configuration for a hoodie table under {@code tablePath},
   * using the bundled 'test_read_schema.avsc' as the source schema.
   */
  public static Configuration getDefaultConf(String tablePath) {
    Configuration conf = new Configuration();
    conf.setString(FlinkOptions.PATH, tablePath);
    conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH,
        Objects.requireNonNull(Thread.currentThread()
            .getContextClassLoader().getResource("test_read_schema.avsc")).toString());
    conf.setString(FlinkOptions.TABLE_NAME, "TestHoodieTable");
    conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "partition");
    return conf;
  }

  /** Returns a default streamer config mirroring {@link #getDefaultConf(String)}. */
  public static FlinkStreamerConfig getDefaultStreamerConf(String tablePath) {
    FlinkStreamerConfig streamerConf = new FlinkStreamerConfig();
    streamerConf.targetBasePath = tablePath;
    streamerConf.sourceAvroSchemaPath = Objects.requireNonNull(Thread.currentThread()
        .getContextClassLoader().getResource("test_read_schema.avsc")).toString();
    streamerConf.targetTableName = "TestHoodieTable";
    streamerConf.partitionPathField = "partition";
    streamerConf.tableType = "COPY_ON_WRITE";
    streamerConf.checkpointInterval = 4000L;
    return streamerConf;
  }

  /**
   * Creates the tool to build hoodie table DDL.
   */
  public static Sql sql(String tableName) {
    return new Sql(tableName);
  }

  /** Creates the tool to build hudi catalog DDL. */
  public static Catalog catalog(String catalogName) {
    return new Catalog(catalogName);
  }

  // -------------------------------------------------------------------------
  //  Utilities
  // -------------------------------------------------------------------------

  /**
   * Tool to build hoodie table DDL with schema {@link #TABLE_SCHEMA}.
   */
  public static class Sql {
    private final Map<String, String> options;
    private final String tableName;
    // Column DDL snippets; falls back to the default FIELDS when left empty.
    private List<String> fields = new ArrayList<>();
    private boolean withPartition = true;
    private String pkField = "uuid";
    private String partitionField = "partition";

    public Sql(String tableName) {
      options = new HashMap<>();
      this.tableName = tableName;
    }

    public Sql option(ConfigOption<?> option, Object val) {
      this.options.put(option.key(), val.toString());
      return this;
    }

    public Sql option(String key, Object val) {
      this.options.put(key, val.toString());
      return this;
    }

    public Sql options(Map<String, String> options) {
      this.options.putAll(options);
      return this;
    }

    public Sql noPartition() {
      this.withPartition = false;
      return this;
    }

    public Sql pkField(String pkField) {
      this.pkField = pkField;
      return this;
    }

    public Sql partitionField(String partitionField) {
      this.partitionField = partitionField;
      return this;
    }

    public Sql field(String fieldSchema) {
      fields.add(fieldSchema);
      return this;
    }

    /** Renders the accumulated state into a CREATE TABLE DDL string. */
    public String end() {
      if (this.fields.isEmpty()) {
        this.fields = FIELDS;
      }
      return QuickstartConfigurations.getCreateHoodieTableDDL(this.tableName, this.fields, options,
          this.withPartition, this.pkField, this.partitionField);
    }
  }

  /** Tool to build hudi catalog DDL. */
  public static class Catalog {
    private final String catalogName;
    private String catalogPath = ".";

    public Catalog(String catalogName) {
      this.catalogName = catalogName;
    }

    public Catalog catalogPath(String catalogPath) {
      this.catalogPath = catalogPath;
      return this;
    }

    /** Renders the accumulated state into a CREATE CATALOG DDL string. */
    public String end() {
      return QuickstartConfigurations.getCreateHudiCatalogDDL(catalogName, catalogPath);
    }
  }
}

View File

@@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart.utils;
import org.apache.flink.table.catalog.Column;
import org.apache.flink.table.catalog.ResolvedSchema;
import org.apache.flink.table.catalog.UniqueConstraint;
import org.apache.flink.table.catalog.WatermarkSpec;
import org.apache.flink.table.types.DataType;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
 * Builder for {@link ResolvedSchema}.
 */
public class SchemaBuilder {
  // Physical columns accumulated so far, in declaration order. Final: assigned once.
  private final List<Column> columns;
  // Watermark specs; currently never populated by this builder.
  private final List<WatermarkSpec> watermarkSpecs;
  // Optional primary key constraint; stays null when primaryKey() is never called.
  private UniqueConstraint constraint;

  /** Creates a new, empty builder. */
  public static SchemaBuilder instance() {
    return new SchemaBuilder();
  }

  private SchemaBuilder() {
    this.columns = new ArrayList<>();
    this.watermarkSpecs = new ArrayList<>();
  }

  /** Appends one physical column with the given name and type. */
  public SchemaBuilder field(String name, DataType type) {
    this.columns.add(Column.physical(name, type));
    return this;
  }

  /**
   * Appends physical columns built pairwise from {@code names} and {@code types};
   * the two lists are expected to have the same size.
   */
  public SchemaBuilder fields(List<String> names, List<DataType> types) {
    // Local renamed from 'columns' to avoid shadowing the field of the same name.
    List<Column> newColumns = IntStream.range(0, names.size())
        .mapToObj(idx -> Column.physical(names.get(idx), types.get(idx)))
        .collect(Collectors.toList());
    this.columns.addAll(newColumns);
    return this;
  }

  /** Sets a primary key constraint named "pk" over the given column names. */
  public SchemaBuilder primaryKey(String... columns) {
    this.constraint = UniqueConstraint.primaryKey("pk", Arrays.asList(columns));
    return this;
  }

  /** Builds the schema; the constraint is null when no primary key was set. */
  public ResolvedSchema build() {
    return new ResolvedSchema(columns, watermarkSpecs, constraint);
  }
}

View File

@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory
org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory

View File

@@ -0,0 +1,8 @@
{"uuid": "id1", "name": "Danny", "age": 23, "ts": "1970-01-01T00:00:01", "partition": "par1"}
{"uuid": "id2", "name": "Stephen", "age": 33, "ts": "1970-01-01T00:00:02", "partition": "par1"}
{"uuid": "id3", "name": "Julian", "age": 53, "ts": "1970-01-01T00:00:03", "partition": "par2"}
{"uuid": "id4", "name": "Fabian", "age": 31, "ts": "1970-01-01T00:00:04", "partition": "par2"}
{"uuid": "id5", "name": "Sophia", "age": 18, "ts": "1970-01-01T00:00:05", "partition": "par3"}
{"uuid": "id6", "name": "Emma", "age": 20, "ts": "1970-01-01T00:00:06", "partition": "par3"}
{"uuid": "id7", "name": "Bob", "age": 44, "ts": "1970-01-01T00:00:07", "partition": "par4"}
{"uuid": "id8", "name": "Han", "age": 56, "ts": "1970-01-01T00:00:08", "partition": "par4"}

View File

@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart;
import static org.apache.hudi.examples.quickstart.TestQuickstartData.assertRowsEquals;
import java.io.File;
import java.util.List;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.test.util.AbstractTestBase;
import org.apache.flink.types.Row;
import org.apache.hudi.common.model.HoodieTableType;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.io.TempDir;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
/**
 * IT cases for Hoodie table source and sink: runs the quickstart flow
 * (create source, create hudi table, insert, query, update) for each table type.
 */
public class TestHoodieFlinkQuickstart extends AbstractTestBase {
  // Shared quickstart driver; its environment is re-initialized before each case.
  private final HoodieFlinkQuickstart flinkQuickstart = HoodieFlinkQuickstart.instance();

  @BeforeEach
  void beforeEach() {
    flinkQuickstart.initEnv();
  }

  // Per-test hudi table base path, cleaned up by JUnit.
  @TempDir
  File tempFile;

  @ParameterizedTest
  @EnumSource(value = HoodieTableType.class)
  void testHoodieFlinkQuickstart(HoodieTableType tableType) throws Exception {
    // create filesystem table named source
    flinkQuickstart.createFileSource();

    // create hudi table
    flinkQuickstart.createHudiTable(tempFile.getAbsolutePath(), "t1", tableType);

    // insert data
    List<Row> rows = flinkQuickstart.insertData();
    assertRowsEquals(rows, TestQuickstartData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT);

    // query data
    List<Row> rows1 = flinkQuickstart.queryData();
    assertRowsEquals(rows1, TestQuickstartData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT);

    // update data
    // NOTE(review): the post-update expectation is identical to the insert snapshot —
    // confirm updateData() is expected to leave the latest-commit view unchanged.
    List<Row> rows2 = flinkQuickstart.updateData();
    assertRowsEquals(rows2, TestQuickstartData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT);
  }
}

View File

@@ -0,0 +1,422 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart;
import static junit.framework.TestCase.assertEquals;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.data.TimestampData;
import org.apache.flink.table.data.binary.BinaryRowData;
import org.apache.flink.table.data.conversion.DataStructureConverter;
import org.apache.flink.table.data.conversion.DataStructureConverters;
import org.apache.flink.table.data.writer.BinaryRowWriter;
import org.apache.flink.table.data.writer.BinaryWriter;
import org.apache.flink.table.runtime.typeutils.InternalSerializers;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.types.Row;
import org.apache.flink.types.RowKind;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.HoodieCommonConfig;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
import org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations;
import org.apache.parquet.Strings;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
/**
* Data set for testing, also some utilities to check the results.
*/
public class TestQuickstartData {
// Five copies of one record (same key "id1", same ts=1).
public static List<RowData> DATA_SET_INSERT_DUPLICATES = new ArrayList<>();
static {
  IntStream.range(0, 5).forEach(i -> DATA_SET_INSERT_DUPLICATES.add(
      insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23,
          TimestampData.fromEpochMillis(1), StringData.fromString("par1"))));
}

// Five records sharing key "id1" with increasing ts values 0..4.
public static List<RowData> DATA_SET_INSERT_SAME_KEY = new ArrayList<>();
static {
  IntStream.range(0, 5).forEach(i -> DATA_SET_INSERT_SAME_KEY.add(
      insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23,
          TimestampData.fromEpochMillis(i), StringData.fromString("par1"))));
}

// Data set of source-file.json latest commit: rows id5..id8 of partitions par3/par4.
public static List<RowData> DATA_SET_SOURCE_INSERT_LATEST_COMMIT = Arrays.asList(
    insertRow(StringData.fromString("id5"), StringData.fromString("Sophia"), 18,
        TimestampData.fromEpochMillis(5000), StringData.fromString("par3")),
    insertRow(StringData.fromString("id6"), StringData.fromString("Emma"), 20,
        TimestampData.fromEpochMillis(6000), StringData.fromString("par3")),
    insertRow(StringData.fromString("id7"), StringData.fromString("Bob"), 44,
        TimestampData.fromEpochMillis(7000), StringData.fromString("par4")),
    insertRow(StringData.fromString("id8"), StringData.fromString("Han"), 56,
        TimestampData.fromEpochMillis(8000), StringData.fromString("par4"))
);

// Changelog for key "id1" whose -U/+U/-D events arrive out of ts order.
public static List<RowData> DATA_SET_DISORDER_UPDATE_DELETE = Arrays.asList(
    // DISORDER UPDATE
    updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 21,
        TimestampData.fromEpochMillis(3), StringData.fromString("par1")),
    updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 20,
        TimestampData.fromEpochMillis(2), StringData.fromString("par1")),
    updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23,
        TimestampData.fromEpochMillis(1), StringData.fromString("par1")),
    updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 20,
        TimestampData.fromEpochMillis(2), StringData.fromString("par1")),
    updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22,
        TimestampData.fromEpochMillis(4), StringData.fromString("par1")),
    updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 21,
        TimestampData.fromEpochMillis(3), StringData.fromString("par1")),
    // DISORDER DELETE
    deleteRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22,
        TimestampData.fromEpochMillis(2), StringData.fromString("par1"))
);
/** Builds one insert row per id: key "id&lt;i&gt;", name "Danny", age 23, ts i, partition par1. */
public static List<RowData> dataSetInsert(int... ids) {
  List<RowData> rows = new ArrayList<>();
  for (int id : ids) {
    rows.add(insertRow(StringData.fromString("id" + id), StringData.fromString("Danny"), 23,
        TimestampData.fromEpochMillis(id), StringData.fromString("par1")));
  }
  return rows;
}
/**
 * Extracts the numeric suffix of an "id&lt;n&gt;" value; returns -1 for null
 * or for values without the "id" prefix.
 */
private static Integer toIdSafely(Object id) {
  if (id == null) {
    return -1;
  }
  final String text = id.toString();
  return text.startsWith("id") ? Integer.parseInt(text.substring(2)) : -1;
}
/**
 * Returns the string form of a list of RowData, sorted by the numeric id
 * extracted from field 0 and rendered through the external data converter.
 */
public static String rowDataToString(List<RowData> rows) {
  final DataStructureConverter<Object, Object> converter =
      DataStructureConverters.getConverter(QuickstartConfigurations.ROW_DATA_TYPE);
  final List<String> rendered = rows.stream()
      .sorted(Comparator.comparing(row -> toIdSafely(row.getString(0))))
      .map(row -> converter.toExternal(row).toString())
      .collect(Collectors.toList());
  return rendered.toString();
}
/** Null-safe toString: renders null as the string "null". */
private static String toStringSafely(Object obj) {
  if (obj == null) {
    return "null";
  }
  return obj.toString();
}
/**
 * Sorts the {@code rows} by the string form of field 0 and asserts the
 * rendered list equals the expected string; change flags are not compared.
 *
 * @param rows     Actual result rows
 * @param expected Expected string of the sorted rows
 */
public static void assertRowsEquals(List<Row> rows, String expected) {
  assertRowsEquals(rows, expected, false);
}
/**
 * Sorts the {@code rows} using field at index 0 and asserts
 * it equals with the expected string {@code expected}.
 *
 * @param rows           Actual result rows
 * @param expected       Expected string of the sorted rows
 * @param withChangeFlag Whether compares with change flags
 */
public static void assertRowsEquals(List<Row> rows, String expected, boolean withChangeFlag) {
  final List<String> rendered = rows.stream()
      .sorted(Comparator.comparing(row -> toStringSafely(row.getField(0))))
      .map(row -> withChangeFlag
          ? row.getKind().shortString() + "(" + row.toString() + ")"
          : row.toString())
      .collect(Collectors.toList());
  assertThat(rendered.toString(), is(expected));
}
/**
 * Sorts the {@code rows} using field at index {@code orderingPos} and asserts
 * it equals with the expected string {@code expected}.
 *
 * @param rows        Actual result rows
 * @param expected    Expected string of the sorted rows
 * @param orderingPos Field position for ordering
 */
public static void assertRowsEquals(List<Row> rows, String expected, int orderingPos) {
  final List<Row> ordered = rows.stream()
      .sorted(Comparator.comparing(row -> toStringSafely(row.getField(orderingPos))))
      .collect(Collectors.toList());
  assertThat(ordered.toString(), is(expected));
}
/**
 * Sorts the {@code rows} using field at index 0 and asserts
 * it equals with the expected row data list {@code expected}.
 *
 * @param rows     Actual result rows
 * @param expected Expected row data list
 */
public static void assertRowsEquals(List<Row> rows, List<RowData> expected) {
  final String actual = rows.stream()
      .sorted(Comparator.comparing(row -> toIdSafely(row.getField(0))))
      .collect(Collectors.toList())
      .toString();
  assertThat(actual, is(rowDataToString(expected)));
}
/**
 * Sorts the {@code rows} using field at index 0 and asserts
 * it equals with the expected string {@code expected}.
 *
 * @param rows     Actual result rows
 * @param expected Expected string of the sorted rows
 */
public static void assertRowDataEquals(List<RowData> rows, String expected) {
  assertThat(rowDataToString(rows), is(expected));
}
/**
 * Sorts the {@code rows} using field at index 0 and asserts
 * it equals with the expected row data list {@code expected}.
 *
 * @param rows     Actual result rows
 * @param expected Expected row data list
 */
public static void assertRowDataEquals(List<RowData> rows, List<RowData> expected) {
  assertThat(rowDataToString(rows), is(rowDataToString(expected)));
}
/**
 * Checks the source data set are written as expected, assuming 4 partitions.
 *
 * <p>Note: Replace it with the Flink reader when it is supported.
 *
 * @param baseFile The file base to check, should be a directory
 * @param expected The expected results mapping, the key should be the partition path
 *                 and value should be values list with the key partition
 */
public static void checkWrittenData(File baseFile, Map<String, String> expected) throws IOException {
  checkWrittenData(baseFile, expected, 4);
}
/**
 * Checks the source data set are written as expected.
 *
 * <p>Note: Replace it with the Flink reader when it is supported.
 *
 * @param baseFile   The file base to check, should be a directory
 * @param expected   The expected results mapping, the key should be the partition path
 *                   and value should be values list with the key partition
 * @param partitions The expected partition number
 */
public static void checkWrittenData(
    File baseFile,
    Map<String, String> expected,
    int partitions) throws IOException {
  assert baseFile.isDirectory();
  // Hidden entries (names starting with '.') are not partition directories/data files.
  FileFilter filter = file -> !file.getName().startsWith(".");
  File[] partitionDirs = baseFile.listFiles(filter);
  assertNotNull(partitionDirs);
  assertThat(partitionDirs.length, is(partitions));
  for (File partitionDir : partitionDirs) {
    File[] dataFiles = partitionDir.listFiles(filter);
    assertNotNull(dataFiles);
    // Pick the data file with the greatest commit time encoded in its name.
    File latestDataFile = Arrays.stream(dataFiles)
        .max(Comparator.comparing(f -> FSUtils.getCommitTime(f.getName())))
        .orElse(dataFiles[0]);
    List<String> readBuffer = new ArrayList<>();
    // try-with-resources: the original never closed the reader, leaking a file handle.
    try (ParquetReader<GenericRecord> reader = AvroParquetReader
        .<GenericRecord>builder(new Path(latestDataFile.getAbsolutePath())).build()) {
      GenericRecord nextRecord = reader.read();
      while (nextRecord != null) {
        readBuffer.add(filterOutVariables(nextRecord));
        nextRecord = reader.read();
      }
    }
    readBuffer.sort(Comparator.naturalOrder());
    assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName())));
  }
}
/**
 * Checks the MERGE_ON_READ source data are written as expected by merging the
 * avro log files of each partition and comparing the rendered records.
 *
 * <p>Note: Replace it with the Flink reader when it is supported.
 *
 * @param fs            The file system
 * @param latestInstant The latest committed instant of current table
 * @param baseFile      The file base to check, should be a directory
 * @param expected      The expected results mapping, the key should be the partition path
 * @param partitions    The expected partition number
 * @param schema        The read schema
 */
public static void checkWrittenDataMOR(
    FileSystem fs,
    String latestInstant,
    File baseFile,
    Map<String, String> expected,
    int partitions,
    Schema schema) {
  assert baseFile.isDirectory() : "Base path should be a directory";
  // Hidden entries (e.g. hoodie metadata dirs) are not partition directories.
  FileFilter partitionFilter = file -> !file.getName().startsWith(".");
  File[] partitionDirs = baseFile.listFiles(partitionFilter);
  assertNotNull(partitionDirs);
  assertThat(partitionDirs.length, is(partitions));
  for (File partitionDir : partitionDirs) {
    // Select only log files; names starting with ".." are excluded —
    // presumably auxiliary artifacts, TODO confirm the naming convention.
    File[] dataFiles = partitionDir.listFiles(file ->
        file.getName().contains(".log.") && !file.getName().startsWith(".."));
    assertNotNull(dataFiles);
    // Merge the log files (sorted by path) up to the latest committed instant.
    HoodieMergedLogRecordScanner scanner = getScanner(
        fs, baseFile.getPath(), Arrays.stream(dataFiles).map(File::getAbsolutePath)
            .sorted(Comparator.naturalOrder()).collect(Collectors.toList()),
        schema, latestInstant);
    List<String> readBuffer = scanner.getRecords().values().stream()
        .map(hoodieRecord -> {
          try {
            // in case it is a delete: a null insert value is filtered out below
            GenericRecord record = (GenericRecord) hoodieRecord.getData()
                .getInsertValue(schema, new Properties())
                .orElse(null);
            return record == null ? (String) null : filterOutVariables(record);
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
        })
        .filter(Objects::nonNull)
        .sorted(Comparator.naturalOrder())
        .collect(Collectors.toList());
    assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName())));
  }
}
/**
 * Returns the scanner to read avro log files.
 *
 * @param fs         The file system
 * @param basePath   The table base path
 * @param logPaths   Absolute paths of the log files to scan, expected pre-sorted
 * @param readSchema The schema used to deserialize the records
 * @param instant    The latest instant time; used as the scan upper bound
 */
private static HoodieMergedLogRecordScanner getScanner(
    FileSystem fs,
    String basePath,
    List<String> logPaths,
    Schema readSchema,
    String instant) {
  return HoodieMergedLogRecordScanner.newBuilder()
      .withFileSystem(fs)
      .withBasePath(basePath)
      .withLogFilePaths(logPaths)
      .withReaderSchema(readSchema)
      .withLatestInstantTime(instant)
      // Eager block reading is fine for the small quickstart data sets.
      .withReadBlocksLazily(false)
      .withReverseReader(false)
      .withBufferSize(16 * 1024 * 1024)
      // Spill merged records to disk once they exceed 1MB of heap.
      .withMaxMemorySizeInBytes(1024 * 1024L)
      .withSpillableMapBasePath("/tmp/")
      .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
      .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
      .build();
}
/**
 * Serializes the stable columns of the record into a comma separated string,
 * filtering out variable columns such as the file name.
 */
private static String filterOutVariables(GenericRecord genericRecord) {
  // Columns are emitted in this fixed order so results are comparable as strings.
  final String[] columns = {
      "_hoodie_record_key", "_hoodie_partition_path",
      "uuid", "name", "age", "ts", "partition"
  };
  List<String> values = new ArrayList<>();
  for (String column : columns) {
    values.add(genericRecord.get(column).toString());
  }
  return Strings.join(values, ",");
}
/**
 * Creates a binary row with {@code RowKind.INSERT} using the default
 * quickstart row type {@code QuickstartConfigurations.ROW_TYPE}.
 */
public static BinaryRowData insertRow(Object... fields) {
  return insertRow(QuickstartConfigurations.ROW_TYPE, fields);
}
/**
 * Creates a binary row with {@code RowKind.INSERT} from the given field values.
 *
 * @param rowType The row type describing the logical type of each field
 * @param fields  The field values; the arity must match {@code rowType}
 * @return the serialized binary row
 */
public static BinaryRowData insertRow(RowType rowType, Object... fields) {
  LogicalType[] types = rowType.getFields().stream().map(RowType.RowField::getType)
      .toArray(LogicalType[]::new);
  // Fix: assertion message typo ("Filed count" -> "Field count").
  assertEquals(
      "Field count inconsistent with type information",
      fields.length,
      types.length);
  BinaryRowData row = new BinaryRowData(fields.length);
  BinaryRowWriter writer = new BinaryRowWriter(row);
  writer.reset();
  for (int i = 0; i < fields.length; i++) {
    Object field = fields[i];
    if (field == null) {
      // Null fields are recorded in the row's null bitmap.
      writer.setNullAt(i);
    } else {
      // Serialize the field with the serializer matching its logical type.
      BinaryWriter.write(writer, i, field, types[i], InternalSerializers.create(types[i]));
    }
  }
  writer.complete();
  return row;
}
/** Builds a binary row whose row kind is {@code DELETE}. */
private static BinaryRowData deleteRow(Object... fields) {
  // Reuse the insert encoding, then flip the row kind.
  final BinaryRowData row = insertRow(fields);
  row.setRowKind(RowKind.DELETE);
  return row;
}
/** Builds a binary row whose row kind is {@code UPDATE_BEFORE}. */
private static BinaryRowData updateBeforeRow(Object... fields) {
  // Reuse the insert encoding, then flip the row kind.
  final BinaryRowData row = insertRow(fields);
  row.setRowKind(RowKind.UPDATE_BEFORE);
  return row;
}
/** Builds a binary row whose row kind is {@code UPDATE_AFTER}. */
private static BinaryRowData updateAfterRow(Object... fields) {
  // Reuse the insert encoding, then flip the row kind.
  final BinaryRowData row = insertRow(fields);
  row.setRowKind(RowKind.UPDATE_AFTER);
  return row;
}
}

View File

@@ -0,0 +1,18 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory
org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory

View File

@@ -0,0 +1,30 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache.hudi=DEBUG
log4j.logger.org.apache.hadoop.hbase=ERROR
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL

View File

@@ -0,0 +1,31 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=INFO, CONSOLE
log4j.logger.org.apache=INFO
log4j.logger.org.apache.hudi=DEBUG
log4j.logger.org.apache.hadoop.hbase=ERROR
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=INFO
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL

View File

@@ -0,0 +1,129 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hudi-examples</artifactId>
<groupId>org.apache.hudi</groupId>
<version>0.11.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hudi-examples-java</artifactId>
<properties>
<main.basedir>${project.parent.basedir}</main.basedir>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>prepare-package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
<overWriteReleases>true</overWriteReleases>
<overWriteSnapshots>true</overWriteSnapshots>
<overWriteIfNewer>true</overWriteIfNewer>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
<phase>test-compile</phase>
</execution>
</executions>
<configuration>
<skip>false</skip>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-examples-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-java-client</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</project>

View File

@@ -0,0 +1,283 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>hudi-examples</artifactId>
<groupId>org.apache.hudi</groupId>
<version>0.11.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>hudi-examples-spark</artifactId>
<properties>
<main.basedir>${project.parent.basedir}</main.basedir>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>prepare-package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
<overWriteReleases>true</overWriteReleases>
<overWriteSnapshots>true</overWriteSnapshots>
<overWriteIfNewer>true</overWriteIfNewer>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
<phase>test-compile</phase>
</execution>
</executions>
<configuration>
<skip>false</skip>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-examples-common</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>*</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-cli</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-java-client</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark-client</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-timeline-service</artifactId>
<version>${project.version}</version>
</dependency>
<!-- Spark -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-avro_${scala.binary.version}</artifactId>
</dependency>
<!-- Parquet -->
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>${parquet.version}</version>
</dependency>
<!-- Avro -->
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
</dependency>
<!-- Hive -->
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-common</artifactId>
</dependency>
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
<scope>provided</scope>
<classifier>${hive.exec.classifier}</classifier>
<exclusions>
<exclusion>
<groupId>javax.mail</groupId>
<artifactId>mail</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty.aggregate</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Junit dependencies -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.vintage</groupId>
<artifactId>junit-vintage-engine</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<scope>test</scope>
</dependency>
<!-- Hudi Tests -->
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client-common</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark-client</artifactId>
<version>${project.version}</version>
<classifier>tests</classifier>
<type>test-jar</type>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@@ -23,7 +23,6 @@ import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaSparkContext;
/**
 * The example SchemaProvider for example JSON data from Uber.
*/

View File

@@ -0,0 +1,227 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart;
import static org.apache.hudi.config.HoodieWriteConfig.TBL_NAME;
import static org.apache.spark.sql.SaveMode.Append;
import static org.apache.spark.sql.SaveMode.Overwrite;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.examples.common.HoodieExampleDataGenerator;
import org.apache.hudi.examples.common.HoodieExampleSparkUtils;
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
/**
 * End-to-end quickstart example: insert, update, query, incremental query,
 * point-in-time query and delete against a Hudi table via the Spark datasource.
 */
public final class HoodieSparkQuickstart {

  private HoodieSparkQuickstart() {
  }

  public static void main(String[] args) {
    if (args.length < 2) {
      // Fix: the usage message previously named the wrong class (HoodieWriteClientExample).
      System.err.println("Usage: HoodieSparkQuickstart <tablePath> <tableName>");
      System.exit(1);
    }
    String tablePath = args[0];
    String tableName = args[1];

    SparkSession spark = HoodieExampleSparkUtils.defaultSparkSession("Hudi Spark basic example");
    SparkConf sparkConf = HoodieExampleSparkUtils.defaultSparkConf("hoodie-client-example");

    try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
      final HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();

      // Run the full quickstart flow; the queries rely on the temp view
      // "hudi_ro_table" registered by queryData.
      insertData(spark, jsc, tablePath, tableName, dataGen);
      updateData(spark, jsc, tablePath, tableName, dataGen);
      queryData(spark, jsc, tablePath, tableName, dataGen);
      incrementalQuery(spark, tablePath, tableName);
      pointInTimeQuery(spark, tablePath, tableName);
      delete(spark, tablePath, tableName);
      deleteByPartition(spark, tablePath, tableName);
    }
  }

  /**
   * Generate some new trips, load them into a DataFrame and write the DataFrame into the Hudi dataset as below.
   */
  public static void insertData(SparkSession spark, JavaSparkContext jsc, String tablePath, String tableName,
                                HoodieExampleDataGenerator<HoodieAvroPayload> dataGen) {
    String commitTime = Long.toString(System.currentTimeMillis());
    List<String> inserts = dataGen.convertToStringList(dataGen.generateInserts(commitTime, 20));
    Dataset<Row> df = spark.read().json(jsc.parallelize(inserts, 1));
    df.write().format("org.apache.hudi").
        options(QuickstartUtils.getQuickstartWriteConfigs()).
        option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts").
        option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid").
        option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath").
        option(TBL_NAME.key(), tableName).
        mode(Overwrite).
        save(tablePath);
  }

  /**
   * Load the data files into a DataFrame and register the "hudi_ro_table" temp view.
   */
  public static void queryData(SparkSession spark, JavaSparkContext jsc, String tablePath, String tableName,
                               HoodieExampleDataGenerator<HoodieAvroPayload> dataGen) {
    Dataset<Row> roViewDF = spark.
        read().
        format("org.apache.hudi").
        load(tablePath + "/*/*/*/*");
    roViewDF.createOrReplaceTempView("hudi_ro_table");

    spark.sql("select fare, begin_lon, begin_lat, ts from hudi_ro_table where fare > 20.0").show();
    // +-----------------+-------------------+-------------------+---+
    // | fare| begin_lon| begin_lat| ts|
    // +-----------------+-------------------+-------------------+---+
    // |98.88075495133515|0.39556048623031603|0.17851135255091155|0.0|
    // ...
    spark.sql(
            "select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_ro_table")
        .show();
    // +-------------------+--------------------+----------------------+-------------------+--------------------+------------------+
    // |_hoodie_commit_time| _hoodie_record_key|_hoodie_partition_path| rider| driver| fare|
    // +-------------------+--------------------+----------------------+-------------------+--------------------+------------------+
    // | 20191231181501|31cafb9f-0196-4b1...| 2020/01/02|rider-1577787297889|driver-1577787297889| 98.88075495133515|
    // ...
  }

  /**
   * This is similar to inserting new data. Generate updates to existing trips using the data generator,
   * load into a DataFrame and write DataFrame into the hudi dataset.
   */
  public static void updateData(SparkSession spark, JavaSparkContext jsc, String tablePath, String tableName,
                                HoodieExampleDataGenerator<HoodieAvroPayload> dataGen) {
    String commitTime = Long.toString(System.currentTimeMillis());
    List<String> updates = dataGen.convertToStringList(dataGen.generateUpdates(commitTime, 10));
    Dataset<Row> df = spark.read().json(jsc.parallelize(updates, 1));
    df.write().format("org.apache.hudi").
        options(QuickstartUtils.getQuickstartWriteConfigs()).
        option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts").
        option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid").
        option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath").
        option(TBL_NAME.key(), tableName).
        mode(Append).
        save(tablePath);
  }

  /**
   * Delete data based on the record key and partition path of two sampled rows.
   */
  public static void delete(SparkSession spark, String tablePath, String tableName) {
    Dataset<Row> roViewDF = spark.read().format("org.apache.hudi").load(tablePath + "/*/*/*/*");
    roViewDF.createOrReplaceTempView("hudi_ro_table");
    Dataset<Row> df = spark.sql("select uuid, partitionpath, ts from hudi_ro_table limit 2");
    df.write().format("org.apache.hudi").
        options(QuickstartUtils.getQuickstartWriteConfigs()).
        option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts").
        // Fix: the record key option was mistakenly written with the
        // PARTITIONPATH option key, so "partitionpath" silently overwrote "uuid"
        // and the record key field was never configured.
        option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid").
        option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath").
        option(TBL_NAME.key(), tableName).
        option("hoodie.datasource.write.operation", WriteOperationType.DELETE.value()).
        mode(Append).
        save(tablePath);
  }

  /**
   * Delete the data of a single or multiple partitions.
   */
  public static void deleteByPartition(SparkSession spark, String tablePath, String tableName) {
    Dataset<Row> df = spark.emptyDataFrame();
    df.write().format("org.apache.hudi").
        options(QuickstartUtils.getQuickstartWriteConfigs()).
        option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts").
        option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid").
        option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath").
        option(TBL_NAME.key(), tableName).
        option("hoodie.datasource.write.operation", WriteOperationType.DELETE.value()).
        // Fix: ArrayUtils.toString(array, ",")'s second argument is the
        // null-fallback (the output is brace-wrapped, e.g. "{a,b,c}"), not a
        // separator; build a plain comma separated list instead.
        option("hoodie.datasource.write.partitions.to.delete",
            String.join(",", HoodieExampleDataGenerator.DEFAULT_PARTITION_PATHS)).
        mode(Append).
        save(tablePath);
  }

  /**
   * Hudi also provides capability to obtain a stream of records that changed since given commit timestamp.
   * This can be achieved using Hudi's incremental view and providing a begin time from which changes need to be streamed.
   * We do not need to specify endTime, if we want all changes after the given commit (as is the common case).
   */
  public static void incrementalQuery(SparkSession spark, String tablePath, String tableName) {
    List<String> commits =
        spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime")
            .toJavaRDD()
            .map((Function<Row, String>) row -> row.getString(0))
            .take(50);
    String beginTime = commits.get(commits.size() - 2); // commit time we are interested in

    // incrementally query data
    Dataset<Row> incViewDF = spark.
        read().
        format("org.apache.hudi").
        option("hoodie.datasource.query.type", "incremental").
        option("hoodie.datasource.read.begin.instanttime", beginTime).
        load(tablePath);
    incViewDF.createOrReplaceTempView("hudi_incr_table");
    spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0")
        .show();
  }

  /**
   * Lets look at how to query data as of a specific time.
   * The specific time can be represented by pointing endTime to a specific commit time
   * and beginTime to "000" (denoting earliest possible commit time).
   */
  public static void pointInTimeQuery(SparkSession spark, String tablePath, String tableName) {
    List<String> commits =
        spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime")
            .toJavaRDD()
            .map((Function<Row, String>) row -> row.getString(0))
            .take(50);
    String beginTime = "000"; // Represents all commits > this time.
    String endTime = commits.get(commits.size() - 2); // commit time we are interested in

    // incrementally query data
    Dataset<Row> incViewDF = spark.read().format("org.apache.hudi").
        option("hoodie.datasource.query.type", "incremental").
        option("hoodie.datasource.read.begin.instanttime", beginTime).
        option("hoodie.datasource.read.end.instanttime", endTime).
        load(tablePath);
    incViewDF.createOrReplaceTempView("hudi_incr_table");
    spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0")
        .show();
  }
}

View File

@@ -0,0 +1,249 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.spark.sql.Row;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.UUID;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* Class to be used in quickstart guide for generating inserts and updates against a corpus. Test data uses a toy Uber
* trips, data model.
*/
public class QuickstartUtils {
public static class DataGenerator {
private static final String DEFAULT_FIRST_PARTITION_PATH = "americas/united_states/san_francisco";
private static final String DEFAULT_SECOND_PARTITION_PATH = "americas/brazil/sao_paulo";
private static final String DEFAULT_THIRD_PARTITION_PATH = "asia/india/chennai";
private static final String[] DEFAULT_PARTITION_PATHS =
{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH};
static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\",\"name\": \"triprec\",\"fields\": [ "
+ "{\"name\": \"ts\",\"type\": \"long\"},{\"name\": \"uuid\", \"type\": \"string\"},"
+ "{\"name\": \"rider\", \"type\": \"string\"},{\"name\": \"driver\", \"type\": \"string\"},"
+ "{\"name\": \"begin_lat\", \"type\": \"double\"},{\"name\": \"begin_lon\", \"type\": \"double\"},"
+ "{\"name\": \"end_lat\", \"type\": \"double\"},{\"name\": \"end_lon\", \"type\": \"double\"},"
+ "{\"name\":\"fare\",\"type\": \"double\"}]}";
static Schema avroSchema = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA);
private static Random rand = new Random(46474747);
private final Map<Integer, HoodieKey> existingKeys;
private final String[] partitionPaths;
private int numExistingKeys;
/** Creates a generator over the three default partition paths with no pre-existing keys. */
public DataGenerator() {
  this(DEFAULT_PARTITION_PATHS, new HashMap<>());
}

/** Creates a generator over the given partition paths with no pre-existing keys. */
public DataGenerator(String[] partitionPaths) {
  this(partitionPaths, new HashMap<>());
}

private DataGenerator(String[] partitionPaths, Map<Integer, HoodieKey> keyPartitionMap) {
  // Defensive copy so later mutation of the caller's array cannot affect the generator.
  this.partitionPaths = Arrays.copyOf(partitionPaths, partitionPaths.length);
  this.existingKeys = keyPartitionMap;
}
/** Returns a random 3-character string of ASCII digits ('0'..'9'). */
private static String generateRandomString() {
  final int lo = 48;  // ascii for 0
  final int hi = 57;  // ascii for 9
  final char[] chars = new char[3];
  for (int i = 0; i < chars.length; i++) {
    // Same draw as the original: one rand.nextFloat() per character.
    chars[i] = (char) (lo + (int) (rand.nextFloat() * (hi - lo + 1)));
  }
  return new String(chars);
}
/** Returns the count of record keys this generator currently tracks as existing. */
public int getNumExistingKeys() {
  return numExistingKeys;
}
/**
 * Build a single avro {@link GenericRecord} matching {@code TRIP_EXAMPLE_SCHEMA}.
 * Coordinates and fare are drawn from the shared seeded {@code rand}.
 *
 * @param rowKey     value for the "uuid" field
 * @param riderName  value for the "rider" field
 * @param driverName value for the "driver" field
 * @param timestamp  value for the "ts" field
 * @return the populated record
 */
public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName,
long timestamp) {
GenericRecord rec = new GenericData.Record(avroSchema);
rec.put("uuid", rowKey);
rec.put("ts", timestamp);
rec.put("rider", riderName);
rec.put("driver", driverName);
rec.put("begin_lat", rand.nextDouble());
rec.put("begin_lon", rand.nextDouble());
rec.put("end_lat", rand.nextDouble());
rec.put("end_lon", rand.nextDouble());
// fare is scaled into [0, 100).
rec.put("fare", rand.nextDouble() * 100);
return rec;
}
/**
 * Generates a new avro record of the above schema format, retaining the key if optionally provided. The
 * riderDriverSuffix string is a random String to simulate updates by changing the rider driver fields for records
 * belonging to the same commit. It is purely used for demo purposes. In real world, the actual updates are assumed
 * to be provided based on the application requirements.
 *
 * @param key               hoodie key whose record key is reused as the "uuid" field
 * @param riderDriverSuffix random suffix appended to the rider/driver names
 * @return the record wrapped as an {@link OverwriteWithLatestAvroPayload}
 * @throws IOException if the payload cannot be serialized
 */
public static OverwriteWithLatestAvroPayload generateRandomValue(HoodieKey key, String riderDriverSuffix)
throws IOException {
// The timestamp generated is limited to range from 7 days before to now, to avoid generating too many
// partitionPaths when user use timestamp as partitionPath field.
GenericRecord rec =
generateGenericRecord(key.getRecordKey(), "rider-" + riderDriverSuffix, "driver-"
+ riderDriverSuffix, generateRangeRandomTimestamp(7));
return new OverwriteWithLatestAvroPayload(Option.of(rec));
}
/**
 * Generate a random timestamp in the window from {@code daysTillNow} days ago up to now.
 *
 * @param daysTillNow width of the window, in days, ending at the current time
 * @return epoch millis uniformly distributed in [now - daysTillNow days, now]
 */
private static long generateRangeRandomTimestamp(int daysTillNow) {
    // Put the long literal first so the whole product is evaluated in 64-bit
    // arithmetic; the original form multiplied in int first and could overflow
    // for large daysTillNow values before the promotion to long.
    long maxIntervalMillis = daysTillNow * 24L * 60 * 60 * 1000;
    return System.currentTimeMillis() - (long) (Math.random() * maxIntervalMillis);
}
/**
 * Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys.
 *
 * @param randomString suffix applied to the rider/driver fields of every record
 * @param n            number of records to generate
 * @return a lazy stream of {@code n} new insert records
 */
public Stream<HoodieRecord> generateInsertsStream(String randomString, Integer n) {
int currSize = getNumExistingKeys();
// NOTE(review): the returned stream is lazy -- existingKeys/numExistingKeys are
// only updated as elements are consumed, so callers must drain the stream
// (e.g. via collect) before relying on getNumExistingKeys().
return IntStream.range(0, n).boxed().map(i -> {
// Pick a partition uniformly at random for each new record.
String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)];
HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath);
existingKeys.put(currSize + i, key);
numExistingKeys++;
try {
return new HoodieAvroRecord(key, generateRandomValue(key, randomString));
} catch (IOException e) {
// Surface payload serialization failures as unchecked Hoodie errors.
throw new HoodieIOException(e.getMessage(), e);
}
});
}
/**
 * Generates new inserts, uniformly across the partition paths above, and
 * records the new keys so they can later be targeted by updates.
 *
 * @param n number of records to create
 * @return the freshly generated records as a list
 * @throws IOException if a payload cannot be serialized
 */
public List<HoodieRecord> generateInserts(Integer n) throws IOException {
    return generateInsertsStream(generateRandomString(), n).collect(Collectors.toList());
}
/**
 * Build one update payload for an already-known key.
 *
 * @param key          existing record key to update
 * @param randomString suffix applied to the rider/driver fields
 * @return the update wrapped as a {@link HoodieAvroRecord}
 * @throws IOException if the avro payload cannot be serialized
 */
public HoodieRecord generateUpdateRecord(HoodieKey key, String randomString) throws IOException {
return new HoodieAvroRecord(key, generateRandomValue(key, randomString));
}
/**
 * Generates new updates, randomly distributed across the keys above. There can be duplicates within the returned
 * list
 *
 * @param n Number of updates (including dups)
 * @return list of hoodie record updates
 * @throws HoodieException if no inserts have been generated yet
 */
public List<HoodieRecord> generateUpdates(Integer n) {
    // Updates can only target keys recorded by earlier inserts.
    if (numExistingKeys == 0) {
        throw new HoodieException("Data must have been written before performing the update operation");
    }
    String suffix = generateRandomString();
    return IntStream.range(0, n)
        .mapToObj(idx -> {
            try {
                // Pick a random existing key; duplicates across iterations are allowed.
                return (HoodieRecord) generateUpdateRecord(existingKeys.get(rand.nextInt(numExistingKeys)), suffix);
            } catch (IOException e) {
                throw new HoodieIOException(e.getMessage(), e);
            }
        })
        .collect(Collectors.toList());
}
/**
 * Generates delete records for the passed in rows.
 *
 * <p>A two-column row carries only "uuid" and "partitionpath"; any other width
 * means a "ts" column is present as well.
 *
 * @param rows List of {@link Row}s for which delete record need to be generated
 * @return list of hoodie records to delete
 */
public List<String> generateDeletes(List<Row> rows) {
    return rows.stream()
        .map(row -> {
            // Rows without a "ts" column get a null timestamp.
            Long ts = row.length() == 2 ? null : row.<Long>getAs("ts");
            return convertToString(row.getAs("uuid"), row.getAs("partitionpath"), ts);
        })
        .filter(Option::isPresent)
        .map(Option::get)
        .collect(Collectors.toList());
}
/**
 * Drop the generated-key bookkeeping; after this, updates require new inserts first.
 */
public void close() {
existingKeys.clear();
}
}
/**
 * Serialize a generated record back to the JSON form used by the quickstart
 * demo, appending the partition path as an extra "partitionpath" field.
 *
 * @param record hoodie record whose payload holds serialized avro bytes
 * @return the JSON string, or {@link Option#empty()} if the bytes cannot be decoded
 */
private static Option<String> convertToString(HoodieRecord record) {
try {
String str = HoodieAvroUtils
.bytesToAvro(((OverwriteWithLatestAvroPayload) record.getData()).recordBytes, DataGenerator.avroSchema)
.toString();
// Drop everything before the "ts" field so leading schema fields are omitted.
str = "{" + str.substring(str.indexOf("\"ts\":"));
// NOTE(review): replaceAll rewrites EVERY '}' in the string; this looks safe only
// while field values (digit-suffixed rider/driver names) cannot contain '}' -- confirm.
return Option.of(str.replaceAll("}", ", \"partitionpath\": \"" + record.getPartitionPath() + "\"}"));
} catch (IOException e) {
// Best-effort conversion: undecodable payloads are silently skipped via empty.
return Option.empty();
}
}
/**
 * Render a delete record as a JSON string with "ts", "uuid" and
 * "partitionpath" fields.
 *
 * @param uuid          record key
 * @param partitionPath partition the record lives in
 * @param ts            precombine timestamp; {@code null} is rendered as "0.0"
 * @return the JSON document wrapped in an {@link Option}
 */
private static Option<String> convertToString(String uuid, String partitionPath, Long ts) {
    // StringBuilder replaces the needlessly synchronized StringBuffer for this
    // purely local, single-threaded concatenation.
    StringBuilder json = new StringBuilder();
    json.append("{");
    json.append("\"ts\": \"").append(ts == null ? "0.0" : ts).append("\",");
    json.append("\"uuid\": \"").append(uuid).append("\",");
    json.append("\"partitionpath\": \"").append(partitionPath).append("\"");
    json.append("}");
    return Option.of(json.toString());
}
/**
 * Serialize the given records to their JSON string form, dropping any record
 * whose payload could not be decoded.
 *
 * @param records hoodie records to serialize
 * @return JSON strings for every successfully converted record
 */
public static List<String> convertToStringList(List<HoodieRecord> records) {
    return records.stream()
        .map(record -> convertToString(record))
        .filter(Option::isPresent)
        .map(Option::get)
        .collect(Collectors.toList());
}
/**
 * Write configs recommended for the quickstart demo: cap the shuffle
 * parallelism of every write operation at 2 so small local runs stay fast.
 *
 * @return mutable map of write config key -> value
 */
public static Map<String, String> getQuickstartWriteConfigs() {
    Map<String, String> writeConfigs = new HashMap<>();
    String[] parallelismKeys = {
        "hoodie.insert.shuffle.parallelism",
        "hoodie.upsert.shuffle.parallelism",
        "hoodie.bulkinsert.shuffle.parallelism",
        "hoodie.delete.shuffle.parallelism"};
    for (String key : parallelismKeys) {
        writeConfigs.put(key, "2");
    }
    return writeConfigs;
}
}

View File

@@ -20,7 +20,7 @@ package org.apache.hudi.examples.spark
import org.apache.hudi.DataSourceReadOptions.{BEGIN_INSTANTTIME, END_INSTANTTIME, QUERY_TYPE, QUERY_TYPE_INCREMENTAL_OPT_VAL}
import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD, PARTITIONS_TO_DELETE, OPERATION, DELETE_PARTITION_OPERATION_OPT_VAL, DELETE_OPERATION_OPT_VAL}
import org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs
import org.apache.hudi.examples.quickstart.QuickstartUtils.getQuickstartWriteConfigs
import org.apache.hudi.common.model.HoodieAvroPayload
import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME
import org.apache.hudi.examples.common.{HoodieExampleDataGenerator, HoodieExampleSparkUtils}
@@ -172,7 +172,7 @@ object HoodieDataSourceExample {
* This can be achieved using Hudi's incremental view and providing a begin time from which changes need to be streamed.
* We do not need to specify endTime, if we want all changes after the given commit (as is the common case).
*/
def incrementalQuery(spark: SparkSession, tablePath: String, tableName: String) {
def incrementalQuery(spark: SparkSession, tablePath: String, tableName: String): Unit = {
import spark.implicits._
val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime").map(k => k.getString(0)).take(50)
val beginTime = commits(commits.length - 2) // commit time we are interested in
@@ -193,7 +193,7 @@ object HoodieDataSourceExample {
* The specific time can be represented by pointing endTime to a specific commit time
* and beginTime to 000 (denoting earliest possible commit time).
*/
def pointInTimeQuery(spark: SparkSession, tablePath: String, tableName: String) {
def pointInTimeQuery(spark: SparkSession, tablePath: String, tableName: String): Unit = {
import spark.implicits._
val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime").map(k => k.getString(0)).take(50)
val beginTime = "000" // Represents all commits > this time.

View File

@@ -20,7 +20,7 @@
package org.apache.hudi.examples.spark
import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD, TABLE_TYPE}
import org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs
import org.apache.hudi.examples.quickstart.QuickstartUtils.getQuickstartWriteConfigs
import org.apache.hudi.client.SparkRDDWriteClient
import org.apache.hudi.client.common.HoodieSparkEngineContext
import org.apache.hudi.common.model.{HoodieAvroPayload, HoodieRecordPayload, HoodieTableType}
@@ -55,6 +55,7 @@ object HoodieMorCompactionJob {
val dataGen = new HoodieExampleDataGenerator[HoodieAvroPayload]
val tablePath = args(0)
val tableName = args(1)
insertData(spark, tablePath, tableName, dataGen, HoodieTableType.MERGE_ON_READ.name())
updateData(spark, tablePath, tableName, dataGen, HoodieTableType.MERGE_ON_READ.name())
val cfg = HoodieWriteConfig.newBuilder()

View File

@@ -0,0 +1,115 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart;
import java.io.File;
import java.nio.file.Paths;
import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.examples.common.HoodieExampleDataGenerator;
import org.apache.hudi.testutils.providers.SparkProvider;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.Utils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
/**
 * End-to-end smoke test that drives every step of {@code HoodieSparkQuickstart}
 * (insert, update, query, incremental / point-in-time query, delete,
 * delete-by-partition) against a shared local Spark session, writing into a
 * JUnit-managed temp directory.
 */
public class TestHoodieSparkQuickstart implements SparkProvider {
// Shared across tests: Spark is expensive to start, so it is created once
// in runBeforeEach and kept for the lifetime of the JVM.
protected static transient HoodieSparkEngineContext context;
private static transient SparkSession spark;
private static transient SQLContext sqlContext;
private static transient JavaSparkContext jsc;
/**
 * An indicator of the initialization status.
 */
protected boolean initialized = false;
@TempDir
protected java.nio.file.Path tempDir;
// Deterministic demo-data generator shared by all quickstart steps.
private static final HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();
@Override
public SparkSession spark() {
return spark;
}
@Override
public SQLContext sqlContext() {
return sqlContext;
}
@Override
public JavaSparkContext jsc() {
return jsc;
}
@Override
public HoodieSparkEngineContext context() {
return context;
}
// Root temp directory for this test run.
public String basePath() {
return tempDir.toAbsolutePath().toString();
}
// Table path is a subdirectory of basePath named after the table.
public String tablePath(String tableName) {
return Paths.get(basePath(), tableName).toString();
}
@BeforeEach
public synchronized void runBeforeEach() {
// Lazily build the Spark session on the first test only; subsequent tests
// reuse the static session (initialized tracks whether it already existed).
initialized = spark != null;
if (!initialized) {
SparkConf sparkConf = conf();
SparkRDDWriteClient.registerClasses(sparkConf);
HoodieReadClient.addHoodieSupport(sparkConf);
spark = SparkSession.builder().config(sparkConf).getOrCreate();
sqlContext = spark.sqlContext();
jsc = new JavaSparkContext(spark.sparkContext());
context = new HoodieSparkEngineContext(jsc);
}
}
@Test
public void testHoodieSparkQuickstart() {
String tableName = "spark_quick_start";
String tablePath = tablePath(tableName);
try {
// Exercise the full quickstart flow in order; each step reads what the
// previous ones wrote, so the sequence matters.
HoodieSparkQuickstart.insertData(spark, jsc, tablePath, tableName, dataGen);
HoodieSparkQuickstart.updateData(spark, jsc, tablePath, tableName, dataGen);
HoodieSparkQuickstart.queryData(spark, jsc, tablePath, tableName, dataGen);
HoodieSparkQuickstart.incrementalQuery(spark, tablePath, tableName);
HoodieSparkQuickstart.pointInTimeQuery(spark, tablePath, tableName);
HoodieSparkQuickstart.delete(spark, tablePath, tableName);
HoodieSparkQuickstart.deleteByPartition(spark, tablePath, tableName);
} finally {
// Always remove the table directory so reruns start clean.
Utils.deleteRecursively(new File(tablePath));
}
}
}

View File

@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.examples.quickstart;
import org.apache.hudi.exception.HoodieException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;
import static org.junit.jupiter.api.Assertions.assertEquals;
@ExtendWith(MockitoExtension.class)
public class TestQuickstartUtils {
  /**
   * generateUpdates must fail until inserts have been generated, and succeed
   * (producing exactly n records) afterwards.
   */
  @Test
  public void testGenerateUpdates() throws Exception {
    QuickstartUtils.DataGenerator dataGenerator = new QuickstartUtils.DataGenerator();
    // Calling generateUpdates before any insert must throw HoodieException.
    // JUnit's assertEquals takes the EXPECTED value first; the original call
    // had the arguments reversed, which garbles failure messages.
    assertEquals("Data must have been written before performing the update operation",
        Assertions.assertThrows(HoodieException.class, () -> {
          dataGenerator.generateUpdates(10);
        }).getMessage());
    // Once inserts exist, generateUpdates succeeds and returns n records.
    assertEquals(10, dataGenerator.generateInserts(10).size());
    assertEquals(10, dataGenerator.generateUpdates(10).size());
  }
}

View File

@@ -0,0 +1,30 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache.hudi=DEBUG
log4j.logger.org.apache.hadoop.hbase=ERROR
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL

View File

@@ -0,0 +1,31 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=INFO, CONSOLE
log4j.logger.org.apache=INFO
log4j.logger.org.apache.hudi=DEBUG
log4j.logger.org.apache.hadoop.hbase=ERROR
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=INFO
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL

View File

@@ -25,204 +25,13 @@
<modelVersion>4.0.0</modelVersion>
<artifactId>hudi-examples</artifactId>
<packaging>jar</packaging>
<packaging>pom</packaging>
<properties>
<main.basedir>${project.parent.basedir}</main.basedir>
<checkstyle.skip>true</checkstyle.skip>
</properties>
<modules>
<module>hudi-examples-common</module>
<module>hudi-examples-spark</module>
<module>hudi-examples-flink</module>
<module>hudi-examples-java</module>
</modules>
<build>
<resources>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>prepare-package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
<overWriteReleases>true</overWriteReleases>
<overWriteSnapshots>true</overWriteSnapshots>
<overWriteIfNewer>true</overWriteIfNewer>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
<phase>test-compile</phase>
</execution>
</executions>
<configuration>
<skip>false</skip>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.rat</groupId>
<artifactId>apache-rat-plugin</artifactId>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-cli</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-client-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-java-client</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark-client</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-hadoop-mr</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hudi</groupId>
<artifactId>hudi-timeline-service</artifactId>
<version>${project.version}</version>
</dependency>
<!-- Spark -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-avro_${scala.binary.version}</artifactId>
</dependency>
<!-- Parquet -->
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>${parquet.version}</version>
</dependency>
<!-- Avro -->
<dependency>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</dependency>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
</dependency>
<!-- Hive -->
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-common</artifactId>
</dependency>
<dependency>
<groupId>${hive.groupid}</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
<scope>provided</scope>
<classifier>${hive.exec.classifier}</classifier>
<exclusions>
<exclusion>
<groupId>javax.mail</groupId>
<artifactId>mail</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty.aggregate</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>