[HUDI-3563] Make quickstart examples covered by CI tests (#5082)
This commit is contained in:
109
hudi-examples/hudi-examples-common/pom.xml
Normal file
109
hudi-examples/hudi-examples-common/pom.xml
Normal file
@@ -0,0 +1,109 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>hudi-examples</artifactId>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<version>0.11.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>hudi-examples-common</artifactId>
|
||||
|
||||
<properties>
|
||||
<main.basedir>${project.parent.basedir}</main.basedir>
|
||||
<checkstyle.skip>true</checkstyle.skip>
|
||||
</properties>
|
||||
|
||||
<build>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>src/main/resources</directory>
|
||||
</resource>
|
||||
</resources>
|
||||
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>process-resources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>compile</phase>
|
||||
<goals>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>test-jar</goal>
|
||||
</goals>
|
||||
<phase>test-compile</phase>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<skip>false</skip>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.rat</groupId>
|
||||
<artifactId>apache-rat-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Avro -->
|
||||
<dependency>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-avro</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@@ -43,7 +43,6 @@ import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
|
||||
/**
|
||||
* Class to be used to generate test data.
|
||||
*/
|
||||
@@ -63,7 +62,7 @@ public class HoodieExampleDataGenerator<T extends HoodieRecordPayload<T>> {
|
||||
+ "{\"name\":\"fare\",\"type\": \"double\"}]}";
|
||||
public static Schema avroSchema = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA);
|
||||
|
||||
private static Random rand = new Random(46474747);
|
||||
private static final Random rand = new Random(46474747);
|
||||
|
||||
private final Map<Integer, KeyPartition> existingKeys;
|
||||
private final String[] partitionPaths;
|
||||
364
hudi-examples/hudi-examples-flink/pom.xml
Normal file
364
hudi-examples/hudi-examples-flink/pom.xml
Normal file
@@ -0,0 +1,364 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>hudi-examples</artifactId>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<version>0.11.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>hudi-examples-flink</artifactId>
|
||||
|
||||
<properties>
|
||||
<main.basedir>${project.parent.basedir}</main.basedir>
|
||||
<checkstyle.skip>true</checkstyle.skip>
|
||||
<parquet.version>1.11.1</parquet.version>
|
||||
</properties>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.jacoco</groupId>
|
||||
<artifactId>jacoco-maven-plugin</artifactId>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<source>1.8</source>
|
||||
<target>1.8</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<version>3.1.2</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>test-jar</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.rat</groupId>
|
||||
<artifactId>apache-rat-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>src/main/resources</directory>
|
||||
</resource>
|
||||
<resource>
|
||||
<directory>src/test/resources</directory>
|
||||
</resource>
|
||||
</resources>
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
<!-- Hoodie -->
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-client-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-flink-client</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-hadoop-mr</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-hive-sync</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-sync-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-flink</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- Flink -->
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-clients_${scala.binary.version}</artifactId>
|
||||
<scope>compile</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>com.esotericsoftware.kryo</groupId>
|
||||
<artifactId>kryo</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>com.esotericsoftware.minlog</groupId>
|
||||
<artifactId>minlog</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.kafka</groupId>
|
||||
<artifactId>kafka-clients</artifactId>
|
||||
<version>${kafka.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-hadoop-compatibility_${scala.binary.version}</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-parquet_${scala.binary.version}</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-json</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-table-common</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-table-runtime_${scala.binary.version}</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-table-planner_${scala.binary.version}</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-hadoop</artifactId>
|
||||
<version>${parquet.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.xerial.snappy</groupId>
|
||||
<artifactId>snappy-java</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<!-- Avro -->
|
||||
<dependency>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro</artifactId>
|
||||
<!-- Override the version to be same with Flink avro -->
|
||||
<version>1.10.0</version>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- Hadoop -->
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-mapreduce-client-core</artifactId>
|
||||
<scope>compile</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-log4j12</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.beust</groupId>
|
||||
<artifactId>jcommander</artifactId>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.twitter</groupId>
|
||||
<artifactId>bijection-avro_${scala.binary.version}</artifactId>
|
||||
<version>0.9.7</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>joda-time</groupId>
|
||||
<artifactId>joda-time</artifactId>
|
||||
<version>2.5</version>
|
||||
</dependency>
|
||||
<!-- Hive -->
|
||||
<dependency>
|
||||
<groupId>${hive.groupid}</groupId>
|
||||
<artifactId>hive-exec</artifactId>
|
||||
<version>${hive.version}</version>
|
||||
<classifier>${hive.exec.classifier}</classifier>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>javax.mail</groupId>
|
||||
<artifactId>mail</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.eclipse.jetty.aggregate</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<!-- Test dependencies -->
|
||||
|
||||
<!-- Junit 5 dependencies -->
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-api</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-engine</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.vintage</groupId>
|
||||
<artifactId>junit-vintage-engine</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-params</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- Hoodie dependencies -->
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<classifier>tests</classifier>
|
||||
<type>test-jar</type>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-client-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<classifier>tests</classifier>
|
||||
<type>test-jar</type>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-flink-client</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<classifier>tests</classifier>
|
||||
<type>test-jar</type>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<!-- <dependency>-->
|
||||
<!-- <groupId>org.apache.hudi</groupId>-->
|
||||
<!-- <artifactId>hudi-flink_${scala.binary.version}</artifactId>-->
|
||||
<!-- <version>${project.version}</version>-->
|
||||
<!-- <scope>test</scope>-->
|
||||
<!-- </dependency>-->
|
||||
|
||||
<!-- Flink dependencies -->
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-test-utils_${scala.binary.version}</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-runtime</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>test</scope>
|
||||
<type>test-jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>test</scope>
|
||||
<type>test-jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-table-runtime_${scala.binary.version}</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>test</scope>
|
||||
<type>test-jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-json</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>test</scope>
|
||||
<type>test-jar</type>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.flink</groupId>
|
||||
<artifactId>flink-csv</artifactId>
|
||||
<version>${flink.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- Parquet Test-->
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-avro</artifactId>
|
||||
<version>${parquet.version}</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@@ -0,0 +1,211 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.examples.quickstart;
|
||||
|
||||
import static org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations.sql;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.core.execution.JobClient;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.table.api.EnvironmentSettings;
|
||||
import org.apache.flink.table.api.TableEnvironment;
|
||||
import org.apache.flink.table.api.TableResult;
|
||||
import org.apache.flink.table.api.TableSchema;
|
||||
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
|
||||
import org.apache.flink.table.api.config.ExecutionConfigOptions;
|
||||
import org.apache.flink.table.api.internal.TableEnvironmentImpl;
|
||||
import org.apache.flink.table.catalog.ObjectPath;
|
||||
import org.apache.flink.table.catalog.exceptions.TableNotExistException;
|
||||
import org.apache.flink.types.Row;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.configuration.FlinkOptions;
|
||||
import org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory;
|
||||
import org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
public final class HoodieFlinkQuickstart {
|
||||
private EnvironmentSettings settings = null;
|
||||
private TableEnvironment streamTableEnv = null;
|
||||
|
||||
private String tableName;
|
||||
|
||||
private HoodieFlinkQuickstart() {
|
||||
}
|
||||
|
||||
public static HoodieFlinkQuickstart instance() {
|
||||
return new HoodieFlinkQuickstart();
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws TableNotExistException, InterruptedException {
|
||||
if (args.length < 3) {
|
||||
System.err.println("Usage: HoodieWriteClientExample <tablePath> <tableName> <tableType>");
|
||||
System.exit(1);
|
||||
}
|
||||
String tablePath = args[0];
|
||||
String tableName = args[1];
|
||||
String tableType = args[2];
|
||||
|
||||
HoodieFlinkQuickstart flinkQuickstart = instance();
|
||||
flinkQuickstart.initEnv();
|
||||
|
||||
// create filesystem table named source
|
||||
flinkQuickstart.createFileSource();
|
||||
|
||||
// create hudi table
|
||||
flinkQuickstart.createHudiTable(tablePath, tableName, HoodieTableType.valueOf(tableType));
|
||||
|
||||
// insert data
|
||||
flinkQuickstart.insertData();
|
||||
|
||||
// query data
|
||||
flinkQuickstart.queryData();
|
||||
|
||||
// update data
|
||||
flinkQuickstart.updateData();
|
||||
}
|
||||
|
||||
public void initEnv() {
|
||||
if (this.streamTableEnv == null) {
|
||||
settings = EnvironmentSettings.newInstance().build();
|
||||
TableEnvironment streamTableEnv = TableEnvironmentImpl.create(settings);
|
||||
streamTableEnv.getConfig().getConfiguration()
|
||||
.setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1);
|
||||
Configuration execConf = streamTableEnv.getConfig().getConfiguration();
|
||||
execConf.setString("execution.checkpointing.interval", "2s");
|
||||
// configure not to retry after failure
|
||||
execConf.setString("restart-strategy", "fixed-delay");
|
||||
execConf.setString("restart-strategy.fixed-delay.attempts", "0");
|
||||
this.streamTableEnv = streamTableEnv;
|
||||
}
|
||||
}
|
||||
|
||||
public TableEnvironment getStreamTableEnv() {
|
||||
return streamTableEnv;
|
||||
}
|
||||
|
||||
public TableEnvironment getBatchTableEnv() {
|
||||
Configuration conf = new Configuration();
|
||||
// for batch upsert use cases: current suggestion is to disable these 2 options,
|
||||
// from 1.14, flink runtime execution mode has switched from streaming
|
||||
// to batch for batch execution mode(before that, both streaming and batch use streaming execution mode),
|
||||
// current batch execution mode has these limitations:
|
||||
//
|
||||
// 1. the keyed stream default to always sort the inputs by key;
|
||||
// 2. the batch state-backend requires the inputs sort by state key
|
||||
//
|
||||
// For our hudi batch pipeline upsert case, we rely on the consuming sequence for index records and data records,
|
||||
// the index records must be loaded first before data records for BucketAssignFunction to keep upsert semantics correct,
|
||||
// so we suggest disabling these 2 options to use streaming state-backend for batch execution mode
|
||||
// to keep the strategy before 1.14.
|
||||
conf.setBoolean("execution.sorted-inputs.enabled", false);
|
||||
conf.setBoolean("execution.batch-state-backend.enabled", false);
|
||||
StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.getExecutionEnvironment(conf);
|
||||
settings = EnvironmentSettings.newInstance().inBatchMode().build();
|
||||
TableEnvironment batchTableEnv = StreamTableEnvironment.create(execEnv, settings);
|
||||
batchTableEnv.getConfig().getConfiguration()
|
||||
.setInteger(ExecutionConfigOptions.TABLE_EXEC_RESOURCE_DEFAULT_PARALLELISM, 1);
|
||||
return batchTableEnv;
|
||||
}
|
||||
|
||||
public void createHudiTable(String tablePath, String tableName,
|
||||
HoodieTableType tableType) {
|
||||
this.tableName = tableName;
|
||||
|
||||
// create hudi table
|
||||
String hoodieTableDDL = sql(tableName)
|
||||
.option(FlinkOptions.PATH, tablePath)
|
||||
.option(FlinkOptions.READ_AS_STREAMING, true)
|
||||
.option(FlinkOptions.TABLE_TYPE, tableType)
|
||||
.end();
|
||||
streamTableEnv.executeSql(hoodieTableDDL);
|
||||
}
|
||||
|
||||
public void createFileSource() {
|
||||
// create filesystem table named source
|
||||
String createSource = QuickstartConfigurations.getFileSourceDDL("source");
|
||||
streamTableEnv.executeSql(createSource);
|
||||
}
|
||||
|
||||
@NotNull List<Row> insertData() throws InterruptedException, TableNotExistException {
|
||||
// insert data
|
||||
String insertInto = String.format("insert into %s select * from source", tableName);
|
||||
execInsertSql(streamTableEnv, insertInto);
|
||||
return queryData();
|
||||
}
|
||||
|
||||
List<Row> queryData() throws InterruptedException, TableNotExistException {
|
||||
// query data
|
||||
// reading from the latest commit instance.
|
||||
return execSelectSql(streamTableEnv, String.format("select * from %s", tableName), 10);
|
||||
}
|
||||
|
||||
@NotNull List<Row> updateData() throws InterruptedException, TableNotExistException {
|
||||
// update data
|
||||
String insertInto = String.format("insert into %s select * from source", tableName);
|
||||
execInsertSql(getStreamTableEnv(), insertInto);
|
||||
return queryData();
|
||||
}
|
||||
|
||||
public static void execInsertSql(TableEnvironment tEnv, String insert) {
|
||||
TableResult tableResult = tEnv.executeSql(insert);
|
||||
// wait to finish
|
||||
try {
|
||||
tableResult.getJobClient().get().getJobExecutionResult().get();
|
||||
} catch (InterruptedException | ExecutionException ex) {
|
||||
// ignored
|
||||
}
|
||||
}
|
||||
|
||||
public static List<Row> execSelectSql(TableEnvironment tEnv, String select, long timeout)
|
||||
throws InterruptedException, TableNotExistException {
|
||||
return execSelectSql(tEnv, select, timeout, null);
|
||||
}
|
||||
|
||||
public static List<Row> execSelectSql(TableEnvironment tEnv, String select, long timeout, String sourceTable)
|
||||
throws InterruptedException, TableNotExistException {
|
||||
final String sinkDDL;
|
||||
if (sourceTable != null) {
|
||||
// use the source table schema as the sink schema if the source table was specified, .
|
||||
ObjectPath objectPath = new ObjectPath(tEnv.getCurrentDatabase(), sourceTable);
|
||||
TableSchema schema = tEnv.getCatalog(tEnv.getCurrentCatalog()).get().getTable(objectPath).getSchema();
|
||||
sinkDDL = QuickstartConfigurations.getCollectSinkDDL("sink", schema);
|
||||
} else {
|
||||
sinkDDL = QuickstartConfigurations.getCollectSinkDDL("sink");
|
||||
}
|
||||
return execSelectSql(tEnv, select, sinkDDL, timeout);
|
||||
}
|
||||
|
||||
public static List<Row> execSelectSql(TableEnvironment tEnv, String select, String sinkDDL, long timeout)
|
||||
throws InterruptedException {
|
||||
tEnv.executeSql("DROP TABLE IF EXISTS sink");
|
||||
tEnv.executeSql(sinkDDL);
|
||||
TableResult tableResult = tEnv.executeSql("insert into sink " + select);
|
||||
// wait for the timeout then cancels the job
|
||||
TimeUnit.SECONDS.sleep(timeout);
|
||||
tableResult.getJobClient().ifPresent(JobClient::cancel);
|
||||
tEnv.executeSql("DROP TABLE IF EXISTS sink");
|
||||
return CollectSinkTableFactory.RESULT.values().stream()
|
||||
.flatMap(Collection::stream)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,178 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.examples.quickstart.factory;
|
||||
|
||||
import org.apache.flink.api.common.state.ListState;
|
||||
import org.apache.flink.api.common.state.ListStateDescriptor;
|
||||
import org.apache.flink.api.java.typeutils.RowTypeInfo;
|
||||
import org.apache.flink.configuration.ConfigOption;
|
||||
import org.apache.flink.runtime.state.FunctionInitializationContext;
|
||||
import org.apache.flink.runtime.state.FunctionSnapshotContext;
|
||||
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;
|
||||
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
|
||||
import org.apache.flink.table.api.TableSchema;
|
||||
import org.apache.flink.table.connector.ChangelogMode;
|
||||
import org.apache.flink.table.connector.sink.DynamicTableSink;
|
||||
import org.apache.flink.table.connector.sink.SinkFunctionProvider;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.factories.DynamicTableSinkFactory;
|
||||
import org.apache.flink.table.factories.FactoryUtil;
|
||||
import org.apache.flink.table.types.DataType;
|
||||
import org.apache.flink.table.types.utils.TypeConversions;
|
||||
import org.apache.flink.types.Row;
|
||||
import org.apache.flink.types.RowKind;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Factory for CollectTableSink.
|
||||
*
|
||||
* <p>Note: The CollectTableSink collects all the data of a table into a global collection {@code RESULT},
|
||||
* so the tests should executed in single thread and the table name should be the same.
|
||||
*/
|
||||
public class CollectSinkTableFactory implements DynamicTableSinkFactory {
|
||||
public static final String FACTORY_ID = "collect";
|
||||
|
||||
// global results to collect and query
|
||||
public static final Map<Integer, List<Row>> RESULT = new HashMap<>();
|
||||
|
||||
@Override
|
||||
public DynamicTableSink createDynamicTableSink(Context context) {
|
||||
FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context);
|
||||
helper.validate();
|
||||
|
||||
TableSchema schema = context.getCatalogTable().getSchema();
|
||||
RESULT.clear();
|
||||
return new CollectTableSink(schema, context.getObjectIdentifier().getObjectName());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String factoryIdentifier() {
|
||||
return FACTORY_ID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<ConfigOption<?>> requiredOptions() {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<ConfigOption<?>> optionalOptions() {
|
||||
return Collections.emptySet();
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------
|
||||
// Table sinks
|
||||
// --------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Values {@link DynamicTableSink} for testing.
|
||||
*/
|
||||
private static class CollectTableSink implements DynamicTableSink {
|
||||
|
||||
private final TableSchema schema;
|
||||
private final String tableName;
|
||||
|
||||
private CollectTableSink(
|
||||
TableSchema schema,
|
||||
String tableName) {
|
||||
this.schema = schema;
|
||||
this.tableName = tableName;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ChangelogMode getChangelogMode(ChangelogMode requestedMode) {
|
||||
return ChangelogMode.newBuilder()
|
||||
.addContainedKind(RowKind.INSERT)
|
||||
.addContainedKind(RowKind.DELETE)
|
||||
.addContainedKind(RowKind.UPDATE_AFTER)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public SinkRuntimeProvider getSinkRuntimeProvider(Context context) {
|
||||
final DataType rowType = schema.toPhysicalRowDataType();
|
||||
final RowTypeInfo rowTypeInfo = (RowTypeInfo) TypeConversions.fromDataTypeToLegacyInfo(rowType);
|
||||
DataStructureConverter converter = context.createDataStructureConverter(schema.toPhysicalRowDataType());
|
||||
return SinkFunctionProvider.of(new CollectSinkFunction(converter, rowTypeInfo));
|
||||
}
|
||||
|
||||
@Override
|
||||
public DynamicTableSink copy() {
|
||||
return new CollectTableSink(schema, tableName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String asSummaryString() {
|
||||
return "CollectSink";
|
||||
}
|
||||
}
|
||||
|
||||
static class CollectSinkFunction extends RichSinkFunction<RowData> implements CheckpointedFunction {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final DynamicTableSink.DataStructureConverter converter;
|
||||
private final RowTypeInfo rowTypeInfo;
|
||||
|
||||
protected transient ListState<Row> resultState;
|
||||
protected transient List<Row> localResult;
|
||||
|
||||
private int taskID;
|
||||
|
||||
protected CollectSinkFunction(DynamicTableSink.DataStructureConverter converter, RowTypeInfo rowTypeInfo) {
|
||||
this.converter = converter;
|
||||
this.rowTypeInfo = rowTypeInfo;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void invoke(RowData value, Context context) {
|
||||
Row row = (Row) converter.toExternal(value);
|
||||
assert row != null;
|
||||
row.setKind(value.getRowKind());
|
||||
RESULT.get(taskID).add(row);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initializeState(FunctionInitializationContext context) throws Exception {
|
||||
this.resultState = context.getOperatorStateStore().getListState(
|
||||
new ListStateDescriptor<>("sink-results", rowTypeInfo));
|
||||
this.localResult = new ArrayList<>();
|
||||
if (context.isRestored()) {
|
||||
for (Row value : resultState.get()) {
|
||||
localResult.add(value);
|
||||
}
|
||||
}
|
||||
this.taskID = getRuntimeContext().getIndexOfThisSubtask();
|
||||
synchronized (CollectSinkTableFactory.class) {
|
||||
RESULT.put(taskID, localResult);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void snapshotState(FunctionSnapshotContext context) throws Exception {
|
||||
resultState.clear();
|
||||
resultState.addAll(RESULT.get(taskID));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.examples.quickstart.factory;
|
||||
|
||||
import org.apache.flink.configuration.ConfigOption;
|
||||
import org.apache.flink.configuration.ConfigOptions;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.core.fs.Path;
|
||||
import org.apache.flink.table.api.ValidationException;
|
||||
import org.apache.flink.table.connector.source.DynamicTableSource;
|
||||
import org.apache.flink.table.factories.DynamicTableSourceFactory;
|
||||
import org.apache.flink.table.factories.FactoryUtil;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import org.apache.hudi.configuration.FlinkOptions;
|
||||
import org.apache.hudi.examples.quickstart.source.ContinuousFileSource;
|
||||
|
||||
/**
|
||||
* Factory for ContinuousFileSource.
|
||||
*/
|
||||
public class ContinuousFileSourceFactory implements DynamicTableSourceFactory {
|
||||
public static final String FACTORY_ID = "continuous-file-source";
|
||||
|
||||
public static final ConfigOption<Integer> CHECKPOINTS = ConfigOptions
|
||||
.key("checkpoints")
|
||||
.intType()
|
||||
.defaultValue(2)
|
||||
.withDescription("Number of checkpoints to write the data set as, default 2");
|
||||
|
||||
@Override
|
||||
public DynamicTableSource createDynamicTableSource(Context context) {
|
||||
FactoryUtil.TableFactoryHelper helper = FactoryUtil.createTableFactoryHelper(this, context);
|
||||
helper.validate();
|
||||
|
||||
Configuration conf = (Configuration) helper.getOptions();
|
||||
Path path = new Path(conf.getOptional(FlinkOptions.PATH).orElseThrow(() ->
|
||||
new ValidationException("Option [path] should be not empty.")));
|
||||
return new ContinuousFileSource(context.getCatalogTable().getResolvedSchema(), path, conf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String factoryIdentifier() {
|
||||
return FACTORY_ID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<ConfigOption<?>> requiredOptions() {
|
||||
return Collections.singleton(FlinkOptions.PATH);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<ConfigOption<?>> optionalOptions() {
|
||||
return Collections.singleton(CHECKPOINTS);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,185 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.examples.quickstart.source;
|
||||
|
||||
import org.apache.flink.api.common.state.CheckpointListener;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.core.fs.Path;
|
||||
import org.apache.flink.formats.common.TimestampFormat;
|
||||
import org.apache.flink.formats.json.JsonRowDataDeserializationSchema;
|
||||
import org.apache.flink.streaming.api.datastream.DataStream;
|
||||
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
|
||||
import org.apache.flink.streaming.api.functions.source.SourceFunction;
|
||||
import org.apache.flink.table.catalog.ResolvedSchema;
|
||||
import org.apache.flink.table.connector.ChangelogMode;
|
||||
import org.apache.flink.table.connector.source.DataStreamScanProvider;
|
||||
import org.apache.flink.table.connector.source.DynamicTableSource;
|
||||
import org.apache.flink.table.connector.source.ScanTableSource;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.runtime.typeutils.InternalTypeInfo;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
import static org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory.CHECKPOINTS;
|
||||
|
||||
/**
|
||||
* A continuous file source that can trigger checkpoints continuously.
|
||||
*
|
||||
* <p>It loads the data in the specified file and split the data into number of checkpoints batches.
|
||||
* Say, if you want 4 checkpoints and there are 8 records in the file, the emit strategy is:
|
||||
*
|
||||
* <pre>
|
||||
* | 2 records | 2 records | 2 records | 2 records |
|
||||
* | cp1 | cp2 |cp3 | cp4 |
|
||||
* </pre>
|
||||
*
|
||||
* <p>If all the data are flushed out, it waits for the next checkpoint to finish and tear down the source.
|
||||
*/
|
||||
public class ContinuousFileSource implements ScanTableSource {
|
||||
|
||||
private final ResolvedSchema tableSchema;
|
||||
private final Path path;
|
||||
private final Configuration conf;
|
||||
|
||||
public ContinuousFileSource(
|
||||
ResolvedSchema tableSchema,
|
||||
Path path,
|
||||
Configuration conf) {
|
||||
this.tableSchema = tableSchema;
|
||||
this.path = path;
|
||||
this.conf = conf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) {
|
||||
return new DataStreamScanProvider() {
|
||||
|
||||
@Override
|
||||
public boolean isBounded() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DataStream<RowData> produceDataStream(StreamExecutionEnvironment execEnv) {
|
||||
final RowType rowType = (RowType) tableSchema.toSourceRowDataType().getLogicalType();
|
||||
JsonRowDataDeserializationSchema deserializationSchema = new JsonRowDataDeserializationSchema(
|
||||
rowType,
|
||||
InternalTypeInfo.of(rowType),
|
||||
false,
|
||||
true,
|
||||
TimestampFormat.ISO_8601);
|
||||
|
||||
return execEnv.addSource(new BoundedSourceFunction(path, conf.getInteger(CHECKPOINTS)))
|
||||
.name("continuous_file_source")
|
||||
.setParallelism(1)
|
||||
.map(record -> deserializationSchema.deserialize(record.getBytes(StandardCharsets.UTF_8)),
|
||||
InternalTypeInfo.of(rowType));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public ChangelogMode getChangelogMode() {
|
||||
return ChangelogMode.insertOnly();
|
||||
}
|
||||
|
||||
@Override
|
||||
public DynamicTableSource copy() {
|
||||
return new ContinuousFileSource(this.tableSchema, this.path, this.conf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String asSummaryString() {
|
||||
return "ContinuousFileSource";
|
||||
}
|
||||
|
||||
/**
|
||||
* Source function that partition the data into given number checkpoints batches.
|
||||
*/
|
||||
public static class BoundedSourceFunction implements SourceFunction<String>, CheckpointListener {
|
||||
private final Path path;
|
||||
private List<String> dataBuffer;
|
||||
|
||||
private final int checkpoints;
|
||||
private final AtomicInteger currentCP = new AtomicInteger(0);
|
||||
|
||||
private volatile boolean isRunning = true;
|
||||
|
||||
public BoundedSourceFunction(Path path, int checkpoints) {
|
||||
this.path = path;
|
||||
this.checkpoints = checkpoints;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run(SourceContext<String> context) throws Exception {
|
||||
if (this.dataBuffer == null) {
|
||||
loadDataBuffer();
|
||||
}
|
||||
int oldCP = this.currentCP.get();
|
||||
boolean finish = false;
|
||||
while (isRunning) {
|
||||
int batchSize = this.dataBuffer.size() / this.checkpoints;
|
||||
int start = batchSize * oldCP;
|
||||
synchronized (context.getCheckpointLock()) {
|
||||
for (int i = start; i < start + batchSize; i++) {
|
||||
if (i >= this.dataBuffer.size()) {
|
||||
finish = true;
|
||||
break;
|
||||
// wait for the next checkpoint and exit
|
||||
}
|
||||
context.collect(this.dataBuffer.get(i));
|
||||
}
|
||||
}
|
||||
oldCP++;
|
||||
while (this.currentCP.get() < oldCP) {
|
||||
synchronized (context.getCheckpointLock()) {
|
||||
context.getCheckpointLock().wait(10);
|
||||
}
|
||||
}
|
||||
if (finish || !isRunning) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void cancel() {
|
||||
this.isRunning = false;
|
||||
}
|
||||
|
||||
private void loadDataBuffer() {
|
||||
try {
|
||||
this.dataBuffer = Files.readAllLines(Paths.get(this.path.toUri()));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Read file " + this.path + " error", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void notifyCheckpointComplete(long l) {
|
||||
this.currentCP.incrementAndGet();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,317 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.examples.quickstart.utils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.flink.configuration.ConfigOption;
|
||||
import org.apache.flink.configuration.Configuration;
|
||||
import org.apache.flink.table.api.DataTypes;
|
||||
import org.apache.flink.table.api.TableSchema;
|
||||
import org.apache.flink.table.catalog.ResolvedSchema;
|
||||
import org.apache.flink.table.runtime.typeutils.RowDataSerializer;
|
||||
import org.apache.flink.table.types.DataType;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
import org.apache.hudi.configuration.FlinkOptions;
|
||||
import org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory;
|
||||
import org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory;
|
||||
import org.apache.hudi.streamer.FlinkStreamerConfig;
|
||||
|
||||
/**
|
||||
* Configurations for the test.
|
||||
*/
|
||||
public class QuickstartConfigurations {
|
||||
private QuickstartConfigurations() {
|
||||
}
|
||||
|
||||
public static final DataType ROW_DATA_TYPE = DataTypes.ROW(
|
||||
DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key
|
||||
DataTypes.FIELD("name", DataTypes.VARCHAR(10)),
|
||||
DataTypes.FIELD("age", DataTypes.INT()),
|
||||
DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field
|
||||
DataTypes.FIELD("partition", DataTypes.VARCHAR(10)))
|
||||
.notNull();
|
||||
|
||||
public static final RowType ROW_TYPE = (RowType) ROW_DATA_TYPE.getLogicalType();
|
||||
|
||||
public static final ResolvedSchema TABLE_SCHEMA = SchemaBuilder.instance()
|
||||
.fields(ROW_TYPE.getFieldNames(), ROW_DATA_TYPE.getChildren())
|
||||
.build();
|
||||
|
||||
private static final List<String> FIELDS = ROW_TYPE.getFields().stream()
|
||||
.map(RowType.RowField::asSummaryString).collect(Collectors.toList());
|
||||
|
||||
public static final DataType ROW_DATA_TYPE_WIDER = DataTypes.ROW(
|
||||
DataTypes.FIELD("uuid", DataTypes.VARCHAR(20)),// record key
|
||||
DataTypes.FIELD("name", DataTypes.VARCHAR(10)),
|
||||
DataTypes.FIELD("age", DataTypes.INT()),
|
||||
DataTypes.FIELD("salary", DataTypes.DOUBLE()),
|
||||
DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3)), // precombine field
|
||||
DataTypes.FIELD("partition", DataTypes.VARCHAR(10)))
|
||||
.notNull();
|
||||
|
||||
public static final RowType ROW_TYPE_WIDER = (RowType) ROW_DATA_TYPE_WIDER.getLogicalType();
|
||||
|
||||
public static String getCreateHoodieTableDDL(String tableName, Map<String, String> options) {
|
||||
return getCreateHoodieTableDDL(tableName, options, true, "partition");
|
||||
}
|
||||
|
||||
public static String getCreateHoodieTableDDL(
|
||||
String tableName,
|
||||
Map<String, String> options,
|
||||
boolean havePartition,
|
||||
String partitionField) {
|
||||
return getCreateHoodieTableDDL(tableName, FIELDS, options, havePartition, "uuid", partitionField);
|
||||
}
|
||||
|
||||
public static String getCreateHoodieTableDDL(
|
||||
String tableName,
|
||||
List<String> fields,
|
||||
Map<String, String> options,
|
||||
boolean havePartition,
|
||||
String pkField,
|
||||
String partitionField) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("create table ").append(tableName).append("(\n");
|
||||
for (String field : fields) {
|
||||
builder.append(" ").append(field).append(",\n");
|
||||
}
|
||||
builder.append(" PRIMARY KEY(").append(pkField).append(") NOT ENFORCED\n")
|
||||
.append(")\n");
|
||||
if (havePartition) {
|
||||
builder.append("PARTITIONED BY (`").append(partitionField).append("`)\n");
|
||||
}
|
||||
final String connector = options.computeIfAbsent("connector", k -> "hudi");
|
||||
builder.append("with (\n"
|
||||
+ " 'connector' = '").append(connector).append("'");
|
||||
options.forEach((k, v) -> builder.append(",\n")
|
||||
.append(" '").append(k).append("' = '").append(v).append("'"));
|
||||
builder.append("\n)");
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public static String getCreateHudiCatalogDDL(final String catalogName, final String catalogPath) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("create catalog ").append(catalogName).append(" with (\n");
|
||||
builder.append(" 'type' = 'hudi',\n"
|
||||
+ " 'catalog.path' = '").append(catalogPath).append("'");
|
||||
builder.append("\n)");
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public static String getFileSourceDDL(String tableName) {
|
||||
return getFileSourceDDL(tableName, "source-file.json");
|
||||
}
|
||||
|
||||
public static String getFileSourceDDL(String tableName, int checkpoints) {
|
||||
return getFileSourceDDL(tableName, "source-file.json", checkpoints);
|
||||
}
|
||||
|
||||
public static String getFileSourceDDL(String tableName, String fileName) {
|
||||
return getFileSourceDDL(tableName, fileName, 2);
|
||||
}
|
||||
|
||||
public static String getFileSourceDDL(String tableName, String fileName, int checkpoints) {
|
||||
String sourcePath = Objects.requireNonNull(Thread.currentThread()
|
||||
.getContextClassLoader().getResource(fileName)).toString();
|
||||
return "create table " + tableName + "(\n"
|
||||
+ " uuid varchar(20),\n"
|
||||
+ " name varchar(10),\n"
|
||||
+ " age int,\n"
|
||||
+ " ts timestamp(3),\n"
|
||||
+ " `partition` varchar(20)\n"
|
||||
+ ") with (\n"
|
||||
+ " 'connector' = '" + ContinuousFileSourceFactory.FACTORY_ID + "',\n"
|
||||
+ " 'path' = '" + sourcePath + "',\n"
|
||||
+ " 'checkpoints' = '" + checkpoints + "'\n"
|
||||
+ ")";
|
||||
}
|
||||
|
||||
public static String getCollectSinkDDL(String tableName) {
|
||||
return "create table " + tableName + "(\n"
|
||||
+ " uuid varchar(20),\n"
|
||||
+ " name varchar(10),\n"
|
||||
+ " age int,\n"
|
||||
+ " ts timestamp(3),\n"
|
||||
+ " `partition` varchar(20)\n"
|
||||
+ ") with (\n"
|
||||
+ " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'"
|
||||
+ ")";
|
||||
}
|
||||
|
||||
public static String getCollectSinkDDL(String tableName, TableSchema tableSchema) {
|
||||
final StringBuilder builder = new StringBuilder("create table " + tableName + "(\n");
|
||||
String[] fieldNames = tableSchema.getFieldNames();
|
||||
DataType[] fieldTypes = tableSchema.getFieldDataTypes();
|
||||
for (int i = 0; i < fieldNames.length; i++) {
|
||||
builder.append(" `")
|
||||
.append(fieldNames[i])
|
||||
.append("` ")
|
||||
.append(fieldTypes[i].toString());
|
||||
if (i != fieldNames.length - 1) {
|
||||
builder.append(",");
|
||||
}
|
||||
builder.append("\n");
|
||||
}
|
||||
final String withProps = ""
|
||||
+ ") with (\n"
|
||||
+ " 'connector' = '" + CollectSinkTableFactory.FACTORY_ID + "'\n"
|
||||
+ ")";
|
||||
builder.append(withProps);
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public static String getCsvSourceDDL(String tableName, String fileName) {
|
||||
String sourcePath = Objects.requireNonNull(Thread.currentThread()
|
||||
.getContextClassLoader().getResource(fileName)).toString();
|
||||
return "create table " + tableName + "(\n"
|
||||
+ " uuid varchar(20),\n"
|
||||
+ " name varchar(10),\n"
|
||||
+ " age int,\n"
|
||||
+ " ts timestamp(3),\n"
|
||||
+ " `partition` varchar(20)\n"
|
||||
+ ") with (\n"
|
||||
+ " 'connector' = 'filesystem',\n"
|
||||
+ " 'path' = '" + sourcePath + "',\n"
|
||||
+ " 'format' = 'csv'\n"
|
||||
+ ")";
|
||||
}
|
||||
|
||||
public static final RowDataSerializer SERIALIZER = new RowDataSerializer(ROW_TYPE);
|
||||
|
||||
public static Configuration getDefaultConf(String tablePath) {
|
||||
Configuration conf = new Configuration();
|
||||
conf.setString(FlinkOptions.PATH, tablePath);
|
||||
conf.setString(FlinkOptions.SOURCE_AVRO_SCHEMA_PATH,
|
||||
Objects.requireNonNull(Thread.currentThread()
|
||||
.getContextClassLoader().getResource("test_read_schema.avsc")).toString());
|
||||
conf.setString(FlinkOptions.TABLE_NAME, "TestHoodieTable");
|
||||
conf.setString(FlinkOptions.PARTITION_PATH_FIELD, "partition");
|
||||
return conf;
|
||||
}
|
||||
|
||||
public static FlinkStreamerConfig getDefaultStreamerConf(String tablePath) {
|
||||
FlinkStreamerConfig streamerConf = new FlinkStreamerConfig();
|
||||
streamerConf.targetBasePath = tablePath;
|
||||
streamerConf.sourceAvroSchemaPath = Objects.requireNonNull(Thread.currentThread()
|
||||
.getContextClassLoader().getResource("test_read_schema.avsc")).toString();
|
||||
streamerConf.targetTableName = "TestHoodieTable";
|
||||
streamerConf.partitionPathField = "partition";
|
||||
streamerConf.tableType = "COPY_ON_WRITE";
|
||||
streamerConf.checkpointInterval = 4000L;
|
||||
return streamerConf;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates the tool to build hoodie table DDL.
|
||||
*/
|
||||
public static Sql sql(String tableName) {
|
||||
return new Sql(tableName);
|
||||
}
|
||||
|
||||
public static Catalog catalog(String catalogName) {
|
||||
return new Catalog(catalogName);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
// Utilities
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Tool to build hoodie table DDL with schema {@link #TABLE_SCHEMA}.
|
||||
*/
|
||||
public static class Sql {
|
||||
private final Map<String, String> options;
|
||||
private final String tableName;
|
||||
private List<String> fields = new ArrayList<>();
|
||||
private boolean withPartition = true;
|
||||
private String pkField = "uuid";
|
||||
private String partitionField = "partition";
|
||||
|
||||
public Sql(String tableName) {
|
||||
options = new HashMap<>();
|
||||
this.tableName = tableName;
|
||||
}
|
||||
|
||||
public Sql option(ConfigOption<?> option, Object val) {
|
||||
this.options.put(option.key(), val.toString());
|
||||
return this;
|
||||
}
|
||||
|
||||
public Sql option(String key, Object val) {
|
||||
this.options.put(key, val.toString());
|
||||
return this;
|
||||
}
|
||||
|
||||
public Sql options(Map<String, String> options) {
|
||||
this.options.putAll(options);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Sql noPartition() {
|
||||
this.withPartition = false;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Sql pkField(String pkField) {
|
||||
this.pkField = pkField;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Sql partitionField(String partitionField) {
|
||||
this.partitionField = partitionField;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Sql field(String fieldSchema) {
|
||||
fields.add(fieldSchema);
|
||||
return this;
|
||||
}
|
||||
|
||||
public String end() {
|
||||
if (this.fields.size() == 0) {
|
||||
this.fields = FIELDS;
|
||||
}
|
||||
return QuickstartConfigurations.getCreateHoodieTableDDL(this.tableName, this.fields, options,
|
||||
this.withPartition, this.pkField, this.partitionField);
|
||||
}
|
||||
}
|
||||
|
||||
public static class Catalog {
|
||||
private final String catalogName;
|
||||
private String catalogPath = ".";
|
||||
|
||||
public Catalog(String catalogName) {
|
||||
this.catalogName = catalogName;
|
||||
}
|
||||
|
||||
public Catalog catalogPath(String catalogPath) {
|
||||
this.catalogPath = catalogPath;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String end() {
|
||||
return QuickstartConfigurations.getCreateHudiCatalogDDL(catalogName, catalogPath);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,71 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.examples.quickstart.utils;
|
||||
|
||||
import org.apache.flink.table.catalog.Column;
|
||||
import org.apache.flink.table.catalog.ResolvedSchema;
|
||||
import org.apache.flink.table.catalog.UniqueConstraint;
|
||||
import org.apache.flink.table.catalog.WatermarkSpec;
|
||||
import org.apache.flink.table.types.DataType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
/**
|
||||
* Builder for {@link ResolvedSchema}.
|
||||
*/
|
||||
public class SchemaBuilder {
|
||||
private List<Column> columns;
|
||||
private List<WatermarkSpec> watermarkSpecs;
|
||||
private UniqueConstraint constraint;
|
||||
|
||||
public static SchemaBuilder instance() {
|
||||
return new SchemaBuilder();
|
||||
}
|
||||
|
||||
private SchemaBuilder() {
|
||||
this.columns = new ArrayList<>();
|
||||
this.watermarkSpecs = new ArrayList<>();
|
||||
}
|
||||
|
||||
public SchemaBuilder field(String name, DataType type) {
|
||||
this.columns.add(Column.physical(name, type));
|
||||
return this;
|
||||
}
|
||||
|
||||
public SchemaBuilder fields(List<String> names, List<DataType> types) {
|
||||
List<Column> columns = IntStream.range(0, names.size())
|
||||
.mapToObj(idx -> Column.physical(names.get(idx), types.get(idx)))
|
||||
.collect(Collectors.toList());
|
||||
this.columns.addAll(columns);
|
||||
return this;
|
||||
}
|
||||
|
||||
public SchemaBuilder primaryKey(String... columns) {
|
||||
this.constraint = UniqueConstraint.primaryKey("pk", Arrays.asList(columns));
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResolvedSchema build() {
|
||||
return new ResolvedSchema(columns, watermarkSpecs, constraint);
|
||||
}
|
||||
}
|
||||
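A hedged usage sketch of SchemaBuilder; the column names and the Flink DataTypes import are assumptions for illustration.

// Assumes: import org.apache.flink.table.api.DataTypes;
ResolvedSchema schema = SchemaBuilder.instance()
    .field("uuid", DataTypes.STRING())   // assumed physical column
    .field("age", DataTypes.INT())       // assumed physical column
    .primaryKey("uuid")                  // declares a primary key constraint named "pk"
    .build();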
@@ -0,0 +1,18 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory
|
||||
org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory
|
||||
@@ -0,0 +1,8 @@
|
||||
{"uuid": "id1", "name": "Danny", "age": 23, "ts": "1970-01-01T00:00:01", "partition": "par1"}
|
||||
{"uuid": "id2", "name": "Stephen", "age": 33, "ts": "1970-01-01T00:00:02", "partition": "par1"}
|
||||
{"uuid": "id3", "name": "Julian", "age": 53, "ts": "1970-01-01T00:00:03", "partition": "par2"}
|
||||
{"uuid": "id4", "name": "Fabian", "age": 31, "ts": "1970-01-01T00:00:04", "partition": "par2"}
|
||||
{"uuid": "id5", "name": "Sophia", "age": 18, "ts": "1970-01-01T00:00:05", "partition": "par3"}
|
||||
{"uuid": "id6", "name": "Emma", "age": 20, "ts": "1970-01-01T00:00:06", "partition": "par3"}
|
||||
{"uuid": "id7", "name": "Bob", "age": 44, "ts": "1970-01-01T00:00:07", "partition": "par4"}
|
||||
{"uuid": "id8", "name": "Han", "age": 56, "ts": "1970-01-01T00:00:08", "partition": "par4"}
|
||||
@@ -0,0 +1,68 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.examples.quickstart;
|
||||
|
||||
import static org.apache.hudi.examples.quickstart.TestQuickstartData.assertRowsEquals;
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
import org.apache.flink.table.api.TableEnvironment;
|
||||
import org.apache.flink.test.util.AbstractTestBase;
|
||||
import org.apache.flink.types.Row;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.EnumSource;
|
||||
|
||||
/**
|
||||
* IT cases for Hoodie table source and sink.
|
||||
*/
|
||||
public class TestHoodieFlinkQuickstart extends AbstractTestBase {
|
||||
private final HoodieFlinkQuickstart flinkQuickstart = HoodieFlinkQuickstart.instance();
|
||||
|
||||
@BeforeEach
|
||||
void beforeEach() {
|
||||
flinkQuickstart.initEnv();
|
||||
}
|
||||
|
||||
@TempDir
|
||||
File tempFile;
|
||||
|
||||
@ParameterizedTest
|
||||
@EnumSource(value = HoodieTableType.class)
|
||||
void testHoodieFlinkQuickstart(HoodieTableType tableType) throws Exception {
|
||||
// create filesystem table named source
|
||||
flinkQuickstart.createFileSource();
|
||||
|
||||
// create hudi table
|
||||
flinkQuickstart.createHudiTable(tempFile.getAbsolutePath(), "t1", tableType);
|
||||
|
||||
// insert data
|
||||
List<Row> rows = flinkQuickstart.insertData();
|
||||
assertRowsEquals(rows, TestQuickstartData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT);
|
||||
|
||||
// query data
|
||||
List<Row> rows1 = flinkQuickstart.queryData();
|
||||
assertRowsEquals(rows1, TestQuickstartData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT);
|
||||
|
||||
// update data
|
||||
List<Row> rows2 = flinkQuickstart.updateData();
|
||||
assertRowsEquals(rows2, TestQuickstartData.DATA_SET_SOURCE_INSERT_LATEST_COMMIT);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,422 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.examples.quickstart;
|
||||
|
||||
import static junit.framework.TestCase.assertEquals;
|
||||
import static org.hamcrest.CoreMatchers.is;
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import java.io.File;
|
||||
import java.io.FileFilter;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Properties;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.flink.table.data.RowData;
|
||||
import org.apache.flink.table.data.StringData;
|
||||
import org.apache.flink.table.data.TimestampData;
|
||||
import org.apache.flink.table.data.binary.BinaryRowData;
|
||||
import org.apache.flink.table.data.conversion.DataStructureConverter;
|
||||
import org.apache.flink.table.data.conversion.DataStructureConverters;
|
||||
import org.apache.flink.table.data.writer.BinaryRowWriter;
|
||||
import org.apache.flink.table.data.writer.BinaryWriter;
|
||||
import org.apache.flink.table.runtime.typeutils.InternalSerializers;
|
||||
import org.apache.flink.table.types.logical.LogicalType;
|
||||
import org.apache.flink.table.types.logical.RowType;
|
||||
import org.apache.flink.types.Row;
|
||||
import org.apache.flink.types.RowKind;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.common.config.HoodieCommonConfig;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
|
||||
import org.apache.hudi.examples.quickstart.utils.QuickstartConfigurations;
|
||||
import org.apache.parquet.Strings;
|
||||
import org.apache.parquet.avro.AvroParquetReader;
|
||||
import org.apache.parquet.hadoop.ParquetReader;
|
||||
|
||||
/**
|
||||
* Data set for testing, plus some utilities to check the results.
|
||||
*/
|
||||
public class TestQuickstartData {
|
||||
|
||||
public static List<RowData> DATA_SET_INSERT_DUPLICATES = new ArrayList<>();
|
||||
|
||||
static {
|
||||
IntStream.range(0, 5).forEach(i -> DATA_SET_INSERT_DUPLICATES.add(
|
||||
insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23,
|
||||
TimestampData.fromEpochMillis(1), StringData.fromString("par1"))));
|
||||
}
|
||||
|
||||
public static List<RowData> DATA_SET_INSERT_SAME_KEY = new ArrayList<>();
|
||||
|
||||
static {
|
||||
IntStream.range(0, 5).forEach(i -> DATA_SET_INSERT_SAME_KEY.add(
|
||||
insertRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23,
|
||||
TimestampData.fromEpochMillis(i), StringData.fromString("par1"))));
|
||||
}
|
||||
|
||||
// Data set of the latest commit in source-file.json.
|
||||
public static List<RowData> DATA_SET_SOURCE_INSERT_LATEST_COMMIT = Arrays.asList(
|
||||
insertRow(StringData.fromString("id5"), StringData.fromString("Sophia"), 18,
|
||||
TimestampData.fromEpochMillis(5000), StringData.fromString("par3")),
|
||||
insertRow(StringData.fromString("id6"), StringData.fromString("Emma"), 20,
|
||||
TimestampData.fromEpochMillis(6000), StringData.fromString("par3")),
|
||||
insertRow(StringData.fromString("id7"), StringData.fromString("Bob"), 44,
|
||||
TimestampData.fromEpochMillis(7000), StringData.fromString("par4")),
|
||||
insertRow(StringData.fromString("id8"), StringData.fromString("Han"), 56,
|
||||
TimestampData.fromEpochMillis(8000), StringData.fromString("par4"))
|
||||
);
|
||||
|
||||
public static List<RowData> DATA_SET_DISORDER_UPDATE_DELETE = Arrays.asList(
|
||||
// DISORDER UPDATE
|
||||
updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 21,
|
||||
TimestampData.fromEpochMillis(3), StringData.fromString("par1")),
|
||||
updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 20,
|
||||
TimestampData.fromEpochMillis(2), StringData.fromString("par1")),
|
||||
updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 23,
|
||||
TimestampData.fromEpochMillis(1), StringData.fromString("par1")),
|
||||
updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 20,
|
||||
TimestampData.fromEpochMillis(2), StringData.fromString("par1")),
|
||||
updateAfterRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22,
|
||||
TimestampData.fromEpochMillis(4), StringData.fromString("par1")),
|
||||
updateBeforeRow(StringData.fromString("id1"), StringData.fromString("Danny"), 21,
|
||||
TimestampData.fromEpochMillis(3), StringData.fromString("par1")),
|
||||
// DISORDER DELETE
|
||||
deleteRow(StringData.fromString("id1"), StringData.fromString("Danny"), 22,
|
||||
TimestampData.fromEpochMillis(2), StringData.fromString("par1"))
|
||||
);
|
||||
|
||||
public static List<RowData> dataSetInsert(int... ids) {
|
||||
List<RowData> inserts = new ArrayList<>();
|
||||
Arrays.stream(ids).forEach(i -> inserts.add(
|
||||
insertRow(StringData.fromString("id" + i), StringData.fromString("Danny"), 23,
|
||||
TimestampData.fromEpochMillis(i), StringData.fromString("par1"))));
|
||||
return inserts;
|
||||
}
|
||||
|
||||
private static Integer toIdSafely(Object id) {
|
||||
if (id == null) {
|
||||
return -1;
|
||||
}
|
||||
final String idStr = id.toString();
|
||||
if (idStr.startsWith("id")) {
|
||||
return Integer.parseInt(idStr.substring(2));
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the string format of a list of RowData.
|
||||
*/
|
||||
public static String rowDataToString(List<RowData> rows) {
|
||||
DataStructureConverter<Object, Object> converter =
|
||||
DataStructureConverters.getConverter(QuickstartConfigurations.ROW_DATA_TYPE);
|
||||
return rows.stream()
|
||||
.sorted(Comparator.comparing(o -> toIdSafely(o.getString(0))))
|
||||
.map(row -> converter.toExternal(row).toString())
|
||||
.collect(Collectors.toList()).toString();
|
||||
}
|
||||
|
||||
private static String toStringSafely(Object obj) {
|
||||
return obj == null ? "null" : obj.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Sorts the {@code rows} using the field at index 0 and asserts
|
||||
* that it equals the expected string {@code expected}.
|
||||
*
|
||||
* @param rows Actual result rows
|
||||
* @param expected Expected string of the sorted rows
|
||||
*/
|
||||
public static void assertRowsEquals(List<Row> rows, String expected) {
|
||||
assertRowsEquals(rows, expected, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sorts the {@code rows} using the field at index 0 and asserts
|
||||
* that it equals the expected string {@code expected}.
|
||||
*
|
||||
* @param rows Actual result rows
|
||||
* @param expected Expected string of the sorted rows
|
||||
* @param withChangeFlag Whether compares with change flags
|
||||
*/
|
||||
public static void assertRowsEquals(List<Row> rows, String expected, boolean withChangeFlag) {
|
||||
String rowsString = rows.stream()
|
||||
.sorted(Comparator.comparing(o -> toStringSafely(o.getField(0))))
|
||||
.map(row -> {
|
||||
final String rowStr = row.toString();
|
||||
if (withChangeFlag) {
|
||||
return row.getKind().shortString() + "(" + rowStr + ")";
|
||||
} else {
|
||||
return rowStr;
|
||||
}
|
||||
})
|
||||
.collect(Collectors.toList()).toString();
|
||||
assertThat(rowsString, is(expected));
|
||||
}
|
||||
|
||||
/**
|
||||
* Sorts the {@code rows} using the field at index {@code orderingPos} and asserts
|
||||
* that it equals the expected string {@code expected}.
|
||||
*
|
||||
* @param rows Actual result rows
|
||||
* @param expected Expected string of the sorted rows
|
||||
* @param orderingPos Field position for ordering
|
||||
*/
|
||||
public static void assertRowsEquals(List<Row> rows, String expected, int orderingPos) {
|
||||
String rowsString = rows.stream()
|
||||
.sorted(Comparator.comparing(o -> toStringSafely(o.getField(orderingPos))))
|
||||
.collect(Collectors.toList()).toString();
|
||||
assertThat(rowsString, is(expected));
|
||||
}
|
||||
|
||||
/**
|
||||
* Sorts the {@code rows} using the field at index 0 and asserts
|
||||
* that it equals the expected row data list {@code expected}.
|
||||
*
|
||||
* @param rows Actual result rows
|
||||
* @param expected Expected row data list
|
||||
*/
|
||||
public static void assertRowsEquals(List<Row> rows, List<RowData> expected) {
|
||||
String rowsString = rows.stream()
|
||||
.sorted(Comparator.comparing(o -> toIdSafely(o.getField(0))))
|
||||
.collect(Collectors.toList()).toString();
|
||||
assertThat(rowsString, is(rowDataToString(expected)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Sorts the {@code rows} using the field at index 0 and asserts
|
||||
* that it equals the expected string {@code expected}.
|
||||
*
|
||||
* @param rows Actual result rows
|
||||
* @param expected Expected string of the sorted rows
|
||||
*/
|
||||
public static void assertRowDataEquals(List<RowData> rows, String expected) {
|
||||
String rowsString = rowDataToString(rows);
|
||||
assertThat(rowsString, is(expected));
|
||||
}
|
||||
|
||||
/**
|
||||
* Sorts the {@code rows} using the field at index 0 and asserts
|
||||
* that it equals the expected row data list {@code expected}.
|
||||
*
|
||||
* @param rows Actual result rows
|
||||
* @param expected Expected row data list
|
||||
*/
|
||||
public static void assertRowDataEquals(List<RowData> rows, List<RowData> expected) {
|
||||
String rowsString = rowDataToString(rows);
|
||||
assertThat(rowsString, is(rowDataToString(expected)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks that the source data set is written as expected.
|
||||
*
|
||||
* <p>Note: Replace it with the Flink reader when it is supported.
|
||||
*
|
||||
* @param baseFile The file base to check, should be a directory
|
||||
* @param expected The expected results mapping, the key should be the partition path
|
||||
* and the value should be the expected list of values for that partition
|
||||
*/
|
||||
public static void checkWrittenData(File baseFile, Map<String, String> expected) throws IOException {
|
||||
checkWrittenData(baseFile, expected, 4);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks that the source data set is written as expected.
|
||||
*
|
||||
* <p>Note: Replace it with the Flink reader when it is supported.
|
||||
*
|
||||
* @param baseFile The file base to check, should be a directory
|
||||
* @param expected The expected results mapping, the key should be the partition path
|
||||
* and the value should be the expected list of values for that partition
|
||||
* @param partitions The expected partition number
|
||||
*/
|
||||
public static void checkWrittenData(
|
||||
File baseFile,
|
||||
Map<String, String> expected,
|
||||
int partitions) throws IOException {
|
||||
assert baseFile.isDirectory();
|
||||
FileFilter filter = file -> !file.getName().startsWith(".");
|
||||
File[] partitionDirs = baseFile.listFiles(filter);
|
||||
assertNotNull(partitionDirs);
|
||||
assertThat(partitionDirs.length, is(partitions));
|
||||
for (File partitionDir : partitionDirs) {
|
||||
File[] dataFiles = partitionDir.listFiles(filter);
|
||||
assertNotNull(dataFiles);
|
||||
File latestDataFile = Arrays.stream(dataFiles)
|
||||
.max(Comparator.comparing(f -> FSUtils.getCommitTime(f.getName())))
|
||||
.orElse(dataFiles[0]);
|
||||
ParquetReader<GenericRecord> reader = AvroParquetReader
|
||||
.<GenericRecord>builder(new Path(latestDataFile.getAbsolutePath())).build();
|
||||
List<String> readBuffer = new ArrayList<>();
|
||||
GenericRecord nextRecord = reader.read();
|
||||
while (nextRecord != null) {
|
||||
readBuffer.add(filterOutVariables(nextRecord));
|
||||
nextRecord = reader.read();
|
||||
}
|
||||
readBuffer.sort(Comparator.naturalOrder());
|
||||
assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName())));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks that the MERGE_ON_READ source data set is written as expected.
|
||||
*
|
||||
* <p>Note: Replace it with the Flink reader when it is supported.
|
||||
*
|
||||
* @param fs The file system
|
||||
* @param latestInstant The latest committed instant of current table
|
||||
* @param baseFile The file base to check, should be a directory
|
||||
* @param expected The expected results mapping, the key should be the partition path
|
||||
* @param partitions The expected partition number
|
||||
* @param schema The read schema
|
||||
*/
|
||||
public static void checkWrittenDataMOR(
|
||||
FileSystem fs,
|
||||
String latestInstant,
|
||||
File baseFile,
|
||||
Map<String, String> expected,
|
||||
int partitions,
|
||||
Schema schema) {
|
||||
assert baseFile.isDirectory() : "Base path should be a directory";
|
||||
FileFilter partitionFilter = file -> !file.getName().startsWith(".");
|
||||
File[] partitionDirs = baseFile.listFiles(partitionFilter);
|
||||
assertNotNull(partitionDirs);
|
||||
assertThat(partitionDirs.length, is(partitions));
|
||||
for (File partitionDir : partitionDirs) {
|
||||
File[] dataFiles = partitionDir.listFiles(file ->
|
||||
file.getName().contains(".log.") && !file.getName().startsWith(".."));
|
||||
assertNotNull(dataFiles);
|
||||
HoodieMergedLogRecordScanner scanner = getScanner(
|
||||
fs, baseFile.getPath(), Arrays.stream(dataFiles).map(File::getAbsolutePath)
|
||||
.sorted(Comparator.naturalOrder()).collect(Collectors.toList()),
|
||||
schema, latestInstant);
|
||||
List<String> readBuffer = scanner.getRecords().values().stream()
|
||||
.map(hoodieRecord -> {
|
||||
try {
|
||||
// in case it is a delete
|
||||
GenericRecord record = (GenericRecord) hoodieRecord.getData()
|
||||
.getInsertValue(schema, new Properties())
|
||||
.orElse(null);
|
||||
return record == null ? (String) null : filterOutVariables(record);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.sorted(Comparator.naturalOrder())
|
||||
.collect(Collectors.toList());
|
||||
assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName())));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the scanner to read avro log files.
|
||||
*/
|
||||
private static HoodieMergedLogRecordScanner getScanner(
|
||||
FileSystem fs,
|
||||
String basePath,
|
||||
List<String> logPaths,
|
||||
Schema readSchema,
|
||||
String instant) {
|
||||
return HoodieMergedLogRecordScanner.newBuilder()
|
||||
.withFileSystem(fs)
|
||||
.withBasePath(basePath)
|
||||
.withLogFilePaths(logPaths)
|
||||
.withReaderSchema(readSchema)
|
||||
.withLatestInstantTime(instant)
|
||||
.withReadBlocksLazily(false)
|
||||
.withReverseReader(false)
|
||||
.withBufferSize(16 * 1024 * 1024)
|
||||
.withMaxMemorySizeInBytes(1024 * 1024L)
|
||||
.withSpillableMapBasePath("/tmp/")
|
||||
.withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
|
||||
.withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
|
||||
.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Filters out variable fields such as the file name.
|
||||
*/
|
||||
private static String filterOutVariables(GenericRecord genericRecord) {
|
||||
List<String> fields = new ArrayList<>();
|
||||
fields.add(genericRecord.get("_hoodie_record_key").toString());
|
||||
fields.add(genericRecord.get("_hoodie_partition_path").toString());
|
||||
fields.add(genericRecord.get("uuid").toString());
|
||||
fields.add(genericRecord.get("name").toString());
|
||||
fields.add(genericRecord.get("age").toString());
|
||||
fields.add(genericRecord.get("ts").toString());
|
||||
fields.add(genericRecord.get("partition").toString());
|
||||
return Strings.join(fields, ",");
|
||||
}
|
||||
|
||||
public static BinaryRowData insertRow(Object... fields) {
|
||||
return insertRow(QuickstartConfigurations.ROW_TYPE, fields);
|
||||
}
|
||||
|
||||
public static BinaryRowData insertRow(RowType rowType, Object... fields) {
|
||||
LogicalType[] types = rowType.getFields().stream().map(RowType.RowField::getType)
|
||||
.toArray(LogicalType[]::new);
|
||||
assertEquals(
|
||||
"Filed count inconsistent with type information",
|
||||
fields.length,
|
||||
types.length);
|
||||
BinaryRowData row = new BinaryRowData(fields.length);
|
||||
BinaryRowWriter writer = new BinaryRowWriter(row);
|
||||
writer.reset();
|
||||
for (int i = 0; i < fields.length; i++) {
|
||||
Object field = fields[i];
|
||||
if (field == null) {
|
||||
writer.setNullAt(i);
|
||||
} else {
|
||||
BinaryWriter.write(writer, i, field, types[i], InternalSerializers.create(types[i]));
|
||||
}
|
||||
}
|
||||
writer.complete();
|
||||
return row;
|
||||
}
|
||||
|
||||
private static BinaryRowData deleteRow(Object... fields) {
|
||||
BinaryRowData rowData = insertRow(fields);
|
||||
rowData.setRowKind(RowKind.DELETE);
|
||||
return rowData;
|
||||
}
|
||||
|
||||
private static BinaryRowData updateBeforeRow(Object... fields) {
|
||||
BinaryRowData rowData = insertRow(fields);
|
||||
rowData.setRowKind(RowKind.UPDATE_BEFORE);
|
||||
return rowData;
|
||||
}
|
||||
|
||||
private static BinaryRowData updateAfterRow(Object... fields) {
|
||||
BinaryRowData rowData = insertRow(fields);
|
||||
rowData.setRowKind(RowKind.UPDATE_AFTER);
|
||||
return rowData;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
org.apache.hudi.examples.quickstart.factory.ContinuousFileSourceFactory
|
||||
org.apache.hudi.examples.quickstart.factory.CollectSinkTableFactory
|
||||
@@ -0,0 +1,30 @@
|
||||
###
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
###
|
||||
log4j.rootLogger=WARN, CONSOLE
|
||||
log4j.logger.org.apache.hudi=DEBUG
|
||||
log4j.logger.org.apache.hadoop.hbase=ERROR
|
||||
|
||||
# CONSOLE is set to be a ConsoleAppender.
|
||||
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
|
||||
# CONSOLE uses PatternLayout.
|
||||
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
|
||||
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
|
||||
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
|
||||
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
|
||||
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
|
||||
@@ -0,0 +1,31 @@
|
||||
###
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
###
|
||||
log4j.rootLogger=INFO, CONSOLE
|
||||
log4j.logger.org.apache=INFO
|
||||
log4j.logger.org.apache.hudi=DEBUG
|
||||
log4j.logger.org.apache.hadoop.hbase=ERROR
|
||||
|
||||
# CONSOLE is set to be a ConsoleAppender.
|
||||
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
|
||||
# CONSOLE uses PatternLayout.
|
||||
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
|
||||
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
|
||||
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
|
||||
log4j.appender.CONSOLE.filter.a.LevelMin=INFO
|
||||
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
|
||||
129
hudi-examples/hudi-examples-java/pom.xml
Normal file
@@ -0,0 +1,129 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>hudi-examples</artifactId>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<version>0.11.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>hudi-examples-java</artifactId>
|
||||
|
||||
<properties>
|
||||
<main.basedir>${project.parent.basedir}</main.basedir>
|
||||
<checkstyle.skip>true</checkstyle.skip>
|
||||
</properties>
|
||||
|
||||
<build>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>src/main/resources</directory>
|
||||
</resource>
|
||||
</resources>
|
||||
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>copy-dependencies</id>
|
||||
<phase>prepare-package</phase>
|
||||
<goals>
|
||||
<goal>copy-dependencies</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>${project.build.directory}/lib</outputDirectory>
|
||||
<overWriteReleases>true</overWriteReleases>
|
||||
<overWriteSnapshots>true</overWriteSnapshots>
|
||||
<overWriteIfNewer>true</overWriteIfNewer>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>process-resources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>compile</phase>
|
||||
<goals>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>test-jar</goal>
|
||||
</goals>
|
||||
<phase>test-compile</phase>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<skip>false</skip>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.rat</groupId>
|
||||
<artifactId>apache-rat-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-examples-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-client-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-java-client</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
283
hudi-examples/hudi-examples-spark/pom.xml
Normal file
@@ -0,0 +1,283 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>hudi-examples</artifactId>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<version>0.11.0-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>hudi-examples-spark</artifactId>
|
||||
|
||||
<properties>
|
||||
<main.basedir>${project.parent.basedir}</main.basedir>
|
||||
<checkstyle.skip>true</checkstyle.skip>
|
||||
</properties>
|
||||
|
||||
<build>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>src/main/resources</directory>
|
||||
</resource>
|
||||
</resources>
|
||||
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>copy-dependencies</id>
|
||||
<phase>prepare-package</phase>
|
||||
<goals>
|
||||
<goal>copy-dependencies</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>${project.build.directory}/lib</outputDirectory>
|
||||
<overWriteReleases>true</overWriteReleases>
|
||||
<overWriteSnapshots>true</overWriteSnapshots>
|
||||
<overWriteIfNewer>true</overWriteIfNewer>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>process-resources</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<phase>compile</phase>
|
||||
<goals>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-jar-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>test-jar</goal>
|
||||
</goals>
|
||||
<phase>test-compile</phase>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<skip>false</skip>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.rat</groupId>
|
||||
<artifactId>apache-rat-plugin</artifactId>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.scala-lang</groupId>
|
||||
<artifactId>scala-library</artifactId>
|
||||
<version>${scala.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-examples-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>*</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-cli</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-client-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-java-client</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-spark-client</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-utilities_${scala.binary.version}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-spark_${scala.binary.version}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-spark-common_${scala.binary.version}</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-hadoop-mr</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-timeline-service</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Spark -->
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-avro_${scala.binary.version}</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- Parquet -->
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-hadoop</artifactId>
|
||||
<version>${parquet.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Avro -->
|
||||
<dependency>
|
||||
<groupId>org.apache.avro</groupId>
|
||||
<artifactId>avro</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.parquet</groupId>
|
||||
<artifactId>parquet-avro</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- Hive -->
|
||||
<dependency>
|
||||
<groupId>${hive.groupid}</groupId>
|
||||
<artifactId>hive-common</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${hive.groupid}</groupId>
|
||||
<artifactId>hive-exec</artifactId>
|
||||
<version>${hive.version}</version>
|
||||
<scope>provided</scope>
|
||||
<classifier>${hive.exec.classifier}</classifier>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>javax.mail</groupId>
|
||||
<artifactId>mail</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.eclipse.jetty.aggregate</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<!-- Junit dependencies -->
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-api</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-engine</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.vintage</groupId>
|
||||
<artifactId>junit-vintage-engine</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter-params</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mockito</groupId>
|
||||
<artifactId>mockito-junit-jupiter</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<!-- Hudi Tests -->
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-client-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<classifier>tests</classifier>
|
||||
<type>test-jar</type>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hudi</groupId>
|
||||
<artifactId>hudi-spark-client</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<classifier>tests</classifier>
|
||||
<type>test-jar</type>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
||||
@@ -23,7 +23,6 @@ import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.utilities.schema.SchemaProvider;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
|
||||
/**
|
||||
* The example SchemaProvider for the example JSON data from Uber.
|
||||
*/
|
||||
@@ -0,0 +1,227 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.examples.quickstart;
|
||||
|
||||
import static org.apache.hudi.config.HoodieWriteConfig.TBL_NAME;
|
||||
import static org.apache.spark.sql.SaveMode.Append;
|
||||
import static org.apache.spark.sql.SaveMode.Overwrite;
|
||||
import java.util.List;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.hudi.common.model.HoodieAvroPayload;
|
||||
import org.apache.hudi.common.model.WriteOperationType;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.examples.common.HoodieExampleDataGenerator;
|
||||
import org.apache.hudi.examples.common.HoodieExampleSparkUtils;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
public final class HoodieSparkQuickstart {
|
||||
|
||||
private HoodieSparkQuickstart() {
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
if (args.length < 2) {
|
||||
System.err.println("Usage: HoodieWriteClientExample <tablePath> <tableName>");
|
||||
System.exit(1);
|
||||
}
|
||||
String tablePath = args[0];
|
||||
String tableName = args[1];
|
||||
|
||||
SparkSession spark = HoodieExampleSparkUtils.defaultSparkSession("Hudi Spark basic example");
|
||||
SparkConf sparkConf = HoodieExampleSparkUtils.defaultSparkConf("hoodie-client-example");
|
||||
|
||||
try (JavaSparkContext jsc = new JavaSparkContext(sparkConf)) {
|
||||
final HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();
|
||||
|
||||
insertData(spark, jsc, tablePath, tableName, dataGen);
|
||||
updateData(spark, jsc, tablePath, tableName, dataGen);
|
||||
queryData(spark, jsc, tablePath, tableName, dataGen);
|
||||
|
||||
incrementalQuery(spark, tablePath, tableName);
|
||||
pointInTimeQuery(spark, tablePath, tableName);
|
||||
|
||||
delete(spark, tablePath, tableName);
|
||||
deleteByPartition(spark, tablePath, tableName);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate some new trips, load them into a DataFrame and write the DataFrame into the Hudi dataset as below.
|
||||
*/
|
||||
public static void insertData(SparkSession spark, JavaSparkContext jsc, String tablePath, String tableName,
|
||||
HoodieExampleDataGenerator<HoodieAvroPayload> dataGen) {
|
||||
String commitTime = Long.toString(System.currentTimeMillis());
|
||||
List<String> inserts = dataGen.convertToStringList(dataGen.generateInserts(commitTime, 20));
|
||||
Dataset<Row> df = spark.read().json(jsc.parallelize(inserts, 1));
|
||||
df.write().format("org.apache.hudi").
|
||||
options(QuickstartUtils.getQuickstartWriteConfigs()).
|
||||
option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts").
|
||||
option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid").
|
||||
option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath").
|
||||
option(TBL_NAME.key(), tableName).
|
||||
mode(Overwrite).
|
||||
save(tablePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the data files into a DataFrame.
|
||||
*/
|
||||
public static void queryData(SparkSession spark, JavaSparkContext jsc, String tablePath, String tableName,
|
||||
HoodieExampleDataGenerator<HoodieAvroPayload> dataGen) {
|
||||
Dataset<Row> roViewDF = spark.
|
||||
read().
|
||||
format("org.apache.hudi").
|
||||
load(tablePath + "/*/*/*/*");
|
||||
|
||||
roViewDF.createOrReplaceTempView("hudi_ro_table");
|
||||
|
||||
spark.sql("select fare, begin_lon, begin_lat, ts from hudi_ro_table where fare > 20.0").show();
|
||||
// +-----------------+-------------------+-------------------+---+
|
||||
// | fare| begin_lon| begin_lat| ts|
|
||||
// +-----------------+-------------------+-------------------+---+
|
||||
// |98.88075495133515|0.39556048623031603|0.17851135255091155|0.0|
|
||||
// ...
|
||||
|
||||
spark.sql(
|
||||
"select _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare from hudi_ro_table")
|
||||
.show();
|
||||
// +-------------------+--------------------+----------------------+-------------------+--------------------+------------------+
|
||||
// |_hoodie_commit_time| _hoodie_record_key|_hoodie_partition_path| rider| driver| fare|
|
||||
// +-------------------+--------------------+----------------------+-------------------+--------------------+------------------+
|
||||
// | 20191231181501|31cafb9f-0196-4b1...| 2020/01/02|rider-1577787297889|driver-1577787297889| 98.88075495133515|
|
||||
// ...
|
||||
}
|
||||
|
||||
/**
|
||||
* This is similar to inserting new data. Generate updates to existing trips using the data generator,
|
||||
* load them into a DataFrame and write the DataFrame into the Hudi dataset.
|
||||
*/
|
||||
public static void updateData(SparkSession spark, JavaSparkContext jsc, String tablePath, String tableName,
|
||||
HoodieExampleDataGenerator<HoodieAvroPayload> dataGen) {
|
||||
|
||||
String commitTime = Long.toString(System.currentTimeMillis());
|
||||
List<String> updates = dataGen.convertToStringList(dataGen.generateUpdates(commitTime, 10));
|
||||
Dataset<Row> df = spark.read().json(jsc.parallelize(updates, 1));
|
||||
df.write().format("org.apache.hudi").
|
||||
options(QuickstartUtils.getQuickstartWriteConfigs()).
|
||||
option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts").
|
||||
option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid").
|
||||
option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath").
|
||||
option(TBL_NAME.key(), tableName).
|
||||
mode(Append).
|
||||
save(tablePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete data based on record information read from the table.
|
||||
*/
|
||||
public static void delete(SparkSession spark, String tablePath, String tableName) {
|
||||
|
||||
Dataset<Row> roViewDF = spark.read().format("org.apache.hudi").load(tablePath + "/*/*/*/*");
|
||||
roViewDF.createOrReplaceTempView("hudi_ro_table");
|
||||
Dataset<Row> df = spark.sql("select uuid, partitionpath, ts from hudi_ro_table limit 2");
|
||||
|
||||
df.write().format("org.apache.hudi").
|
||||
options(QuickstartUtils.getQuickstartWriteConfigs()).
|
||||
option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts").
|
||||
option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "uuid").
|
||||
option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath").
|
||||
option(TBL_NAME.key(), tableName).
|
||||
option("hoodie.datasource.write.operation", WriteOperationType.DELETE.value()).
|
||||
mode(Append).
|
||||
save(tablePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete the data of a single or multiple partitions.
|
||||
*/
|
||||
public static void deleteByPartition(SparkSession spark, String tablePath, String tableName) {
|
||||
Dataset<Row> df = spark.emptyDataFrame();
|
||||
df.write().format("org.apache.hudi").
|
||||
options(QuickstartUtils.getQuickstartWriteConfigs()).
|
||||
option(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "ts").
|
||||
option(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "uuid").
|
||||
option(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "partitionpath").
|
||||
option(TBL_NAME.key(), tableName).
|
||||
option("hoodie.datasource.write.operation", WriteOperationType.DELETE.value()).
|
||||
option("hoodie.datasource.write.partitions.to.delete",
|
||||
ArrayUtils.toString(HoodieExampleDataGenerator.DEFAULT_PARTITION_PATHS, ",")).
|
||||
mode(Append).
|
||||
save(tablePath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hudi also provides the capability to obtain a stream of records that changed since a given commit timestamp.
|
||||
* This can be achieved using Hudi’s incremental view and providing a begin time from which changes need to be streamed.
|
||||
* We do not need to specify endTime if we want all changes after the given commit (as is the common case).
|
||||
*/
|
||||
public static void incrementalQuery(SparkSession spark, String tablePath, String tableName) {
|
||||
List<String> commits =
|
||||
spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime")
|
||||
.toJavaRDD()
|
||||
.map((Function<Row, String>) row -> row.getString(0))
|
||||
.take(50);
|
||||
|
||||
String beginTime = commits.get(commits.size() - 2); // commit time we are interested in
|
||||
|
||||
// incrementally query data
|
||||
Dataset<Row> incViewDF = spark.
|
||||
read().
|
||||
format("org.apache.hudi").
|
||||
option("hoodie.datasource.query.type", "incremental").
|
||||
option("hoodie.datasource.read.begin.instanttime", beginTime).
|
||||
load(tablePath);
|
||||
|
||||
incViewDF.createOrReplaceTempView("hudi_incr_table");
|
||||
spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0")
|
||||
.show();
|
||||
}
|
||||
|
||||
/**
|
||||
* Let's look at how to query data as of a specific time.
|
||||
* The specific time can be represented by pointing endTime to a specific commit time
|
||||
* and beginTime to “000” (denoting earliest possible commit time).
|
||||
*/
|
||||
public static void pointInTimeQuery(SparkSession spark, String tablePath, String tableName) {
|
||||
List<String> commits =
|
||||
spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime")
|
||||
.toJavaRDD()
|
||||
.map((Function<Row, String>) row -> row.getString(0))
|
||||
.take(50);
|
||||
String beginTime = "000"; // Represents all commits > this time.
|
||||
String endTime = commits.get(commits.size() - 2); // commit time we are interested in
|
||||
|
||||
//incrementally query data
|
||||
Dataset<Row> incViewDF = spark.read().format("org.apache.hudi").
|
||||
option("hoodie.datasource.query.type", "incremental").
|
||||
option("hoodie.datasource.read.begin.instanttime", beginTime).
|
||||
option("hoodie.datasource.read.end.instanttime", endTime).
|
||||
load(tablePath);
|
||||
|
||||
incViewDF.createOrReplaceTempView("hudi_incr_table");
|
||||
spark.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_incr_table where fare > 20.0")
|
||||
.show();
|
||||
}
|
||||
}
|
||||
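As a hedged sketch, the example above could be driven end to end from a small main class like the one below; the class name, table path, and table name are placeholder values for illustration.

// Hypothetical driver: runs insert, update, query, incremental query,
// point-in-time query, delete and delete-by-partition in sequence.
public final class RunSparkQuickstartSketch {
  public static void main(String[] args) {
    HoodieSparkQuickstart.main(new String[] {
        "file:///tmp/hoodie/quickstart_table",  // placeholder table path
        "hoodie_quickstart_table"               // placeholder table name
    });
  }
}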
@@ -0,0 +1,249 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.examples.quickstart;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.model.HoodieAvroRecord;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.sql.Row;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.UUID;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
|
||||
* Class to be used in the quickstart guide for generating inserts and updates against a corpus. Test data uses a toy Uber
|
||||
* trips data model.
|
||||
*/
|
||||
public class QuickstartUtils {
|
||||
|
||||
  public static class DataGenerator {
    private static final String DEFAULT_FIRST_PARTITION_PATH = "americas/united_states/san_francisco";
    private static final String DEFAULT_SECOND_PARTITION_PATH = "americas/brazil/sao_paulo";
    private static final String DEFAULT_THIRD_PARTITION_PATH = "asia/india/chennai";

    private static final String[] DEFAULT_PARTITION_PATHS =
        {DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH, DEFAULT_THIRD_PARTITION_PATH};
    static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\",\"name\": \"triprec\",\"fields\": [ "
        + "{\"name\": \"ts\",\"type\": \"long\"},{\"name\": \"uuid\", \"type\": \"string\"},"
        + "{\"name\": \"rider\", \"type\": \"string\"},{\"name\": \"driver\", \"type\": \"string\"},"
        + "{\"name\": \"begin_lat\", \"type\": \"double\"},{\"name\": \"begin_lon\", \"type\": \"double\"},"
        + "{\"name\": \"end_lat\", \"type\": \"double\"},{\"name\": \"end_lon\", \"type\": \"double\"},"
        + "{\"name\":\"fare\",\"type\": \"double\"}]}";
    static Schema avroSchema = new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA);

    private static Random rand = new Random(46474747);

    private final Map<Integer, HoodieKey> existingKeys;
    private final String[] partitionPaths;
    private int numExistingKeys;

    public DataGenerator() {
      this(DEFAULT_PARTITION_PATHS, new HashMap<>());
    }

    public DataGenerator(String[] partitionPaths) {
      this(partitionPaths, new HashMap<>());
    }

    private DataGenerator(String[] partitionPaths, Map<Integer, HoodieKey> keyPartitionMap) {
      this.partitionPaths = Arrays.copyOf(partitionPaths, partitionPaths.length);
      this.existingKeys = keyPartitionMap;
    }

    private static String generateRandomString() {
      int leftLimit = 48; // ascii for 0
      int rightLimit = 57; // ascii for 9
      int stringLength = 3;
      StringBuilder buffer = new StringBuilder(stringLength);
      for (int i = 0; i < stringLength; i++) {
        int randomLimitedInt = leftLimit + (int) (rand.nextFloat() * (rightLimit - leftLimit + 1));
        buffer.append((char) randomLimitedInt);
      }
      return buffer.toString();
    }

    public int getNumExistingKeys() {
      return numExistingKeys;
    }

    public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName,
                                                      long timestamp) {
      GenericRecord rec = new GenericData.Record(avroSchema);
      rec.put("uuid", rowKey);
      rec.put("ts", timestamp);
      rec.put("rider", riderName);
      rec.put("driver", driverName);
      rec.put("begin_lat", rand.nextDouble());
      rec.put("begin_lon", rand.nextDouble());
      rec.put("end_lat", rand.nextDouble());
      rec.put("end_lon", rand.nextDouble());
      rec.put("fare", rand.nextDouble() * 100);
      return rec;
    }

    /**
     * Generates a new avro record of the above schema format, retaining the key if optionally provided. The
     * riderDriverSuffix string is a random String used to simulate updates by changing the rider and driver fields
     * for records belonging to the same commit. It is purely for demo purposes. In the real world, the actual
     * updates are assumed to be provided based on the application requirements.
     */
    public static OverwriteWithLatestAvroPayload generateRandomValue(HoodieKey key, String riderDriverSuffix)
        throws IOException {
      // The generated timestamp is limited to the range from 7 days ago to now, to avoid generating too many
      // partitionPaths when users use the timestamp as the partitionPath field.
      GenericRecord rec =
          generateGenericRecord(key.getRecordKey(), "rider-" + riderDriverSuffix, "driver-"
              + riderDriverSuffix, generateRangeRandomTimestamp(7));
      return new OverwriteWithLatestAvroPayload(Option.of(rec));
    }

    /**
     * Generates a random timestamp between {@code daysTillNow} days ago and now.
     */
    private static long generateRangeRandomTimestamp(int daysTillNow) {
      long maxIntervalMillis = daysTillNow * 24 * 60 * 60 * 1000L;
      return System.currentTimeMillis() - (long) (Math.random() * maxIntervalMillis);
    }

    /**
     * Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys.
     */
    public Stream<HoodieRecord> generateInsertsStream(String randomString, Integer n) {
      int currSize = getNumExistingKeys();

      return IntStream.range(0, n).boxed().map(i -> {
        String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)];
        HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath);
        existingKeys.put(currSize + i, key);
        numExistingKeys++;
        try {
          return new HoodieAvroRecord(key, generateRandomValue(key, randomString));
        } catch (IOException e) {
          throw new HoodieIOException(e.getMessage(), e);
        }
      });
    }

    /**
     * Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys.
     */
    public List<HoodieRecord> generateInserts(Integer n) throws IOException {
      String randomString = generateRandomString();
      return generateInsertsStream(randomString, n).collect(Collectors.toList());
    }

    public HoodieRecord generateUpdateRecord(HoodieKey key, String randomString) throws IOException {
      return new HoodieAvroRecord(key, generateRandomValue(key, randomString));
    }

    /**
     * Generates new updates, randomly distributed across the keys above. There can be duplicates within the returned
     * list.
     *
     * @param n Number of updates (including duplicates)
     * @return list of hoodie record updates
     */
    public List<HoodieRecord> generateUpdates(Integer n) {
      if (numExistingKeys == 0) {
        throw new HoodieException("Data must have been written before performing the update operation");
      }
      String randomString = generateRandomString();
      return IntStream.range(0, n).boxed().map(x -> {
        try {
          return generateUpdateRecord(existingKeys.get(rand.nextInt(numExistingKeys)), randomString);
        } catch (IOException e) {
          throw new HoodieIOException(e.getMessage(), e);
        }
      }).collect(Collectors.toList());
    }

    /**
     * Generates delete records for the passed-in rows.
     *
     * @param rows List of {@link Row}s for which delete records need to be generated
     * @return list of hoodie records to delete
     */
    public List<String> generateDeletes(List<Row> rows) {
      // if row.length() == 2, the record contains only the "uuid" and "partitionpath" fields;
      // otherwise the additional "ts" field is available
      return rows.stream().map(row -> row.length() == 2
          ? convertToString(row.getAs("uuid"), row.getAs("partitionpath"), null) :
          convertToString(row.getAs("uuid"), row.getAs("partitionpath"), row.getAs("ts"))
      ).filter(os -> os.isPresent()).map(os -> os.get())
          .collect(Collectors.toList());
    }

    public void close() {
      existingKeys.clear();
    }
  }

  private static Option<String> convertToString(HoodieRecord record) {
    try {
      String str = HoodieAvroUtils
          .bytesToAvro(((OverwriteWithLatestAvroPayload) record.getData()).recordBytes, DataGenerator.avroSchema)
          .toString();
      str = "{" + str.substring(str.indexOf("\"ts\":"));
      return Option.of(str.replaceAll("}", ", \"partitionpath\": \"" + record.getPartitionPath() + "\"}"));
    } catch (IOException e) {
      return Option.empty();
    }
  }

  private static Option<String> convertToString(String uuid, String partitionPath, Long ts) {
    StringBuffer stringBuffer = new StringBuffer();
    stringBuffer.append("{");
    stringBuffer.append("\"ts\": \"" + (ts == null ? "0.0" : ts) + "\",");
    stringBuffer.append("\"uuid\": \"" + uuid + "\",");
    stringBuffer.append("\"partitionpath\": \"" + partitionPath + "\"");
    stringBuffer.append("}");
    return Option.of(stringBuffer.toString());
  }

  public static List<String> convertToStringList(List<HoodieRecord> records) {
    return records.stream().map(hr -> convertToString(hr)).filter(os -> os.isPresent()).map(os -> os.get())
        .collect(Collectors.toList());
  }

  public static Map<String, String> getQuickstartWriteConfigs() {
    Map<String, String> demoConfigs = new HashMap<>();
    demoConfigs.put("hoodie.insert.shuffle.parallelism", "2");
    demoConfigs.put("hoodie.upsert.shuffle.parallelism", "2");
    demoConfigs.put("hoodie.bulkinsert.shuffle.parallelism", "2");
    demoConfigs.put("hoodie.delete.shuffle.parallelism", "2");
    return demoConfigs;
  }
}
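For orientation, here is a minimal sketch (not part of this commit) of how these helpers are typically wired together: DataGenerator produces demo records, convertToStringList turns them into JSON strings, and getQuickstartWriteConfigs supplies the demo parallelism settings. The driver class name and method are hypothetical, and the string-keyed options used are the standard Hudi datasource write options.

import java.io.IOException;
import java.util.List;

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.examples.quickstart.QuickstartUtils;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class QuickstartUtilsUsageSketch {

  // Hypothetical driver, for illustration only: generate demo trips, convert them to JSON,
  // and write them to a Hudi table using the demo parallelism settings above.
  public static void writeDemoTable(SparkSession spark, String tablePath, String tableName) throws IOException {
    QuickstartUtils.DataGenerator dataGen = new QuickstartUtils.DataGenerator();
    try {
      List<HoodieRecord> inserts = dataGen.generateInserts(10);
      List<String> json = QuickstartUtils.convertToStringList(inserts);
      Dataset<Row> df = spark.read().json(spark.createDataset(json, Encoders.STRING()));

      df.write().format("org.apache.hudi")
          .options(QuickstartUtils.getQuickstartWriteConfigs())
          // standard Hudi datasource write options; the field names match the generated records
          .option("hoodie.datasource.write.recordkey.field", "uuid")
          .option("hoodie.datasource.write.partitionpath.field", "partitionpath")
          .option("hoodie.datasource.write.precombine.field", "ts")
          .option("hoodie.table.name", tableName)
          .mode(SaveMode.Overwrite)
          .save(tablePath);
    } finally {
      dataGen.close();
    }
  }
}

The record key, partition path, and precombine fields mirror the uuid, partitionpath, and ts fields emitted by convertToString above.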
@@ -20,7 +20,7 @@ package org.apache.hudi.examples.spark

import org.apache.hudi.DataSourceReadOptions.{BEGIN_INSTANTTIME, END_INSTANTTIME, QUERY_TYPE, QUERY_TYPE_INCREMENTAL_OPT_VAL}
import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD, PARTITIONS_TO_DELETE, OPERATION, DELETE_PARTITION_OPERATION_OPT_VAL, DELETE_OPERATION_OPT_VAL}
import org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs
import org.apache.hudi.examples.quickstart.QuickstartUtils.getQuickstartWriteConfigs
import org.apache.hudi.common.model.HoodieAvroPayload
import org.apache.hudi.config.HoodieWriteConfig.TBL_NAME
import org.apache.hudi.examples.common.{HoodieExampleDataGenerator, HoodieExampleSparkUtils}
@@ -172,7 +172,7 @@ object HoodieDataSourceExample {
   * This can be achieved using Hudi’s incremental view and providing a begin time from which changes need to be streamed.
   * We do not need to specify endTime, if we want all changes after the given commit (as is the common case).
   */
  def incrementalQuery(spark: SparkSession, tablePath: String, tableName: String) {
  def incrementalQuery(spark: SparkSession, tablePath: String, tableName: String): Unit = {
    import spark.implicits._
    val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime").map(k => k.getString(0)).take(50)
    val beginTime = commits(commits.length - 2) // commit time we are interested in
@@ -193,7 +193,7 @@ object HoodieDataSourceExample {
   * The specific time can be represented by pointing endTime to a specific commit time
   * and beginTime to “000” (denoting earliest possible commit time).
   */
  def pointInTimeQuery(spark: SparkSession, tablePath: String, tableName: String) {
  def pointInTimeQuery(spark: SparkSession, tablePath: String, tableName: String): Unit = {
    import spark.implicits._
    val commits = spark.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_ro_table order by commitTime").map(k => k.getString(0)).take(50)
    val beginTime = "000" // Represents all commits > this time.
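The scaladoc above notes that endTime can be left out when every change after the given commit is wanted. As a hedged sketch (not part of this commit; the helper name is hypothetical and it simply reuses the string-keyed datasource options shown in the Java example earlier in this change), an open-ended incremental read could look like:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class IncrementalReadSketch {

  // Hypothetical helper: read every commit made strictly after beginTime.
  // Omitting hoodie.datasource.read.end.instanttime lets the query run up to the
  // latest commit on the table, which the scaladoc describes as the common case.
  public static Dataset<Row> readSince(SparkSession spark, String tablePath, String beginTime) {
    return spark.read().format("org.apache.hudi")
        .option("hoodie.datasource.query.type", "incremental")
        .option("hoodie.datasource.read.begin.instanttime", beginTime)
        .load(tablePath);
  }
}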
@@ -20,7 +20,7 @@
package org.apache.hudi.examples.spark

import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD, PRECOMBINE_FIELD, RECORDKEY_FIELD, TABLE_TYPE}
import org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs
import org.apache.hudi.examples.quickstart.QuickstartUtils.getQuickstartWriteConfigs
import org.apache.hudi.client.SparkRDDWriteClient
import org.apache.hudi.client.common.HoodieSparkEngineContext
import org.apache.hudi.common.model.{HoodieAvroPayload, HoodieRecordPayload, HoodieTableType}
@@ -55,6 +55,7 @@ object HoodieMorCompactionJob {
    val dataGen = new HoodieExampleDataGenerator[HoodieAvroPayload]
    val tablePath = args(0)
    val tableName = args(1)

    insertData(spark, tablePath, tableName, dataGen, HoodieTableType.MERGE_ON_READ.name())
    updateData(spark, tablePath, tableName, dataGen, HoodieTableType.MERGE_ON_READ.name())
    val cfg = HoodieWriteConfig.newBuilder()
@@ -0,0 +1,115 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.examples.quickstart;

import java.io.File;
import java.nio.file.Paths;
import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.examples.common.HoodieExampleDataGenerator;
import org.apache.hudi.testutils.providers.SparkProvider;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.Utils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

public class TestHoodieSparkQuickstart implements SparkProvider {
  protected static transient HoodieSparkEngineContext context;

  private static transient SparkSession spark;
  private static transient SQLContext sqlContext;
  private static transient JavaSparkContext jsc;

  /**
   * An indicator of the initialization status.
   */
  protected boolean initialized = false;
  @TempDir
  protected java.nio.file.Path tempDir;

  private static final HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();

  @Override
  public SparkSession spark() {
    return spark;
  }

  @Override
  public SQLContext sqlContext() {
    return sqlContext;
  }

  @Override
  public JavaSparkContext jsc() {
    return jsc;
  }

  @Override
  public HoodieSparkEngineContext context() {
    return context;
  }

  public String basePath() {
    return tempDir.toAbsolutePath().toString();
  }

  public String tablePath(String tableName) {
    return Paths.get(basePath(), tableName).toString();
  }

  @BeforeEach
  public synchronized void runBeforeEach() {
    initialized = spark != null;
    if (!initialized) {
      SparkConf sparkConf = conf();
      SparkRDDWriteClient.registerClasses(sparkConf);
      HoodieReadClient.addHoodieSupport(sparkConf);
      spark = SparkSession.builder().config(sparkConf).getOrCreate();
      sqlContext = spark.sqlContext();
      jsc = new JavaSparkContext(spark.sparkContext());
      context = new HoodieSparkEngineContext(jsc);
    }
  }

  @Test
  public void testHoodieSparkQuickstart() {
    String tableName = "spark_quick_start";
    String tablePath = tablePath(tableName);

    try {
      HoodieSparkQuickstart.insertData(spark, jsc, tablePath, tableName, dataGen);
      HoodieSparkQuickstart.updateData(spark, jsc, tablePath, tableName, dataGen);

      HoodieSparkQuickstart.queryData(spark, jsc, tablePath, tableName, dataGen);
      HoodieSparkQuickstart.incrementalQuery(spark, tablePath, tableName);
      HoodieSparkQuickstart.pointInTimeQuery(spark, tablePath, tableName);

      HoodieSparkQuickstart.delete(spark, tablePath, tableName);
      HoodieSparkQuickstart.deleteByPartition(spark, tablePath, tableName);
    } finally {
      Utils.deleteRecursively(new File(tablePath));
    }
  }
}
@@ -0,0 +1,45 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.examples.quickstart;

import org.apache.hudi.exception.HoodieException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.junit.jupiter.MockitoExtension;

import static org.junit.jupiter.api.Assertions.assertEquals;

@ExtendWith(MockitoExtension.class)
public class TestQuickstartUtils {

  @Test
  public void testGenerateUpdates() throws Exception {
    QuickstartUtils.DataGenerator dataGenerator = new QuickstartUtils.DataGenerator();

    // Calling generateUpdates before any data has been written throws HoodieException
    assertEquals("Data must have been written before performing the update operation",
        Assertions.assertThrows(HoodieException.class, () -> {
          dataGenerator.generateUpdates(10);
        }).getMessage());

    // Once generateInserts has been executed, generateUpdates succeeds without an exception
    assertEquals(10, dataGenerator.generateInserts(10).size());
    assertEquals(10, dataGenerator.generateUpdates(10).size());
  }
}
@@ -0,0 +1,30 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache.hudi=DEBUG
log4j.logger.org.apache.hadoop.hbase=ERROR

# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
@@ -0,0 +1,31 @@
###
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=INFO, CONSOLE
log4j.logger.org.apache=INFO
log4j.logger.org.apache.hudi=DEBUG
log4j.logger.org.apache.hadoop.hbase=ERROR

# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
# CONSOLE uses PatternLayout.
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=INFO
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
@@ -25,204 +25,13 @@
  <modelVersion>4.0.0</modelVersion>

  <artifactId>hudi-examples</artifactId>
  <packaging>jar</packaging>
  <packaging>pom</packaging>

  <properties>
    <main.basedir>${project.parent.basedir}</main.basedir>
    <checkstyle.skip>true</checkstyle.skip>
  </properties>
  <modules>
    <module>hudi-examples-common</module>
    <module>hudi-examples-spark</module>
    <module>hudi-examples-flink</module>
    <module>hudi-examples-java</module>
  </modules>

  <build>
    <resources>
      <resource>
        <directory>src/main/resources</directory>
      </resource>
    </resources>

    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-dependency-plugin</artifactId>
        <executions>
          <execution>
            <id>copy-dependencies</id>
            <phase>prepare-package</phase>
            <goals>
              <goal>copy-dependencies</goal>
            </goals>
            <configuration>
              <outputDirectory>${project.build.directory}/lib</outputDirectory>
              <overWriteReleases>true</overWriteReleases>
              <overWriteSnapshots>true</overWriteSnapshots>
              <overWriteIfNewer>true</overWriteIfNewer>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <executions>
          <execution>
            <id>scala-compile-first</id>
            <phase>process-resources</phase>
            <goals>
              <goal>add-source</goal>
              <goal>compile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <executions>
          <execution>
            <phase>compile</phase>
            <goals>
              <goal>compile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>test-jar</goal>
            </goals>
            <phase>test-compile</phase>
          </execution>
        </executions>
        <configuration>
          <skip>false</skip>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.rat</groupId>
        <artifactId>apache-rat-plugin</artifactId>
      </plugin>
    </plugins>
  </build>

  <dependencies>

    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-common</artifactId>
      <version>${project.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-cli</artifactId>
      <version>${project.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-client-common</artifactId>
      <version>${project.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-java-client</artifactId>
      <version>${project.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-spark-client</artifactId>
      <version>${project.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-utilities_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-spark_${scala.binary.version}</artifactId>
      <version>${project.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-hadoop-mr</artifactId>
      <version>${project.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hudi</groupId>
      <artifactId>hudi-timeline-service</artifactId>
      <version>${project.version}</version>
    </dependency>

    <!-- Spark -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_${scala.binary.version}</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_${scala.binary.version}</artifactId>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-avro_${scala.binary.version}</artifactId>
    </dependency>

    <!-- Parquet -->
    <dependency>
      <groupId>org.apache.parquet</groupId>
      <artifactId>parquet-hadoop</artifactId>
      <version>${parquet.version}</version>
    </dependency>

    <!-- Avro -->
    <dependency>
      <groupId>org.apache.avro</groupId>
      <artifactId>avro</artifactId>
    </dependency>

    <dependency>
      <groupId>org.apache.parquet</groupId>
      <artifactId>parquet-avro</artifactId>
    </dependency>

    <!-- Hive -->
    <dependency>
      <groupId>${hive.groupid}</groupId>
      <artifactId>hive-common</artifactId>
    </dependency>
    <dependency>
      <groupId>${hive.groupid}</groupId>
      <artifactId>hive-exec</artifactId>
      <version>${hive.version}</version>
      <scope>provided</scope>
      <classifier>${hive.exec.classifier}</classifier>
      <exclusions>
        <exclusion>
          <groupId>javax.mail</groupId>
          <artifactId>mail</artifactId>
        </exclusion>
        <exclusion>
          <groupId>org.eclipse.jetty.aggregate</groupId>
          <artifactId>*</artifactId>
        </exclusion>
      </exclusions>
    </dependency>

  </dependencies>
</project>