[HUDI-251] Adds JDBC source support for DeltaStreamer (#2915)
As discussed in RFC-14, this change implements the first phase of JDBC incremental puller. It consists following changes: - JdbcSource: This class extends RowSource and implements fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) - SqlQueryBuilder: A simple utility class to build sql queries fluently. - Implements two modes of fetching: full and incremental. Full is a complete scan of RDBMS table. Incremental is delta since last checkpoint. Incremental mode falls back to full fetch in case of any exception.
This commit is contained in:
@@ -0,0 +1,146 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.utilities;
|
||||
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
|
||||
/**
|
||||
* SQL query builder. Current support for: SELECT, FROM, JOIN, ON, WHERE, ORDER BY, LIMIT clauses.
|
||||
*/
|
||||
public class SqlQueryBuilder {
|
||||
|
||||
private final StringBuilder sqlBuilder;
|
||||
|
||||
private SqlQueryBuilder(StringBuilder sqlBuilder) {
|
||||
this.sqlBuilder = sqlBuilder;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a SELECT query.
|
||||
*
|
||||
* @param columns The column names to select.
|
||||
* @return The new {@link SqlQueryBuilder} instance.
|
||||
*/
|
||||
public static SqlQueryBuilder select(String... columns) {
|
||||
if (columns == null || columns.length == 0) {
|
||||
throw new IllegalArgumentException("No columns provided with SELECT statement. Please mention column names or '*' to select all columns.");
|
||||
}
|
||||
StringBuilder sqlBuilder = new StringBuilder();
|
||||
sqlBuilder.append("select ");
|
||||
sqlBuilder.append(String.join(", ", columns));
|
||||
return new SqlQueryBuilder(sqlBuilder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends a FROM clause to a query.
|
||||
*
|
||||
* @param tables The table names to select from.
|
||||
* @return The {@link SqlQueryBuilder} instance.
|
||||
*/
|
||||
public SqlQueryBuilder from(String... tables) {
|
||||
if (tables == null || tables.length == 0) {
|
||||
throw new IllegalArgumentException("No table name provided with FROM clause. Please provide a table name to select from.");
|
||||
}
|
||||
sqlBuilder.append(" from ");
|
||||
sqlBuilder.append(String.join(", ", tables));
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends a JOIN clause to a query.
|
||||
*
|
||||
* @param table The table to join with.
|
||||
* @return The {@link SqlQueryBuilder} instance.
|
||||
*/
|
||||
public SqlQueryBuilder join(String table) {
|
||||
if (StringUtils.isNullOrEmpty(table)) {
|
||||
throw new IllegalArgumentException("No table name provided with JOIN clause. Please provide a table name to join with.");
|
||||
}
|
||||
sqlBuilder.append(" join ");
|
||||
sqlBuilder.append(table);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends an ON clause to a query.
|
||||
*
|
||||
* @param predicate The predicate to join on.
|
||||
* @return The {@link SqlQueryBuilder} instance.
|
||||
*/
|
||||
public SqlQueryBuilder on(String predicate) {
|
||||
if (StringUtils.isNullOrEmpty(predicate)) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
sqlBuilder.append(" on ");
|
||||
sqlBuilder.append(predicate);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends a WHERE clause to a query.
|
||||
*
|
||||
* @param predicate The predicate for WHERE clause.
|
||||
* @return The {@link SqlQueryBuilder} instance.
|
||||
*/
|
||||
public SqlQueryBuilder where(String predicate) {
|
||||
if (StringUtils.isNullOrEmpty(predicate)) {
|
||||
throw new IllegalArgumentException("No predicate provided with WHERE clause. Please provide a predicate to filter records.");
|
||||
}
|
||||
sqlBuilder.append(" where ");
|
||||
sqlBuilder.append(predicate);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends an ORDER BY clause to a query. By default, records are ordered in ascending order by the given column.
|
||||
* To order in descending order use DESC after the column name, e.g. queryBuilder.orderBy("update_time desc").
|
||||
*
|
||||
* @param columns Column names to order by.
|
||||
* @return The {@link SqlQueryBuilder} instance.
|
||||
*/
|
||||
public SqlQueryBuilder orderBy(String... columns) {
|
||||
if (columns == null || columns.length == 0) {
|
||||
throw new IllegalArgumentException("No columns provided with ORDER BY clause. Please provide a column name to order records.");
|
||||
}
|
||||
sqlBuilder.append(" order by ");
|
||||
sqlBuilder.append(String.join(", ", columns));
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends a "limit" clause to a query.
|
||||
*
|
||||
* @param count The limit count.
|
||||
* @return The {@link SqlQueryBuilder} instance.
|
||||
*/
|
||||
public SqlQueryBuilder limit(long count) {
|
||||
if (count < 0) {
|
||||
throw new IllegalArgumentException("Please provide a positive integer for the LIMIT clause.");
|
||||
}
|
||||
sqlBuilder.append(" limit ");
|
||||
sqlBuilder.append(count);
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return sqlBuilder.toString();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,343 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.utilities.sources;
|
||||
|
||||
import org.apache.hudi.DataSourceUtils;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.utilities.SqlQueryBuilder;
|
||||
import org.apache.hudi.utilities.schema.SchemaProvider;
|
||||
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Column;
|
||||
import org.apache.spark.sql.DataFrameReader;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.sql.functions;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.storage.StorageLevel;
|
||||
|
||||
import java.net.URI;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Reads data from RDBMS data sources.
|
||||
*/
|
||||
|
||||
public class JdbcSource extends RowSource {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(JdbcSource.class);
|
||||
private static final List<String> DB_LIMIT_CLAUSE = Arrays.asList("mysql", "postgresql", "h2");
|
||||
private static final String URI_JDBC_PREFIX = "jdbc:";
|
||||
|
||||
public JdbcSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, sparkSession, schemaProvider);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates all user properties and prepares the {@link DataFrameReader} to read from RDBMS.
|
||||
*
|
||||
* @param session The {@link SparkSession}.
|
||||
* @param properties The JDBC connection properties and data source options.
|
||||
* @return The {@link DataFrameReader} to read from RDBMS
|
||||
* @throws HoodieException
|
||||
*/
|
||||
private static DataFrameReader validatePropsAndGetDataFrameReader(final SparkSession session,
|
||||
final TypedProperties properties)
|
||||
throws HoodieException {
|
||||
DataFrameReader dataFrameReader;
|
||||
FSDataInputStream passwordFileStream = null;
|
||||
try {
|
||||
dataFrameReader = session.read().format("jdbc");
|
||||
dataFrameReader = dataFrameReader.option(Config.URL_PROP, properties.getString(Config.URL));
|
||||
dataFrameReader = dataFrameReader.option(Config.USER_PROP, properties.getString(Config.USER));
|
||||
dataFrameReader = dataFrameReader.option(Config.DRIVER_PROP, properties.getString(Config.DRIVER_CLASS));
|
||||
dataFrameReader = dataFrameReader
|
||||
.option(Config.RDBMS_TABLE_PROP, properties.getString(Config.RDBMS_TABLE_NAME));
|
||||
|
||||
if (properties.containsKey(Config.PASSWORD)) {
|
||||
LOG.info("Reading JDBC password from properties file....");
|
||||
dataFrameReader = dataFrameReader.option(Config.PASSWORD_PROP, properties.getString(Config.PASSWORD));
|
||||
} else if (properties.containsKey(Config.PASSWORD_FILE)
|
||||
&& !StringUtils.isNullOrEmpty(properties.getString(Config.PASSWORD_FILE))) {
|
||||
LOG.info(String.format("Reading JDBC password from password file %s", properties.getString(Config.PASSWORD_FILE)));
|
||||
FileSystem fileSystem = FileSystem.get(session.sparkContext().hadoopConfiguration());
|
||||
passwordFileStream = fileSystem.open(new Path(properties.getString(Config.PASSWORD_FILE)));
|
||||
byte[] bytes = new byte[passwordFileStream.available()];
|
||||
passwordFileStream.read(bytes);
|
||||
dataFrameReader = dataFrameReader.option(Config.PASSWORD_PROP, new String(bytes));
|
||||
} else {
|
||||
throw new IllegalArgumentException(String.format("JDBCSource needs either a %s or %s to connect to RDBMS "
|
||||
+ "datasource", Config.PASSWORD_FILE, Config.PASSWORD));
|
||||
}
|
||||
|
||||
addExtraJdbcOptions(properties, dataFrameReader);
|
||||
|
||||
if (properties.getBoolean(Config.IS_INCREMENTAL)) {
|
||||
DataSourceUtils.checkRequiredProperties(properties, Collections.singletonList(Config.INCREMENTAL_COLUMN));
|
||||
}
|
||||
return dataFrameReader;
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Failed to validate properties", e);
|
||||
} finally {
|
||||
IOUtils.closeStream(passwordFileStream);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Accepts spark JDBC options from the user in terms of EXTRA_OPTIONS adds them to {@link DataFrameReader} Example: In
|
||||
* a normal spark code you would do something like: session.read.format('jdbc') .option(fetchSize,1000)
|
||||
* .option(timestampFormat,"yyyy-mm-dd hh:mm:ss")
|
||||
* <p>
|
||||
* The way to pass these properties to HUDI is through the config file. Any property starting with
|
||||
* hoodie.deltastreamer.jdbc.extra.options. will be added.
|
||||
* <p>
|
||||
* Example: hoodie.deltastreamer.jdbc.extra.options.fetchSize=100
|
||||
* hoodie.deltastreamer.jdbc.extra.options.upperBound=1
|
||||
* hoodie.deltastreamer.jdbc.extra.options.lowerBound=100
|
||||
*
|
||||
* @param properties The JDBC connection properties and data source options.
|
||||
* @param dataFrameReader The {@link DataFrameReader} to which data source options will be added.
|
||||
*/
|
||||
private static void addExtraJdbcOptions(TypedProperties properties, DataFrameReader dataFrameReader) {
|
||||
Set<Object> objects = properties.keySet();
|
||||
for (Object property : objects) {
|
||||
String prop = property.toString();
|
||||
if (prop.startsWith(Config.EXTRA_OPTIONS)) {
|
||||
String key = String.join("", prop.split(Config.EXTRA_OPTIONS));
|
||||
String value = properties.getString(prop);
|
||||
if (!StringUtils.isNullOrEmpty(value)) {
|
||||
LOG.info(String.format("Adding %s -> %s to jdbc options", key, value));
|
||||
dataFrameReader.option(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) throws HoodieException {
|
||||
try {
|
||||
DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.URL, Config.DRIVER_CLASS, Config.USER, Config.RDBMS_TABLE_NAME, Config.IS_INCREMENTAL));
|
||||
return fetch(lastCkptStr, sourceLimit);
|
||||
} catch (HoodieException e) {
|
||||
LOG.error("Exception while running JDBCSource ", e);
|
||||
throw e;
|
||||
} catch (Exception e) {
|
||||
LOG.error("Exception while running JDBCSource ", e);
|
||||
throw new HoodieException("Error fetching next batch from JDBC source. Last checkpoint: " + lastCkptStr.orElse(null), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decide to do a full RDBMS table scan or an incremental scan based on the lastCkptStr. If previous checkpoint
|
||||
* value exists then we do an incremental scan with a PPD query or else we do a full scan. In certain cases where the
|
||||
* incremental query fails, we fallback to a full scan.
|
||||
*
|
||||
* @param lastCkptStr Last checkpoint.
|
||||
* @return The pair of {@link Dataset} and current checkpoint.
|
||||
*/
|
||||
private Pair<Option<Dataset<Row>>, String> fetch(Option<String> lastCkptStr, long sourceLimit) {
|
||||
Dataset<Row> dataset;
|
||||
if (lastCkptStr.isPresent() && !StringUtils.isNullOrEmpty(lastCkptStr.get())) {
|
||||
dataset = incrementalFetch(lastCkptStr, sourceLimit);
|
||||
} else {
|
||||
LOG.info("No checkpoint references found. Doing a full rdbms table fetch");
|
||||
dataset = fullFetch(sourceLimit);
|
||||
}
|
||||
dataset.persist(StorageLevel.fromString(props.getString(Config.STORAGE_LEVEL, "MEMORY_AND_DISK_SER")));
|
||||
boolean isIncremental = props.getBoolean(Config.IS_INCREMENTAL);
|
||||
Pair<Option<Dataset<Row>>, String> pair = Pair.of(Option.of(dataset), checkpoint(dataset, isIncremental, lastCkptStr));
|
||||
dataset.unpersist();
|
||||
return pair;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does an incremental scan with PPQ query prepared on the bases of previous checkpoint.
|
||||
*
|
||||
* @param lastCheckpoint Last checkpoint.
|
||||
* Note that the records fetched will be exclusive of the last checkpoint (i.e. incremental column value > lastCheckpoint).
|
||||
* @return The {@link Dataset} after incremental fetch from RDBMS.
|
||||
*/
|
||||
private Dataset<Row> incrementalFetch(Option<String> lastCheckpoint, long sourceLimit) {
|
||||
try {
|
||||
final String ppdQuery = "(%s) rdbms_table";
|
||||
final SqlQueryBuilder queryBuilder = SqlQueryBuilder.select("*")
|
||||
.from(props.getString(Config.RDBMS_TABLE_NAME))
|
||||
.where(String.format(" %s > '%s'", props.getString(Config.INCREMENTAL_COLUMN), lastCheckpoint.get()));
|
||||
|
||||
if (sourceLimit > 0) {
|
||||
URI jdbcURI = URI.create(props.getString(Config.URL).substring(URI_JDBC_PREFIX.length()));
|
||||
if (DB_LIMIT_CLAUSE.contains(jdbcURI.getScheme())) {
|
||||
queryBuilder.orderBy(props.getString(Config.INCREMENTAL_COLUMN)).limit(sourceLimit);
|
||||
}
|
||||
}
|
||||
String query = String.format(ppdQuery, queryBuilder.toString());
|
||||
LOG.info("PPD QUERY: " + query);
|
||||
LOG.info(String.format("Referenced last checkpoint and prepared new predicate pushdown query for jdbc pull %s", query));
|
||||
return validatePropsAndGetDataFrameReader(sparkSession, props).option(Config.RDBMS_TABLE_PROP, query).load();
|
||||
} catch (Exception e) {
|
||||
LOG.error("Error while performing an incremental fetch. Not all database support the PPD query we generate to do an incremental scan", e);
|
||||
if (props.containsKey(Config.FALLBACK_TO_FULL_FETCH) && props.getBoolean(Config.FALLBACK_TO_FULL_FETCH)) {
|
||||
LOG.warn("Falling back to full scan.");
|
||||
return fullFetch(sourceLimit);
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Does a full scan on the RDBMS data source.
|
||||
*
|
||||
* @return The {@link Dataset} after running full scan.
|
||||
*/
|
||||
private Dataset<Row> fullFetch(long sourceLimit) {
|
||||
final String ppdQuery = "(%s) rdbms_table";
|
||||
final SqlQueryBuilder queryBuilder = SqlQueryBuilder.select("*")
|
||||
.from(props.getString(Config.RDBMS_TABLE_NAME));
|
||||
if (sourceLimit > 0) {
|
||||
URI jdbcURI = URI.create(props.getString(Config.URL).substring(URI_JDBC_PREFIX.length()));
|
||||
if (DB_LIMIT_CLAUSE.contains(jdbcURI.getScheme())) {
|
||||
if (props.containsKey(Config.INCREMENTAL_COLUMN)) {
|
||||
queryBuilder.orderBy(props.getString(Config.INCREMENTAL_COLUMN)).limit(sourceLimit);
|
||||
} else {
|
||||
queryBuilder.limit(sourceLimit);
|
||||
}
|
||||
}
|
||||
}
|
||||
String query = String.format(ppdQuery, queryBuilder.toString());
|
||||
return validatePropsAndGetDataFrameReader(sparkSession, props).option(Config.RDBMS_TABLE_PROP, query).load();
|
||||
}
|
||||
|
||||
private String checkpoint(Dataset<Row> rowDataset, boolean isIncremental, Option<String> lastCkptStr) {
|
||||
try {
|
||||
if (isIncremental) {
|
||||
Column incrementalColumn = rowDataset.col(props.getString(Config.INCREMENTAL_COLUMN));
|
||||
final String max = rowDataset.agg(functions.max(incrementalColumn).cast(DataTypes.StringType)).first().getString(0);
|
||||
LOG.info(String.format("Checkpointing column %s with value: %s ", incrementalColumn, max));
|
||||
if (max != null) {
|
||||
return max;
|
||||
}
|
||||
return lastCkptStr.isPresent() && !StringUtils.isNullOrEmpty(lastCkptStr.get()) ? lastCkptStr.get() : StringUtils.EMPTY_STRING;
|
||||
} else {
|
||||
return StringUtils.EMPTY_STRING;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
LOG.error("Failed to checkpoint");
|
||||
throw new HoodieException("Failed to checkpoint. Last checkpoint: " + lastCkptStr.orElse(null), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inner class with config keys.
|
||||
*/
|
||||
protected static class Config {
|
||||
|
||||
/**
|
||||
* {@value #URL} is the jdbc url for the Hoodie datasource.
|
||||
*/
|
||||
private static final String URL = "hoodie.deltastreamer.jdbc.url";
|
||||
|
||||
private static final String URL_PROP = "url";
|
||||
|
||||
/**
|
||||
* {@value #USER} is the username used for JDBC connection.
|
||||
*/
|
||||
private static final String USER = "hoodie.deltastreamer.jdbc.user";
|
||||
|
||||
/**
|
||||
* {@value #USER_PROP} used internally to build jdbc params.
|
||||
*/
|
||||
private static final String USER_PROP = "user";
|
||||
|
||||
/**
|
||||
* {@value #PASSWORD} is the password used for JDBC connection.
|
||||
*/
|
||||
private static final String PASSWORD = "hoodie.deltastreamer.jdbc.password";
|
||||
|
||||
/**
|
||||
* {@value #PASSWORD_FILE} is the base-path for the JDBC password file.
|
||||
*/
|
||||
private static final String PASSWORD_FILE = "hoodie.deltastreamer.jdbc.password.file";
|
||||
|
||||
/**
|
||||
* {@value #PASSWORD_PROP} used internally to build jdbc params.
|
||||
*/
|
||||
private static final String PASSWORD_PROP = "password";
|
||||
|
||||
/**
|
||||
* {@value #DRIVER_CLASS} used for JDBC connection.
|
||||
*/
|
||||
private static final String DRIVER_CLASS = "hoodie.deltastreamer.jdbc.driver.class";
|
||||
|
||||
/**
|
||||
* {@value #DRIVER_PROP} used internally to build jdbc params.
|
||||
*/
|
||||
private static final String DRIVER_PROP = "driver";
|
||||
|
||||
/**
|
||||
* {@value #RDBMS_TABLE_NAME} RDBMS table to pull.
|
||||
*/
|
||||
private static final String RDBMS_TABLE_NAME = "hoodie.deltastreamer.jdbc.table.name";
|
||||
|
||||
/**
|
||||
* {@value #RDBMS_TABLE_PROP} used internally for jdbc.
|
||||
*/
|
||||
private static final String RDBMS_TABLE_PROP = "dbtable";
|
||||
|
||||
/**
|
||||
* {@value #INCREMENTAL_COLUMN} if ran in incremental mode, this field will be used to pull new data incrementally.
|
||||
*/
|
||||
private static final String INCREMENTAL_COLUMN = "hoodie.deltastreamer.jdbc.table.incr.column.name";
|
||||
|
||||
/**
|
||||
* {@value #IS_INCREMENTAL} will the JDBC source do an incremental pull?
|
||||
*/
|
||||
private static final String IS_INCREMENTAL = "hoodie.deltastreamer.jdbc.incr.pull";
|
||||
|
||||
/**
|
||||
* {@value #EXTRA_OPTIONS} used to set any extra options the user specifies for jdbc.
|
||||
*/
|
||||
private static final String EXTRA_OPTIONS = "hoodie.deltastreamer.jdbc.extra.options.";
|
||||
|
||||
/**
|
||||
* {@value #STORAGE_LEVEL} is used to control the persistence level. Default value: MEMORY_AND_DISK_SER.
|
||||
*/
|
||||
private static final String STORAGE_LEVEL = "hoodie.deltastreamer.jdbc.storage.level";
|
||||
|
||||
/**
|
||||
* {@value #FALLBACK_TO_FULL_FETCH} is a boolean, which if set true, makes incremental fetch to fallback to full fetch in case of any error.
|
||||
*/
|
||||
private static final String FALLBACK_TO_FULL_FETCH = "hoodie.deltastreamer.jdbc.incr.fallback.to.full.fetch";
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.utilities;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
/**
|
||||
* Tests {@link SqlQueryBuilder}.
|
||||
*/
|
||||
public class TestSqlQueryBuilder {
|
||||
|
||||
@Test
|
||||
public void testSelect() {
|
||||
String sql = SqlQueryBuilder.select("id", "rider", "time")
|
||||
.from("trips")
|
||||
.join("users").on("trips.rider = users.id")
|
||||
.where("(trips.time > 100 or trips.time < 200)")
|
||||
.orderBy("id", "time")
|
||||
.limit(10).toString();
|
||||
|
||||
assertEquals("select id, rider, time from trips "
|
||||
+ "join users on trips.rider = users.id "
|
||||
+ "where (trips.time > 100 or trips.time < 200) "
|
||||
+ "order by id, time "
|
||||
+ "limit 10", sql);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIncorrectQueries() {
|
||||
assertThrows(IllegalArgumentException.class, () -> SqlQueryBuilder.select().toString());
|
||||
|
||||
assertThrows(IllegalArgumentException.class, () -> SqlQueryBuilder.select("*").from().toString());
|
||||
|
||||
assertThrows(IllegalArgumentException.class, () -> SqlQueryBuilder.select("id").from("trips").where("").toString());
|
||||
|
||||
assertThrows(IllegalArgumentException.class, () -> SqlQueryBuilder.select("id").from("trips").join("").toString());
|
||||
|
||||
assertThrows(IllegalArgumentException.class, () -> SqlQueryBuilder.select("id").from("trips").join("riders").on("").toString());
|
||||
|
||||
assertThrows(IllegalArgumentException.class, () -> SqlQueryBuilder.select("id").from("trips").join("riders").where("id > 0").orderBy().toString());
|
||||
|
||||
assertThrows(IllegalArgumentException.class, () -> SqlQueryBuilder.select("id").from("trips").join("riders").where("id > 0").orderBy("id").limit(-1).toString());
|
||||
}
|
||||
}
|
||||
@@ -18,6 +18,8 @@
|
||||
|
||||
package org.apache.hudi.utilities.functional;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.util.ConcurrentModificationException;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import org.apache.hudi.DataSourceWriteOptions;
|
||||
@@ -50,10 +52,12 @@ import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
|
||||
import org.apache.hudi.utilities.sources.CsvDFSSource;
|
||||
import org.apache.hudi.utilities.sources.HoodieIncrSource;
|
||||
import org.apache.hudi.utilities.sources.InputBatch;
|
||||
import org.apache.hudi.utilities.sources.JdbcSource;
|
||||
import org.apache.hudi.utilities.sources.JsonKafkaSource;
|
||||
import org.apache.hudi.utilities.sources.ParquetDFSSource;
|
||||
import org.apache.hudi.utilities.sources.TestDataSource;
|
||||
import org.apache.hudi.utilities.sources.helpers.KafkaOffsetGen.Config;
|
||||
import org.apache.hudi.utilities.testutils.JdbcTestUtils;
|
||||
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
|
||||
import org.apache.hudi.utilities.testutils.sources.DistributedTestDataSource;
|
||||
import org.apache.hudi.utilities.testutils.sources.config.SourceConfigs;
|
||||
@@ -109,6 +113,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
|
||||
/**
|
||||
* Basic tests against {@link HoodieDeltaStreamer}, by issuing bulk_inserts, upserts, inserts. Check counts at the end.
|
||||
@@ -1594,6 +1599,45 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
||||
testCsvDFSSource(false, '\t', true, Collections.singletonList(TripsWithDistanceTransformer.class.getName()));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJdbcSourceIncrementalFetchInContinuousMode() {
|
||||
try (Connection connection = DriverManager.getConnection("jdbc:h2:mem:test_mem", "test", "jdbc")) {
|
||||
TypedProperties props = new TypedProperties();
|
||||
props.setProperty("hoodie.deltastreamer.jdbc.url", "jdbc:h2:mem:test_mem");
|
||||
props.setProperty("hoodie.deltastreamer.jdbc.driver.class", "org.h2.Driver");
|
||||
props.setProperty("hoodie.deltastreamer.jdbc.user", "test");
|
||||
props.setProperty("hoodie.deltastreamer.jdbc.password", "jdbc");
|
||||
props.setProperty("hoodie.deltastreamer.jdbc.table.name", "triprec");
|
||||
props.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true");
|
||||
props.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "id");
|
||||
|
||||
props.setProperty("hoodie.datasource.write.keygenerator.class", SimpleKeyGenerator.class.getName());
|
||||
props.setProperty("hoodie.datasource.write.recordkey.field", "ID");
|
||||
props.setProperty("hoodie.datasource.write.partitionpath.field", "not_there");
|
||||
|
||||
UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-jdbc-source.properties");
|
||||
|
||||
int numRecords = 1000;
|
||||
int sourceLimit = 100;
|
||||
String tableBasePath = dfsBasePath + "/triprec";
|
||||
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, JdbcSource.class.getName(),
|
||||
null, "test-jdbc-source.properties", false,
|
||||
false, sourceLimit, false, null, null, "timestamp");
|
||||
cfg.continuousMode = true;
|
||||
// Add 1000 records
|
||||
JdbcTestUtils.clearAndInsert("000", numRecords, connection, new HoodieTestDataGenerator(), props);
|
||||
|
||||
HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(cfg, jsc);
|
||||
deltaStreamerTestRunner(deltaStreamer, cfg, (r) -> {
|
||||
TestHelpers.assertAtleastNCompactionCommits(numRecords / sourceLimit + ((numRecords % sourceLimit == 0) ? 0 : 1), tableBasePath, dfs);
|
||||
TestHelpers.assertRecordCount(numRecords, tableBasePath + "/*/*.parquet", sqlContext);
|
||||
return true;
|
||||
});
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* UDF to calculate Haversine distance.
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,443 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.utilities.sources;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.sql.Column;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.functions;
|
||||
import org.apache.spark.sql.types.DataTypes;
|
||||
import org.apache.spark.storage.StorageLevel;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.SQLException;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.utilities.testutils.JdbcTestUtils.clearAndInsert;
|
||||
import static org.apache.hudi.utilities.testutils.JdbcTestUtils.close;
|
||||
import static org.apache.hudi.utilities.testutils.JdbcTestUtils.count;
|
||||
import static org.apache.hudi.utilities.testutils.JdbcTestUtils.insert;
|
||||
import static org.apache.hudi.utilities.testutils.JdbcTestUtils.update;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
|
||||
/**
|
||||
* Tests {@link JdbcSource}.
|
||||
*/
|
||||
public class TestJdbcSource extends UtilitiesTestBase {
|
||||
|
||||
private static final TypedProperties PROPS = new TypedProperties();
|
||||
private static final HoodieTestDataGenerator DATA_GENERATOR = new HoodieTestDataGenerator();
|
||||
private static Connection connection;
|
||||
|
||||
@BeforeEach
|
||||
public void setup() throws Exception {
|
||||
super.setup();
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.url", "jdbc:h2:mem:test_mem");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.driver.class", "org.h2.Driver");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.user", "test");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.password", "jdbc");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.name", "triprec");
|
||||
connection = DriverManager.getConnection("jdbc:h2:mem:test_mem", "test", "jdbc");
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void teardown() throws Exception {
|
||||
super.teardown();
|
||||
close(connection);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSingleCommit() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert");
|
||||
|
||||
try {
|
||||
int numRecords = 100;
|
||||
String commitTime = "000";
|
||||
|
||||
// Insert 100 records with commit time
|
||||
clearAndInsert(commitTime, numRecords, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Validate if we have specified records in db
|
||||
assertEquals(numRecords, count(connection, "triprec"));
|
||||
|
||||
// Start JdbcSource
|
||||
Dataset<Row> rowDataset = runSource(Option.empty(), numRecords).getBatch().get();
|
||||
assertEquals(numRecords, rowDataset.count());
|
||||
} catch (SQLException e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInsertAndUpdate() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert");
|
||||
|
||||
try {
|
||||
final String commitTime = "000";
|
||||
final int numRecords = 100;
|
||||
|
||||
// Add 100 records. Update half of them with commit time "007".
|
||||
update("007",
|
||||
clearAndInsert(commitTime, numRecords, connection, DATA_GENERATOR, PROPS)
|
||||
.stream()
|
||||
.limit(50)
|
||||
.collect(Collectors.toList()),
|
||||
connection, DATA_GENERATOR, PROPS
|
||||
);
|
||||
// Check if database has 100 records
|
||||
assertEquals(numRecords, count(connection, "triprec"));
|
||||
|
||||
// Start JdbcSource
|
||||
Dataset<Row> rowDataset = runSource(Option.empty(), 100).getBatch().get();
|
||||
assertEquals(100, rowDataset.count());
|
||||
|
||||
Dataset<Row> firstCommit = rowDataset.where("commit_time=000");
|
||||
assertEquals(50, firstCommit.count());
|
||||
|
||||
Dataset<Row> secondCommit = rowDataset.where("commit_time=007");
|
||||
assertEquals(50, secondCommit.count());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTwoCommits() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert");
|
||||
|
||||
try {
|
||||
// Add 10 records with commit time "000"
|
||||
clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start JdbcSource
|
||||
Dataset<Row> rowDataset = runSource(Option.empty(), 10).getBatch().get();
|
||||
assertEquals(10, rowDataset.where("commit_time=000").count());
|
||||
|
||||
// Add 10 records with commit time 001
|
||||
insert("001", 5, connection, DATA_GENERATOR, PROPS);
|
||||
rowDataset = runSource(Option.empty(), 15).getBatch().get();
|
||||
assertEquals(15, rowDataset.count());
|
||||
assertEquals(5, rowDataset.where("commit_time=001").count());
|
||||
assertEquals(10, rowDataset.where("commit_time=000").count());
|
||||
|
||||
// Start second commit and check if all records are pulled
|
||||
rowDataset = runSource(Option.empty(), 15).getBatch().get();
|
||||
assertEquals(15, rowDataset.count());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIncrementalFetchWithCommitTime() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert");
|
||||
|
||||
try {
|
||||
// Add 10 records with commit time "000"
|
||||
clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start JdbcSource
|
||||
InputBatch<Dataset<Row>> batch = runSource(Option.empty(), 10);
|
||||
Dataset<Row> rowDataset = batch.getBatch().get();
|
||||
assertEquals(10, rowDataset.count());
|
||||
|
||||
// Add 10 records with commit time "001"
|
||||
insert("001", 10, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start incremental scan
|
||||
rowDataset = runSource(Option.of(batch.getCheckpointForNextBatch()), 10).getBatch().get();
|
||||
assertEquals(10, rowDataset.count());
|
||||
assertEquals(10, rowDataset.where("commit_time=001").count());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIncrementalFetchWithNoMatchingRows() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert");
|
||||
|
||||
try {
|
||||
// Add 10 records with commit time "000"
|
||||
clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start JdbcSource
|
||||
InputBatch<Dataset<Row>> batch = runSource(Option.empty(), 10);
|
||||
Dataset<Row> rowDataset = batch.getBatch().get();
|
||||
assertEquals(10, rowDataset.count());
|
||||
|
||||
// Start incremental scan
|
||||
rowDataset = runSource(Option.of(batch.getCheckpointForNextBatch()), 10).getBatch().get();
|
||||
assertEquals(0, rowDataset.count());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIncrementalFetchWhenTableRecordsMoreThanSourceLimit() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "id");
|
||||
|
||||
try {
|
||||
// Add 100 records with commit time "000"
|
||||
clearAndInsert("000", 100, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start JdbcSource
|
||||
InputBatch<Dataset<Row>> batch = runSource(Option.empty(), 100);
|
||||
Dataset<Row> rowDataset = batch.getBatch().get();
|
||||
assertEquals(100, rowDataset.count());
|
||||
|
||||
// Add 100 records with commit time "001"
|
||||
insert("001", 100, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start incremental scan. Now there are 100 more records but with sourceLimit set to 60, only fetch 60 records should be fetched.
|
||||
// Those 50 records should be of the commit_time=001 because records with commit_time=000 have already been processed.
|
||||
batch = runSource(Option.of(batch.getCheckpointForNextBatch()), 60);
|
||||
rowDataset = batch.getBatch().get();
|
||||
assertEquals(60, rowDataset.count());
|
||||
assertEquals(60, rowDataset.where("commit_time=001").count());
|
||||
// No more records added, but sourceLimit is now set to 75. Still, only the remaining 40 records should be fetched.
|
||||
rowDataset = runSource(Option.of(batch.getCheckpointForNextBatch()), 75).getBatch().get();
|
||||
assertEquals(40, rowDataset.count());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIncrementalFetchWhenLastCheckpointMoreThanTableRecords() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "id");
|
||||
|
||||
try {
|
||||
// Add 100 records with commit time "000"
|
||||
clearAndInsert("000", 100, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start JdbcSource
|
||||
InputBatch<Dataset<Row>> batch = runSource(Option.empty(), 100);
|
||||
Dataset<Row> rowDataset = batch.getBatch().get();
|
||||
assertEquals(100, rowDataset.count());
|
||||
assertEquals("100", batch.getCheckpointForNextBatch());
|
||||
|
||||
// Add 100 records with commit time "001"
|
||||
insert("001", 100, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start incremental scan. With checkpoint greater than the number of records, there should not be any dataset to fetch.
|
||||
batch = runSource(Option.of("200"), 50);
|
||||
rowDataset = batch.getBatch().get();
|
||||
assertEquals(0, rowDataset.count());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIncrementalFetchFallbackToFullFetchWhenError() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "true");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert");
|
||||
|
||||
try {
|
||||
// Add 10 records with commit time "000"
|
||||
clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start JdbcSource
|
||||
InputBatch<Dataset<Row>> batch = runSource(Option.empty(), 10);
|
||||
Dataset<Row> rowDataset = batch.getBatch().get();
|
||||
assertEquals(10, rowDataset.count());
|
||||
|
||||
// Add 10 records with commit time "001"
|
||||
insert("001", 10, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "dummy_col");
|
||||
assertThrows(HoodieException.class, () -> {
|
||||
// Start incremental scan with a dummy column that does not exist.
|
||||
// This will throw an exception as the default behavior is to not fallback to full fetch.
|
||||
runSource(Option.of(batch.getCheckpointForNextBatch()), -1);
|
||||
});
|
||||
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.fallback.to.full.fetch", "true");
|
||||
|
||||
// Start incremental scan with a dummy column that does not exist.
|
||||
// This will fallback to full fetch mode but still throw an exception checkpointing will fail.
|
||||
Exception exception = assertThrows(HoodieException.class, () -> {
|
||||
runSource(Option.of(batch.getCheckpointForNextBatch()), -1);
|
||||
});
|
||||
assertTrue(exception.getMessage().contains("Failed to checkpoint"));
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullFetchWithCommitTime() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false");
|
||||
|
||||
try {
|
||||
// Add 10 records with commit time "000"
|
||||
clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start JdbcSource
|
||||
Dataset<Row> rowDataset = runSource(Option.empty(), 10).getBatch().get();
|
||||
assertEquals(10, rowDataset.count());
|
||||
// Add 10 records with commit time "001"
|
||||
insert("001", 10, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start full fetch
|
||||
rowDataset = runSource(Option.empty(), 20).getBatch().get();
|
||||
assertEquals(20, rowDataset.count());
|
||||
assertEquals(10, rowDataset.where("commit_time=000").count());
|
||||
assertEquals(10, rowDataset.where("commit_time=001").count());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullFetchWithCheckpoint() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.table.incr.column.name", "last_insert");
|
||||
|
||||
try {
|
||||
// Add 10 records with commit time "000"
|
||||
clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start JdbcSource
|
||||
InputBatch<Dataset<Row>> batch = runSource(Option.empty(), 10);
|
||||
Dataset<Row> rowDataset = batch.getBatch().get();
|
||||
assertEquals(10, rowDataset.count());
|
||||
assertEquals("", batch.getCheckpointForNextBatch());
|
||||
|
||||
// Get max of incremental column
|
||||
Column incrementalColumn = rowDataset
|
||||
.col(PROPS.getString("hoodie.deltastreamer.jdbc.table.incr.column.name"));
|
||||
final String max = rowDataset.agg(functions.max(incrementalColumn).cast(DataTypes.StringType)).first()
|
||||
.getString(0);
|
||||
|
||||
// Add 10 records with commit time "001"
|
||||
insert("001", 10, connection, DATA_GENERATOR, PROPS);
|
||||
|
||||
// Start incremental scan
|
||||
rowDataset = runSource(Option.of(max), 10).getBatch().get();
|
||||
assertEquals(10, rowDataset.count());
|
||||
assertEquals(10, rowDataset.where("commit_time=001").count());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSourceWithPasswordOnFs() {
|
||||
try {
|
||||
// Write secret string to fs in a file
|
||||
writeSecretToFs();
|
||||
// Remove secret string from props
|
||||
PROPS.remove("hoodie.deltastreamer.jdbc.password");
|
||||
// Set property to read secret from fs file
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.password.file", "file:///tmp/hudi/config/secret");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false");
|
||||
// Add 10 records with commit time 000
|
||||
clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS);
|
||||
Dataset<Row> rowDataset = runSource(Option.empty(), 10).getBatch().get();
|
||||
assertEquals(10, rowDataset.count());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSourceWithNoPasswordThrowsException() {
|
||||
assertThrows(HoodieException.class, () -> {
|
||||
// Write secret string to fs in a file
|
||||
writeSecretToFs();
|
||||
// Remove secret string from props
|
||||
PROPS.remove("hoodie.deltastreamer.jdbc.password");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false");
|
||||
// Add 10 records with commit time 000
|
||||
clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS);
|
||||
runSource(Option.empty(), 10);
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSourceWithExtraOptions() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.extra.options.fetchsize", "10");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false");
|
||||
PROPS.remove("hoodie.deltastreamer.jdbc.table.incr.column.name");
|
||||
try {
|
||||
// Add 20 records with commit time 000
|
||||
clearAndInsert("000", 20, connection, DATA_GENERATOR, PROPS);
|
||||
Dataset<Row> rowDataset = runSource(Option.empty(), 10).getBatch().get();
|
||||
assertEquals(10, rowDataset.count());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSourceWithStorageLevel() {
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.storage.level", "NONE");
|
||||
PROPS.setProperty("hoodie.deltastreamer.jdbc.incr.pull", "false");
|
||||
try {
|
||||
// Add 10 records with commit time 000
|
||||
clearAndInsert("000", 10, connection, DATA_GENERATOR, PROPS);
|
||||
Dataset<Row> rowDataset = runSource(Option.empty(), 10).getBatch().get();
|
||||
assertEquals(10, rowDataset.count());
|
||||
assertEquals(StorageLevel.NONE(), rowDataset.storageLevel());
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
private void writeSecretToFs() throws IOException {
|
||||
FileSystem fs = FileSystem.get(new Configuration());
|
||||
FSDataOutputStream outputStream = fs.create(new Path("file:///tmp/hudi/config/secret"));
|
||||
outputStream.writeBytes("jdbc");
|
||||
outputStream.close();
|
||||
}
|
||||
|
||||
private InputBatch<Dataset<Row>> runSource(Option<String> lastCkptStr, long sourceLimit) {
|
||||
Source<Dataset<Row>> jdbcSource = new JdbcSource(PROPS, jsc, sparkSession, null);
|
||||
return jdbcSource.fetchNewData(lastCkptStr, sourceLimit);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,195 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.utilities.testutils;
|
||||
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.Connection;
|
||||
import java.sql.PreparedStatement;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Helper class used in testing {@link org.apache.hudi.utilities.sources.JdbcSource}.
|
||||
*/
|
||||
public class JdbcTestUtils {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(JdbcTestUtils.class);
|
||||
|
||||
public static List<HoodieRecord> clearAndInsert(String commitTime, int numRecords, Connection connection, HoodieTestDataGenerator dataGenerator, TypedProperties props)
|
||||
throws SQLException {
|
||||
execute(connection, "DROP TABLE triprec", "Table does not exists");
|
||||
execute(connection, "CREATE TABLE triprec ("
|
||||
+ "id INT NOT NULL AUTO_INCREMENT(1, 1),"
|
||||
+ "commit_time VARCHAR(50),"
|
||||
+ "_row_key VARCHAR(50),"
|
||||
+ "rider VARCHAR(50),"
|
||||
+ "driver VARCHAR(50),"
|
||||
+ "begin_lat DOUBLE PRECISION,"
|
||||
+ "begin_lon DOUBLE PRECISION,"
|
||||
+ "end_lat DOUBLE PRECISION,"
|
||||
+ "end_lon DOUBLE PRECISION,"
|
||||
+ "fare DOUBLE PRECISION,"
|
||||
+ "last_insert TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP)", "Table already exists");
|
||||
|
||||
return insert(commitTime, numRecords, connection, dataGenerator, props);
|
||||
}
|
||||
|
||||
public static List<HoodieRecord> insert(String commitTime, int numRecords, Connection connection, HoodieTestDataGenerator dataGenerator, TypedProperties props) throws SQLException {
|
||||
PreparedStatement insertStatement =
|
||||
connection.prepareStatement("INSERT INTO triprec ("
|
||||
+ "commit_time,"
|
||||
+ "_row_key,"
|
||||
+ "rider,"
|
||||
+ "driver,"
|
||||
+ "begin_lat,"
|
||||
+ "begin_lon,"
|
||||
+ "end_lat,"
|
||||
+ "end_lon,"
|
||||
+ "fare) "
|
||||
+ "values(?,?,?,?,?,?,?,?,?)");
|
||||
List<HoodieRecord> hoodieRecords = dataGenerator.generateInserts(commitTime, numRecords);
|
||||
|
||||
hoodieRecords
|
||||
.stream()
|
||||
.map(r -> {
|
||||
try {
|
||||
return ((GenericRecord) r.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA, props).get());
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(record -> {
|
||||
try {
|
||||
insertStatement.setString(1, commitTime);
|
||||
insertStatement.setString(2, record.get("_row_key").toString());
|
||||
insertStatement.setString(3, record.get("rider").toString());
|
||||
insertStatement.setString(4, record.get("driver").toString());
|
||||
insertStatement.setDouble(5, Double.parseDouble(record.get("begin_lat").toString()));
|
||||
insertStatement.setDouble(6, Double.parseDouble(record.get("begin_lon").toString()));
|
||||
insertStatement.setDouble(7, Double.parseDouble(record.get("end_lat").toString()));
|
||||
insertStatement.setDouble(8, Double.parseDouble(record.get("end_lon").toString()));
|
||||
insertStatement.setDouble(9, Double.parseDouble(((GenericRecord) record.get("fare")).get("amount").toString()));
|
||||
insertStatement.addBatch();
|
||||
} catch (SQLException e) {
|
||||
LOG.warn(e.getMessage());
|
||||
}
|
||||
});
|
||||
insertStatement.executeBatch();
|
||||
close(insertStatement);
|
||||
return hoodieRecords;
|
||||
}
|
||||
|
||||
public static List<HoodieRecord> update(String commitTime, List<HoodieRecord> inserts, Connection connection, HoodieTestDataGenerator dataGenerator, TypedProperties props)
|
||||
throws SQLException, IOException {
|
||||
PreparedStatement updateStatement =
|
||||
connection.prepareStatement("UPDATE triprec set commit_time=?,"
|
||||
+ "_row_key=?,"
|
||||
+ "rider=?,"
|
||||
+ "driver=?,"
|
||||
+ "begin_lat=?,"
|
||||
+ "begin_lon=?,"
|
||||
+ "end_lat=?,"
|
||||
+ "end_lon=?,"
|
||||
+ "fare=?"
|
||||
+ "where _row_key=?");
|
||||
|
||||
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates(commitTime, inserts);
|
||||
updateRecords.stream().map(m -> {
|
||||
try {
|
||||
return m.getData().getInsertValue(HoodieTestDataGenerator.AVRO_SCHEMA, props).get();
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}).filter(Objects::nonNull)
|
||||
.map(r -> ((GenericRecord) r))
|
||||
.sequential()
|
||||
.forEach(r -> {
|
||||
try {
|
||||
updateStatement.setString(1, commitTime);
|
||||
updateStatement.setString(2, r.get("_row_key").toString());
|
||||
updateStatement.setString(3, r.get("rider").toString());
|
||||
updateStatement.setString(4, r.get("driver").toString());
|
||||
updateStatement.setDouble(5, Double.parseDouble(r.get("begin_lat").toString()));
|
||||
updateStatement.setDouble(6, Double.parseDouble(r.get("begin_lon").toString()));
|
||||
updateStatement.setDouble(7, Double.parseDouble(r.get("end_lat").toString()));
|
||||
updateStatement.setDouble(8, Double.parseDouble(r.get("end_lon").toString()));
|
||||
updateStatement.setDouble(9, Double.parseDouble(((GenericRecord) r.get("fare")).get("amount").toString()));
|
||||
updateStatement.setString(10, r.get("_row_key").toString());
|
||||
updateStatement.addBatch();
|
||||
} catch (SQLException e) {
|
||||
LOG.warn(e.getMessage());
|
||||
}
|
||||
});
|
||||
updateStatement.executeBatch();
|
||||
close(updateStatement);
|
||||
return updateRecords;
|
||||
}
|
||||
|
||||
private static void execute(Connection connection, String query, String message) {
|
||||
try (Statement statement = connection.createStatement()) {
|
||||
statement.executeUpdate(query);
|
||||
} catch (SQLException e) {
|
||||
LOG.error(message);
|
||||
}
|
||||
}
|
||||
|
||||
private static void close(Statement statement) {
|
||||
try {
|
||||
if (statement != null) {
|
||||
statement.close();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.error("Error while closing statement. " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
public static void close(Connection connection) {
|
||||
try {
|
||||
if (connection != null) {
|
||||
connection.close();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.error("Error while closing connection. " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
public static int count(Connection connection, String tableName) {
|
||||
try (Statement statement = connection.createStatement()) {
|
||||
ResultSet rs = statement.executeQuery(String.format("select count(*) from %s", tableName));
|
||||
rs.next();
|
||||
return rs.getInt(1);
|
||||
} catch (SQLException e) {
|
||||
LOG.warn("Error while counting records. " + e.getMessage());
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user