[HUDI-1743] Added support for SqlFileBasedTransformer (#2747)
committed by GitHub
parent 919590988a
commit 57611d10b5
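This commit adds a new SqlFileBasedTransformer for DeltaStreamer, a unit test class covering the missing-config, missing-file, invalid-SQL, empty-result, and happy-path cases, and three SQL fixture files used by those tests.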
@@ -0,0 +1,106 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.utilities.transform;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import java.io.IOException;
import java.util.Scanner;
import java.util.UUID;

/**
 * A transformer that applies a SQL template file to the source before writing to the
 * Hudi dataset.
 *
 * <p>The query should reference the source as a table named "\<SRC\>".
 *
 * <p>The result of the final SQL statement is used as the write payload.
 *
 * <p>The SQL file is configured with this hoodie property:
 * hoodie.deltastreamer.transformer.sql.file
 *
 * <p>Example Spark SQL query:
 *
 * <p>CACHE TABLE tmp_personal_trips AS
 * SELECT * FROM \<SRC\> WHERE trip_type='personal_trips';
 *
 * <p>SELECT * FROM tmp_personal_trips;
 */
public class SqlFileBasedTransformer implements Transformer {

  private static final Logger LOG = LogManager.getLogger(SqlFileBasedTransformer.class);

  private static final String SRC_PATTERN = "<SRC>";
  private static final String TMP_TABLE = "HOODIE_SRC_TMP_TABLE_";

  @Override
  public Dataset<Row> apply(
      final JavaSparkContext jsc,
      final SparkSession sparkSession,
      final Dataset<Row> rowDataset,
      final TypedProperties props) {

    final String sqlFile = props.getString(Config.TRANSFORMER_SQL_FILE);
    if (null == sqlFile) {
      throw new IllegalArgumentException(
          "Missing required configuration : (" + Config.TRANSFORMER_SQL_FILE + ")");
    }

    final FileSystem fs = FSUtils.getFs(sqlFile, jsc.hadoopConfiguration(), true);
    // Spark temp table names cannot contain dashes, so replace them with underscores.
    final String tmpTable = TMP_TABLE.concat(UUID.randomUUID().toString().replace("-", "_"));
    LOG.info("Registering tmp table : " + tmpTable);
    rowDataset.registerTempTable(tmpTable);

    try (final Scanner scanner = new Scanner(fs.open(new Path(sqlFile)), "UTF-8")) {
      Dataset<Row> rows = null;
      // Each SQL statement is separated by a semicolon, hence set that as the delimiter.
      scanner.useDelimiter(";");
      LOG.info("SQL Query for transformation : ");
      while (scanner.hasNext()) {
        String sqlStr = scanner.next();
        sqlStr = sqlStr.replaceAll(SRC_PATTERN, tmpTable).trim();
        if (!sqlStr.isEmpty()) {
          LOG.info(sqlStr);
          // Overwrite the same dataset object until the last statement, then return it.
          rows = sparkSession.sql(sqlStr);
        }
      }
      return rows;
    } catch (final IOException ioe) {
      throw new HoodieIOException("Error reading transformer SQL file.", ioe);
    }
  }

  /** Configs supported. */
  private static class Config {
    private static final String TRANSFORMER_SQL_FILE = "hoodie.deltastreamer.transformer.sql.file";
  }
}
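For context, a minimal usage sketch follows; it is not part of this commit, and the file path, source query, and example class name are hypothetical. It shows how the transformer consumes the property and the SQL template when invoked directly; in a DeltaStreamer pipeline the class is typically selected with the --transformer-class option instead.

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.transform.SqlFileBasedTransformer;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SqlFileTransformExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("sql-file-transform-example")
        .master("local[*]")
        .getOrCreate();
    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    // Source rows; in DeltaStreamer these come from the configured Source implementation.
    Dataset<Row> sourceRows = spark.sql("SELECT 'personal_trips' AS trip_type");

    // Point the transformer at a SQL template file (hypothetical path).
    TypedProperties props = new TypedProperties();
    props.setProperty("hoodie.deltastreamer.transformer.sql.file", "file:///tmp/transform.sql");

    // The result of the last statement in the file becomes the write payload.
    Dataset<Row> transformed = new SqlFileBasedTransformer().apply(jsc, spark, sourceRows, props);
    transformed.show();
  }
}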
@@ -0,0 +1,178 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.utilities.transform;

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.catalyst.parser.ParseException;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.util.ArrayList;
import java.util.List;

import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;

public class TestSqlFileBasedTransformer extends UtilitiesTestBase {
  private TypedProperties props;
  private SqlFileBasedTransformer sqlFileTransformer;
  private Dataset<Row> inputDatasetRows;
  private Dataset<Row> emptyDatasetRow;

  @BeforeAll
  public static void initClass() throws Exception {
    UtilitiesTestBase.initClass();
    // Copy the three SQL fixture files to the test DFS.
    UtilitiesTestBase.Helpers.copyToDFS(
        "delta-streamer-config/sql-file-transformer.sql",
        UtilitiesTestBase.dfs,
        UtilitiesTestBase.dfsBasePath + "/sql-file-transformer.sql");
    UtilitiesTestBase.Helpers.copyToDFS(
        "delta-streamer-config/sql-file-transformer-invalid.sql",
        UtilitiesTestBase.dfs,
        UtilitiesTestBase.dfsBasePath + "/sql-file-transformer-invalid.sql");
    UtilitiesTestBase.Helpers.copyToDFS(
        "delta-streamer-config/sql-file-transformer-empty.sql",
        UtilitiesTestBase.dfs,
        UtilitiesTestBase.dfsBasePath + "/sql-file-transformer-empty.sql");
  }

  @AfterAll
  public static void cleanupClass() {
    UtilitiesTestBase.cleanupClass();
  }

  @Override
  @BeforeEach
  public void setup() throws Exception {
    super.setup();
    props = new TypedProperties();
    sqlFileTransformer = new SqlFileBasedTransformer();
    inputDatasetRows = getInputDatasetRows();
    emptyDatasetRow = getEmptyDatasetRow();
  }

  @Override
  @AfterEach
  public void teardown() throws Exception {
    super.teardown();
  }

  @Test
  public void testSqlFileBasedTransformerIllegalArguments() {
    // The transformer should throw IllegalArgumentException when the config is not set.
    assertThrows(
        IllegalArgumentException.class,
        () -> sqlFileTransformer.apply(jsc, sparkSession, inputDatasetRows, props));
  }

  @Test
  public void testSqlFileBasedTransformerIncorrectConfig() {
    // The transformer should throw HoodieIOException when the configured SQL file does not exist.
    props.setProperty(
        "hoodie.deltastreamer.transformer.sql.file",
        UtilitiesTestBase.dfsBasePath + "/non-exist-sql-file.sql");
    assertThrows(
        HoodieIOException.class,
        () -> sqlFileTransformer.apply(jsc, sparkSession, inputDatasetRows, props));
  }

  @Test
  public void testSqlFileBasedTransformerInvalidSQL() {
    // The transformer should surface a ParseException for invalid SQL statements.
    props.setProperty(
        "hoodie.deltastreamer.transformer.sql.file",
        UtilitiesTestBase.dfsBasePath + "/sql-file-transformer-invalid.sql");
    assertThrows(
        ParseException.class,
        () -> sqlFileTransformer.apply(jsc, sparkSession, inputDatasetRows, props));
  }

  @Test
  public void testSqlFileBasedTransformerEmptyDataset() {
    // A SQL file whose final statement selects from an empty table should yield an empty dataset.
    props.setProperty(
        "hoodie.deltastreamer.transformer.sql.file",
        UtilitiesTestBase.dfsBasePath + "/sql-file-transformer-empty.sql");
    Dataset<Row> emptyRow = sqlFileTransformer.apply(jsc, sparkSession, inputDatasetRows, props);
    String[] actualRows = emptyRow.as(Encoders.STRING()).collectAsList().toArray(new String[0]);
    String[] expectedRows = emptyDatasetRow.as(Encoders.STRING()).collectAsList().toArray(new String[0]);
    assertArrayEquals(expectedRows, actualRows);
  }

  @Test
  public void testSqlFileBasedTransformer() {
    // The transformer should apply the SQL file as expected to valid input.
    props.setProperty(
        "hoodie.deltastreamer.transformer.sql.file",
        UtilitiesTestBase.dfsBasePath + "/sql-file-transformer.sql");
    Dataset<Row> transformedRow =
        sqlFileTransformer.apply(jsc, sparkSession, inputDatasetRows, props);

    // distinct() and sort() mirror the transformation in this file:
    // hudi-utilities/src/test/resources/delta-streamer-config/sql-file-transformer.sql
    String[] expectedRows =
        inputDatasetRows
            .distinct()
            .sort("col1")
            .as(Encoders.STRING())
            .collectAsList()
            .toArray(new String[0]);
    String[] actualRows = transformedRow.as(Encoders.STRING()).collectAsList().toArray(new String[0]);
    assertArrayEquals(expectedRows, actualRows);
  }

  private Dataset<Row> getInputDatasetRows() {
    // Create a few rows, including a duplicate, to exercise SELECT DISTINCT.
    List<Row> list = new ArrayList<>();
    list.add(RowFactory.create("one"));
    list.add(RowFactory.create("two"));
    list.add(RowFactory.create("three"));
    list.add(RowFactory.create("four"));
    list.add(RowFactory.create("four"));
    // Create the schema struct.
    List<org.apache.spark.sql.types.StructField> listOfStructField = new ArrayList<>();
    listOfStructField.add(DataTypes.createStructField("col1", DataTypes.StringType, true));
    StructType structType = DataTypes.createStructType(listOfStructField);
    // Create the data frame with the rows and schema.
    return sparkSession.createDataFrame(list, structType);
  }

  private Dataset<Row> getEmptyDatasetRow() {
    // Create the schema struct.
    List<org.apache.spark.sql.types.StructField> listOfStructField = new ArrayList<>();
    listOfStructField.add(DataTypes.createStructField("col1", DataTypes.StringType, true));
    StructType structType = DataTypes.createStructType(listOfStructField);
    // Create an empty data frame with the schema.
    List<Row> list = new ArrayList<>();
    return sparkSession.createDataFrame(list, structType);
  }
}
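The three SQL fixture files added below back these tests: sql-file-transformer-empty.sql drives the empty-dataset case, sql-file-transformer-invalid.sql the ParseException case, and sql-file-transformer.sql the happy path.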
@@ -0,0 +1,25 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
--   http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

CACHE TABLE tmp_trips
SELECT *
FROM <SRC>
WHERE
    True = False;

SELECT *
FROM tmp_trips;
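Note: the cached tmp_trips table is populated with WHERE True = False, so it contains no rows; the final SELECT therefore returns an empty dataset, which testSqlFileBasedTransformerEmptyDataset compares against an empty frame with the same single-column schema.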
@@ -0,0 +1,22 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
--   http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

SEECT DISTINCT col1
FROM <SRC>
ORDER BY col1;
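Note: "SEECT" is intentionally misspelled so that Spark's SQL parser rejects the statement, which is what testSqlFileBasedTransformerInvalidSQL asserts via ParseException.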
@@ -0,0 +1,27 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
--   http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.

SELECT RAND();

SELECT 1;

SELECT DISTINCT col1
FROM <SRC>
ORDER BY col1;
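Note: the transformer passes SELECT RAND(); and SELECT 1; through sparkSession.sql() but overwrites their results; only the final SELECT DISTINCT col1 FROM <SRC> ORDER BY col1 becomes the write payload, matching the .distinct().sort("col1") expectation in the test.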