[HUDI-253]: added validations for schema provider class (#995)
This commit is contained in:
committed by
vinoth chandar
parent
1483b97018
commit
5f1309407a
@@ -19,6 +19,7 @@
|
|||||||
package org.apache.hudi.utilities.sources;
|
package org.apache.hudi.utilities.sources;
|
||||||
|
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
|
import org.apache.hudi.exception.HoodieException;
|
||||||
import org.apache.hudi.utilities.schema.SchemaProvider;
|
import org.apache.hudi.utilities.schema.SchemaProvider;
|
||||||
|
|
||||||
public class InputBatch<T> {
|
public class InputBatch<T> {
|
||||||
@@ -48,6 +49,9 @@ public class InputBatch<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public SchemaProvider getSchemaProvider() {
|
public SchemaProvider getSchemaProvider() {
|
||||||
|
if (schemaProvider == null) {
|
||||||
|
throw new HoodieException("Please provide a valid schema provider class!");
|
||||||
|
}
|
||||||
return schemaProvider;
|
return schemaProvider;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ import java.util.concurrent.Future;
|
|||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
@@ -46,12 +47,14 @@ import org.apache.hudi.common.util.Option;
|
|||||||
import org.apache.hudi.common.util.TypedProperties;
|
import org.apache.hudi.common.util.TypedProperties;
|
||||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||||
import org.apache.hudi.exception.DatasetNotFoundException;
|
import org.apache.hudi.exception.DatasetNotFoundException;
|
||||||
|
import org.apache.hudi.exception.HoodieException;
|
||||||
import org.apache.hudi.hive.HiveSyncConfig;
|
import org.apache.hudi.hive.HiveSyncConfig;
|
||||||
import org.apache.hudi.hive.HoodieHiveClient;
|
import org.apache.hudi.hive.HoodieHiveClient;
|
||||||
import org.apache.hudi.hive.MultiPartKeysValueExtractor;
|
import org.apache.hudi.hive.MultiPartKeysValueExtractor;
|
||||||
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
|
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
|
||||||
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Operation;
|
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Operation;
|
||||||
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
|
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
|
||||||
|
import org.apache.hudi.utilities.schema.SchemaProvider;
|
||||||
import org.apache.hudi.utilities.sources.DistributedTestDataSource;
|
import org.apache.hudi.utilities.sources.DistributedTestDataSource;
|
||||||
import org.apache.hudi.utilities.sources.HoodieIncrSource;
|
import org.apache.hudi.utilities.sources.HoodieIncrSource;
|
||||||
import org.apache.hudi.utilities.sources.InputBatch;
|
import org.apache.hudi.utilities.sources.InputBatch;
|
||||||
@@ -170,6 +173,11 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
|
|
||||||
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName,
|
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName,
|
||||||
String propsFilename, boolean enableHiveSync) {
|
String propsFilename, boolean enableHiveSync) {
|
||||||
|
return makeConfig(basePath, op, transformerClassName, propsFilename, enableHiveSync, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String transformerClassName,
|
||||||
|
String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass) {
|
||||||
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
|
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
|
||||||
cfg.targetBasePath = basePath;
|
cfg.targetBasePath = basePath;
|
||||||
cfg.targetTableName = "hoodie_trips";
|
cfg.targetTableName = "hoodie_trips";
|
||||||
@@ -181,12 +189,14 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
cfg.sourceOrderingField = "timestamp";
|
cfg.sourceOrderingField = "timestamp";
|
||||||
cfg.propsFilePath = dfsBasePath + "/" + propsFilename;
|
cfg.propsFilePath = dfsBasePath + "/" + propsFilename;
|
||||||
cfg.sourceLimit = 1000;
|
cfg.sourceLimit = 1000;
|
||||||
|
if (useSchemaProviderClass) {
|
||||||
cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName();
|
cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName();
|
||||||
|
}
|
||||||
return cfg;
|
return cfg;
|
||||||
}
|
}
|
||||||
|
|
||||||
static HoodieDeltaStreamer.Config makeConfigForHudiIncrSrc(String srcBasePath, String basePath, Operation op,
|
static HoodieDeltaStreamer.Config makeConfigForHudiIncrSrc(String srcBasePath, String basePath, Operation op,
|
||||||
boolean addReadLatestOnMissingCkpt) {
|
boolean addReadLatestOnMissingCkpt, String schemaProviderClassName) {
|
||||||
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
|
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
|
||||||
cfg.targetBasePath = basePath;
|
cfg.targetBasePath = basePath;
|
||||||
cfg.targetTableName = "hoodie_trips_copy";
|
cfg.targetTableName = "hoodie_trips_copy";
|
||||||
@@ -196,6 +206,9 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
cfg.sourceOrderingField = "timestamp";
|
cfg.sourceOrderingField = "timestamp";
|
||||||
cfg.propsFilePath = dfsBasePath + "/test-downstream-source.properties";
|
cfg.propsFilePath = dfsBasePath + "/test-downstream-source.properties";
|
||||||
cfg.sourceLimit = 1000;
|
cfg.sourceLimit = 1000;
|
||||||
|
if (null != schemaProviderClassName) {
|
||||||
|
cfg.schemaProviderClassName = schemaProviderClassName;
|
||||||
|
}
|
||||||
List<String> cfgs = new ArrayList<>();
|
List<String> cfgs = new ArrayList<>();
|
||||||
cfgs.add("hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt=" + addReadLatestOnMissingCkpt);
|
cfgs.add("hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt=" + addReadLatestOnMissingCkpt);
|
||||||
cfgs.add("hoodie.deltastreamer.source.hoodieincr.path=" + srcBasePath);
|
cfgs.add("hoodie.deltastreamer.source.hoodieincr.path=" + srcBasePath);
|
||||||
@@ -412,7 +425,8 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
|
|
||||||
// Now incrementally pull from the above hudi table and ingest to second table
|
// Now incrementally pull from the above hudi table and ingest to second table
|
||||||
HoodieDeltaStreamer.Config downstreamCfg =
|
HoodieDeltaStreamer.Config downstreamCfg =
|
||||||
TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, downstreamDatasetBasePath, Operation.BULK_INSERT, true);
|
TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, downstreamDatasetBasePath, Operation.BULK_INSERT,
|
||||||
|
true, null);
|
||||||
new HoodieDeltaStreamer(downstreamCfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
new HoodieDeltaStreamer(downstreamCfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
||||||
TestHelpers.assertRecordCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
TestHelpers.assertRecordCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
||||||
TestHelpers.assertDistanceCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
TestHelpers.assertDistanceCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
||||||
@@ -428,7 +442,10 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1);
|
TestHelpers.assertCommitMetadata("00000", datasetBasePath, dfs, 1);
|
||||||
|
|
||||||
// with no change in upstream table, no change in downstream too when pulled.
|
// with no change in upstream table, no change in downstream too when pulled.
|
||||||
new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
|
HoodieDeltaStreamer.Config downstreamCfg1 =
|
||||||
|
TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, downstreamDatasetBasePath,
|
||||||
|
Operation.BULK_INSERT, true, DummySchemaProvider.class.getName());
|
||||||
|
new HoodieDeltaStreamer(downstreamCfg1, jsc).sync();
|
||||||
TestHelpers.assertRecordCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
TestHelpers.assertRecordCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
||||||
TestHelpers.assertDistanceCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
TestHelpers.assertDistanceCount(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
||||||
TestHelpers.assertDistanceCountWithExactValue(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
TestHelpers.assertDistanceCountWithExactValue(1000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
||||||
@@ -447,7 +464,8 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
|
|
||||||
// Incrementally pull changes in upstream hudi table and apply to downstream table
|
// Incrementally pull changes in upstream hudi table and apply to downstream table
|
||||||
downstreamCfg =
|
downstreamCfg =
|
||||||
TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, downstreamDatasetBasePath, Operation.UPSERT, false);
|
TestHelpers.makeConfigForHudiIncrSrc(datasetBasePath, downstreamDatasetBasePath, Operation.UPSERT,
|
||||||
|
false, null);
|
||||||
downstreamCfg.sourceLimit = 2000;
|
downstreamCfg.sourceLimit = 2000;
|
||||||
new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
|
new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
|
||||||
TestHelpers.assertRecordCount(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
TestHelpers.assertRecordCount(2000, downstreamDatasetBasePath + "/*/*.parquet", sqlContext);
|
||||||
@@ -467,6 +485,21 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
hiveClient.getLastCommitTimeSynced().get());
|
hiveClient.getLastCommitTimeSynced().get());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testNullSchemaProvider() throws Exception {
|
||||||
|
String dataSetBasePath = dfsBasePath + "/test_dataset";
|
||||||
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(dataSetBasePath, Operation.BULK_INSERT,
|
||||||
|
SqlQueryBasedTransformer.class.getName(), PROPS_FILENAME_TEST_SOURCE, true,
|
||||||
|
false);
|
||||||
|
try {
|
||||||
|
new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
|
||||||
|
fail("Should error out when schema provider is not provided");
|
||||||
|
} catch (HoodieException e) {
|
||||||
|
log.error("Expected error during reading data from source ", e);
|
||||||
|
assertTrue(e.getMessage().contains("Please provide a valid schema provider class!"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFilterDupes() throws Exception {
|
public void testFilterDupes() throws Exception {
|
||||||
String datasetBasePath = dfsBasePath + "/test_dupes_dataset";
|
String datasetBasePath = dfsBasePath + "/test_dupes_dataset";
|
||||||
@@ -577,4 +610,16 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
return sparkSession.createDataFrame(jsc.emptyRDD(), rowDataset.schema());
|
return sparkSession.createDataFrame(jsc.emptyRDD(), rowDataset.schema());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static class DummySchemaProvider extends SchemaProvider {
|
||||||
|
|
||||||
|
public DummySchemaProvider(TypedProperties props, JavaSparkContext jssc) {
|
||||||
|
super(props, jssc);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Schema getSourceSchema() {
|
||||||
|
return Schema.create(Schema.Type.NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user