New Features in DeltaStreamer :
(1) Apply transformation when using delta-streamer to ingest data. (2) Add Hudi Incremental Source for Delta Streamer (3) Allow delta-streamer config-property to be passed as command-line (4) Add Hive Integration to Delta-Streamer and address Review comments (5) Ensure MultiPartKeysValueExtractor handle hive style partition description (6) Reuse same spark session on both source and transformer (7) Support extracting partition fields from _hoodie_partition_path for HoodieIncrSource (8) Reuse Binary Avro coders (9) Add push down filter for Incremental source (10) Add Hoodie DeltaStreamer metrics to track total time taken
This commit is contained in:
committed by
vinoth chandar
parent
c70dbc13e9
commit
3a0044216c
@@ -0,0 +1,115 @@
|
||||
/*
|
||||
* Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
import com.beust.jcommander.Parameter;
|
||||
import com.uber.hoodie.HoodieWriteClient;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.utilities.deltastreamer.HoodieDeltaStreamer;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
public class HoodieCleaner {
|
||||
|
||||
private static volatile Logger log = LogManager.getLogger(HoodieDeltaStreamer.class);
|
||||
|
||||
/**
|
||||
* Config for Cleaner
|
||||
*/
|
||||
private final Config cfg;
|
||||
|
||||
/**
|
||||
* Filesystem used
|
||||
*/
|
||||
private transient FileSystem fs;
|
||||
|
||||
/**
|
||||
* Spark context
|
||||
*/
|
||||
private transient JavaSparkContext jssc;
|
||||
|
||||
/**
|
||||
* Bag of properties with source, hoodie client, key generator etc.
|
||||
*/
|
||||
TypedProperties props;
|
||||
|
||||
public HoodieCleaner(Config cfg, JavaSparkContext jssc) throws IOException {
|
||||
this.cfg = cfg;
|
||||
this.jssc = jssc;
|
||||
this.fs = FSUtils.getFs(cfg.basePath, jssc.hadoopConfiguration());
|
||||
|
||||
this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig();
|
||||
log.info("Creating Cleaner with configs : " + props.toString());
|
||||
}
|
||||
|
||||
public void run() throws Exception {
|
||||
HoodieWriteConfig hoodieCfg = getHoodieClientConfig();
|
||||
HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg, false);
|
||||
client.clean();
|
||||
}
|
||||
|
||||
private HoodieWriteConfig getHoodieClientConfig() throws Exception {
|
||||
return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.basePath)
|
||||
.withAutoCommit(false)
|
||||
.withProps(props).build();
|
||||
}
|
||||
|
||||
public static class Config implements Serializable {
|
||||
|
||||
@Parameter(names = {"--target-base-path"}, description = "base path for the hoodie dataset to be cleaner.",
|
||||
required = true)
|
||||
public String basePath;
|
||||
|
||||
@Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
|
||||
+ "hoodie client for cleaning")
|
||||
public String propsFilePath =
|
||||
"file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties";
|
||||
|
||||
@Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
|
||||
+ "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter")
|
||||
public List<String> configs = new ArrayList<>();
|
||||
|
||||
@Parameter(names = {"--spark-master"}, description = "spark master to use.")
|
||||
public String sparkMaster = "local[2]";
|
||||
|
||||
@Parameter(names = {"--help", "-h"}, help = true)
|
||||
public Boolean help = false;
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final Config cfg = new Config();
|
||||
JCommander cmd = new JCommander(cfg, args);
|
||||
if (cfg.help || args.length == 0) {
|
||||
cmd.usage();
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
String dirName = new Path(cfg.basePath).getName();
|
||||
JavaSparkContext jssc = UtilHelpers.buildSparkContext("hoodie-cleaner-" + dirName, cfg.sparkMaster);
|
||||
new HoodieCleaner(cfg, jssc).run();
|
||||
}
|
||||
}
|
||||
@@ -30,9 +30,13 @@ import com.uber.hoodie.exception.HoodieException;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import com.uber.hoodie.utilities.sources.Source;
|
||||
import com.uber.hoodie.utilities.transform.Transformer;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringReader;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
@@ -43,6 +47,7 @@ import org.apache.spark.Accumulator;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
/**
|
||||
* Bunch of helper methods
|
||||
@@ -51,12 +56,12 @@ public class UtilHelpers {
|
||||
private static Logger logger = LogManager.getLogger(UtilHelpers.class);
|
||||
|
||||
public static Source createSource(String sourceClass, TypedProperties cfg,
|
||||
JavaSparkContext jssc, SchemaProvider schemaProvider)
|
||||
JavaSparkContext jssc, SparkSession sparkSession, SchemaProvider schemaProvider)
|
||||
throws IOException {
|
||||
try {
|
||||
return (Source) ReflectionUtils.loadClass(sourceClass,
|
||||
new Class<?>[]{TypedProperties.class, JavaSparkContext.class, SchemaProvider.class},
|
||||
cfg, jssc, schemaProvider);
|
||||
new Class<?>[]{TypedProperties.class, JavaSparkContext.class, SparkSession.class, SchemaProvider.class},
|
||||
cfg, jssc, sparkSession, schemaProvider);
|
||||
} catch (Throwable e) {
|
||||
throw new IOException("Could not load source class " + sourceClass, e);
|
||||
}
|
||||
@@ -65,17 +70,31 @@ public class UtilHelpers {
|
||||
public static SchemaProvider createSchemaProvider(String schemaProviderClass,
|
||||
TypedProperties cfg, JavaSparkContext jssc) throws IOException {
|
||||
try {
|
||||
return (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc);
|
||||
return schemaProviderClass == null ? null :
|
||||
(SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc);
|
||||
} catch (Throwable e) {
|
||||
throw new IOException("Could not load schema provider class " + schemaProviderClass, e);
|
||||
}
|
||||
}
|
||||
|
||||
public static Transformer createTransformer(String transformerClass) throws IOException {
|
||||
try {
|
||||
return transformerClass == null ? null : (Transformer) ReflectionUtils.loadClass(transformerClass);
|
||||
} catch (Throwable e) {
|
||||
throw new IOException("Could not load transformer class " + transformerClass, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*/
|
||||
public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath) {
|
||||
public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath, List<String> overriddenProps) {
|
||||
try {
|
||||
return new DFSPropertiesConfiguration(fs, cfgPath);
|
||||
DFSPropertiesConfiguration conf = new DFSPropertiesConfiguration(fs, cfgPath);
|
||||
if (!overriddenProps.isEmpty()) {
|
||||
logger.info("Adding overridden properties to file properties.");
|
||||
conf.addProperties(new BufferedReader(new StringReader(String.join("\n", overriddenProps))));
|
||||
}
|
||||
return conf;
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Unable to read props file at :" + cfgPath, e);
|
||||
}
|
||||
@@ -109,7 +128,7 @@ public class UtilHelpers {
|
||||
sparkConf.set("spark.eventLog.overwrite", "true");
|
||||
sparkConf.set("spark.eventLog.enabled", "true");
|
||||
}
|
||||
sparkConf.set("spark.driver.maxResultSize", "2g");
|
||||
sparkConf.setIfMissing("spark.driver.maxResultSize", "2g");
|
||||
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compress", "true");
|
||||
sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
|
||||
|
||||
@@ -18,10 +18,15 @@
|
||||
|
||||
package com.uber.hoodie.utilities.deltastreamer;
|
||||
|
||||
import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE;
|
||||
import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME;
|
||||
|
||||
import com.beust.jcommander.IStringConverter;
|
||||
import com.beust.jcommander.JCommander;
|
||||
import com.beust.jcommander.Parameter;
|
||||
import com.beust.jcommander.ParameterException;
|
||||
import com.codahale.metrics.Timer;
|
||||
import com.uber.hoodie.AvroConversionUtils;
|
||||
import com.uber.hoodie.DataSourceUtils;
|
||||
import com.uber.hoodie.HoodieWriteClient;
|
||||
import com.uber.hoodie.KeyGenerator;
|
||||
@@ -36,32 +41,40 @@ import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.common.util.collection.Pair;
|
||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.hive.HiveSyncConfig;
|
||||
import com.uber.hoodie.hive.HiveSyncTool;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import com.uber.hoodie.utilities.HiveIncrementalPuller;
|
||||
import com.uber.hoodie.utilities.UtilHelpers;
|
||||
import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException;
|
||||
import com.uber.hoodie.utilities.schema.FilebasedSchemaProvider;
|
||||
import com.uber.hoodie.utilities.schema.RowBasedSchemaProvider;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import com.uber.hoodie.utilities.sources.InputBatch;
|
||||
import com.uber.hoodie.utilities.sources.JsonDFSSource;
|
||||
import com.uber.hoodie.utilities.sources.Source;
|
||||
import com.uber.hoodie.utilities.transform.Transformer;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.collection.JavaConversions;
|
||||
|
||||
/**
|
||||
@@ -81,7 +94,7 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
/**
|
||||
* Source to pull deltas from
|
||||
*/
|
||||
private transient Source source;
|
||||
private transient SourceFormatAdapter formatAdapter;
|
||||
|
||||
/**
|
||||
* Schema provider that supplies the command for reading the input and writing out the target
|
||||
@@ -89,6 +102,11 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
*/
|
||||
private transient SchemaProvider schemaProvider;
|
||||
|
||||
/**
|
||||
* Allows transforming source to target dataset before writing
|
||||
*/
|
||||
private transient Transformer transformer;
|
||||
|
||||
/**
|
||||
* Extract the key for the target dataset
|
||||
*/
|
||||
@@ -109,16 +127,30 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
*/
|
||||
private transient JavaSparkContext jssc;
|
||||
|
||||
/**
|
||||
* Spark Session
|
||||
*/
|
||||
private transient SparkSession sparkSession;
|
||||
|
||||
/**
|
||||
* Hive Config
|
||||
*/
|
||||
private transient HiveConf hiveConf;
|
||||
|
||||
/**
|
||||
* Bag of properties with source, hoodie client, key generator etc.
|
||||
*/
|
||||
TypedProperties props;
|
||||
|
||||
|
||||
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc) throws IOException {
|
||||
this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()),
|
||||
getDefaultHiveConf(jssc.hadoopConfiguration()));
|
||||
}
|
||||
|
||||
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf) throws IOException {
|
||||
this.cfg = cfg;
|
||||
this.jssc = jssc;
|
||||
this.sparkSession = SparkSession.builder().config(jssc.getConf()).getOrCreate();
|
||||
this.fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration());
|
||||
|
||||
if (fs.exists(new Path(cfg.targetBasePath))) {
|
||||
@@ -129,19 +161,28 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
this.commitTimelineOpt = Optional.empty();
|
||||
}
|
||||
|
||||
this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath)).getConfig();
|
||||
this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig();
|
||||
log.info("Creating delta streamer with configs : " + props.toString());
|
||||
this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jssc);
|
||||
this.transformer = UtilHelpers.createTransformer(cfg.transformerClassName);
|
||||
this.keyGenerator = DataSourceUtils.createKeyGenerator(cfg.keyGeneratorClass, props);
|
||||
this.source = UtilHelpers.createSource(cfg.sourceClassName, props, jssc, schemaProvider);
|
||||
|
||||
// register the schemas, so that shuffle does not serialize the full schemas
|
||||
List<Schema> schemas = Arrays.asList(schemaProvider.getSourceSchema(),
|
||||
schemaProvider.getTargetSchema());
|
||||
jssc.sc().getConf().registerAvroSchemas(JavaConversions.asScalaBuffer(schemas).toList());
|
||||
this.formatAdapter =
|
||||
new SourceFormatAdapter(UtilHelpers.createSource(cfg.sourceClassName, props, jssc, sparkSession,
|
||||
schemaProvider));
|
||||
|
||||
this.hiveConf = hiveConf;
|
||||
}
|
||||
|
||||
private static HiveConf getDefaultHiveConf(Configuration cfg) {
|
||||
HiveConf hiveConf = new HiveConf();
|
||||
hiveConf.addResource(cfg);
|
||||
return hiveConf;
|
||||
}
|
||||
|
||||
public void sync() throws Exception {
|
||||
HoodieDeltaStreamerMetrics metrics = new HoodieDeltaStreamerMetrics(getHoodieClientConfig(null));
|
||||
Timer.Context overallTimerContext = metrics.getOverallTimerContext();
|
||||
// Retrieve the previous round checkpoints, if any
|
||||
Optional<String> resumeCheckpointStr = Optional.empty();
|
||||
if (commitTimelineOpt.isPresent()) {
|
||||
@@ -163,16 +204,42 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
}
|
||||
log.info("Checkpoint to resume from : " + resumeCheckpointStr);
|
||||
|
||||
// Pull the data from the source & prepare the write
|
||||
Pair<Optional<JavaRDD<GenericRecord>>, String> dataAndCheckpoint = source.fetchNewData(
|
||||
resumeCheckpointStr, cfg.sourceLimit);
|
||||
final Optional<JavaRDD<GenericRecord>> avroRDDOptional;
|
||||
final String checkpointStr;
|
||||
final SchemaProvider schemaProvider;
|
||||
if (transformer != null) {
|
||||
// Transformation is needed. Fetch New rows in Row Format, apply transformation and then convert them
|
||||
// to generic records for writing
|
||||
InputBatch<Dataset<Row>> dataAndCheckpoint = formatAdapter.fetchNewDataInRowFormat(
|
||||
resumeCheckpointStr, cfg.sourceLimit);
|
||||
|
||||
if (!dataAndCheckpoint.getKey().isPresent()) {
|
||||
Optional<Dataset<Row>> transformed =
|
||||
dataAndCheckpoint.getBatch().map(data -> transformer.apply(jssc, sparkSession, data, props));
|
||||
checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
|
||||
avroRDDOptional = transformed.map(t ->
|
||||
AvroConversionUtils.createRdd(t, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD()
|
||||
);
|
||||
// Use Transformed Row's schema if not overridden
|
||||
schemaProvider =
|
||||
this.schemaProvider == null ? transformed.map(r -> (SchemaProvider)new RowBasedSchemaProvider(r.schema()))
|
||||
.orElse(dataAndCheckpoint.getSchemaProvider()) : this.schemaProvider;
|
||||
} else {
|
||||
// Pull the data from the source & prepare the write
|
||||
InputBatch<JavaRDD<GenericRecord>> dataAndCheckpoint =
|
||||
formatAdapter.fetchNewDataInAvroFormat(resumeCheckpointStr, cfg.sourceLimit);
|
||||
avroRDDOptional = dataAndCheckpoint.getBatch();
|
||||
checkpointStr = dataAndCheckpoint.getCheckpointForNextBatch();
|
||||
schemaProvider = dataAndCheckpoint.getSchemaProvider();
|
||||
}
|
||||
|
||||
if ((!avroRDDOptional.isPresent()) || (avroRDDOptional.get().isEmpty())) {
|
||||
log.info("No new data, nothing to commit.. ");
|
||||
return;
|
||||
}
|
||||
|
||||
JavaRDD<GenericRecord> avroRDD = dataAndCheckpoint.getKey().get();
|
||||
registerAvroSchemas(schemaProvider);
|
||||
|
||||
JavaRDD<GenericRecord> avroRDD = avroRDDOptional.get();
|
||||
JavaRDD<HoodieRecord> records = avroRDD.map(gr -> {
|
||||
HoodieRecordPayload payload = DataSourceUtils.createPayload(cfg.payloadClassName, gr,
|
||||
(Comparable) gr.get(cfg.sourceOrderingField));
|
||||
@@ -180,20 +247,20 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
});
|
||||
|
||||
// filter dupes if needed
|
||||
HoodieWriteConfig hoodieCfg = getHoodieClientConfig();
|
||||
HoodieWriteConfig hoodieCfg = getHoodieClientConfig(schemaProvider);
|
||||
if (cfg.filterDupes) {
|
||||
// turn upserts to insert
|
||||
cfg.operation = cfg.operation == Operation.UPSERT ? Operation.INSERT : cfg.operation;
|
||||
records = DataSourceUtils.dropDuplicates(jssc, records, hoodieCfg);
|
||||
}
|
||||
|
||||
if (records.isEmpty()) {
|
||||
log.info("No new data, nothing to commit.. ");
|
||||
return;
|
||||
if (records.isEmpty()) {
|
||||
log.info("No new data, nothing to commit.. ");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Perform the write
|
||||
HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg);
|
||||
HoodieWriteClient client = new HoodieWriteClient<>(jssc, hoodieCfg, true);
|
||||
String commitTime = client.startCommit();
|
||||
log.info("Starting commit : " + commitTime);
|
||||
|
||||
@@ -210,7 +277,7 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
|
||||
// Simply commit for now. TODO(vc): Support better error handlers later on
|
||||
HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
|
||||
checkpointCommitMetadata.put(CHECKPOINT_KEY, dataAndCheckpoint.getValue());
|
||||
checkpointCommitMetadata.put(CHECKPOINT_KEY, checkpointStr);
|
||||
|
||||
boolean success = client.commit(commitTime, writeStatusRDD,
|
||||
Optional.of(checkpointCommitMetadata));
|
||||
@@ -220,17 +287,54 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
} else {
|
||||
log.info("Commit " + commitTime + " failed!");
|
||||
}
|
||||
|
||||
// Sync to hive if enabled
|
||||
Timer.Context hiveSyncContext = metrics.getHiveSyncTimerContext();
|
||||
syncHive();
|
||||
long hiveSyncTimeMs = hiveSyncContext != null ? hiveSyncContext.stop() : 0;
|
||||
|
||||
client.close();
|
||||
long overallTimeMs = overallTimerContext != null ? overallTimerContext.stop() : 0;
|
||||
|
||||
// Send DeltaStreamer Metrics
|
||||
metrics.updateDeltaStreamerMetrics(overallTimeMs, hiveSyncTimeMs);
|
||||
}
|
||||
|
||||
private HoodieWriteConfig getHoodieClientConfig() throws Exception {
|
||||
return HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath)
|
||||
.withAutoCommit(false)
|
||||
.withSchema(schemaProvider.getTargetSchema().toString())
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build())
|
||||
.forTable(cfg.targetTableName)
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.withProps(props).build();
|
||||
public void syncHive() {
|
||||
if (cfg.enableHiveSync) {
|
||||
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath);
|
||||
log.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName
|
||||
+ "). Hive metastore URL :" + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath);
|
||||
|
||||
new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Register Avro Schemas
|
||||
* @param schemaProvider Schema Provider
|
||||
*/
|
||||
private void registerAvroSchemas(SchemaProvider schemaProvider) {
|
||||
// register the schemas, so that shuffle does not serialize the full schemas
|
||||
if (null != schemaProvider) {
|
||||
List<Schema> schemas = Arrays.asList(schemaProvider.getSourceSchema(), schemaProvider.getTargetSchema());
|
||||
log.info("Registering Schema :" + schemas);
|
||||
jssc.sc().getConf().registerAvroSchemas(JavaConversions.asScalaBuffer(schemas).toList());
|
||||
}
|
||||
}
|
||||
|
||||
private HoodieWriteConfig getHoodieClientConfig(SchemaProvider schemaProvider) throws Exception {
|
||||
HoodieWriteConfig.Builder builder =
|
||||
HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath)
|
||||
.withAutoCommit(false)
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build())
|
||||
.forTable(cfg.targetTableName)
|
||||
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.withProps(props);
|
||||
if (null != schemaProvider) {
|
||||
builder = builder.withSchema(schemaProvider.getTargetSchema().toString());
|
||||
}
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
public enum Operation {
|
||||
@@ -266,6 +370,10 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
public String propsFilePath =
|
||||
"file://" + System.getProperty("user.dir") + "/src/test/resources/delta-streamer-config/dfs-source.properties";
|
||||
|
||||
@Parameter(names = {"--hoodie-conf"}, description = "Any configuration that can be set in the properties file "
|
||||
+ "(using the CLI parameter \"--propsFilePath\") can also be passed command line using this parameter")
|
||||
public List<String> configs = new ArrayList<>();
|
||||
|
||||
@Parameter(names = {"--source-class"}, description = "Subclass of com.uber.hoodie.utilities.sources to read data. "
|
||||
+ "Built-in options: com.uber.hoodie.utilities.sources.{JsonDFSSource (default), AvroDFSSource, "
|
||||
+ "JsonKafkaSource, AvroKafkaSource, HiveIncrPullSource}")
|
||||
@@ -285,11 +393,22 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
public String payloadClassName = OverwriteWithLatestAvroPayload.class.getName();
|
||||
|
||||
@Parameter(names = {"--schemaprovider-class"}, description = "subclass of com.uber.hoodie.utilities.schema"
|
||||
+ ".SchemaProvider to attach schemas to input & target table data, built in options: FilebasedSchemaProvider")
|
||||
public String schemaProviderClassName = FilebasedSchemaProvider.class.getName();
|
||||
+ ".SchemaProvider to attach schemas to input & target table data, built in options: "
|
||||
+ "com.uber.hoodie.utilities.schema.FilebasedSchemaProvider."
|
||||
+ "Source (See com.uber.hoodie.utilities.sources.Source) implementation can implement their own SchemaProvider."
|
||||
+ " For Sources that return Dataset<Row>, the schema is obtained implicitly. "
|
||||
+ "However, this CLI option allows overriding the schemaprovider returned by Source.")
|
||||
public String schemaProviderClassName = null;
|
||||
|
||||
@Parameter(names = {"--transformer-class"},
|
||||
description = "subclass of com.uber.hoodie.utilities.transform.Transformer"
|
||||
+ ". Allows transforming raw source dataset to a target dataset (conforming to target schema) before writing."
|
||||
+ " Default : Not set. E:g - com.uber.hoodie.utilities.transform.SqlQueryBasedTransformer (which allows"
|
||||
+ "a SQL query templated to be passed as a transformation function)")
|
||||
public String transformerClassName = null;
|
||||
|
||||
@Parameter(names = {"--source-limit"}, description = "Maximum amount of data to read from source. "
|
||||
+ "Default: No limit For e.g: DFSSource => max bytes to read, KafkaSource => max events to read")
|
||||
+ "Default: No limit For e.g: DFS-Source => max bytes to read, Kafka-Source => max events to read")
|
||||
public long sourceLimit = Long.MAX_VALUE;
|
||||
|
||||
@Parameter(names = {"--op"}, description = "Takes one of these values : UPSERT (default), INSERT (use when input "
|
||||
@@ -301,6 +420,9 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
+ "before insert/bulk-insert")
|
||||
public Boolean filterDupes = false;
|
||||
|
||||
@Parameter(names = {"--enable-hive-sync"}, description = "Enable syncing to hive")
|
||||
public Boolean enableHiveSync = false;
|
||||
|
||||
@Parameter(names = {"--spark-master"}, description = "spark master to use.")
|
||||
public String sparkMaster = "local[2]";
|
||||
|
||||
@@ -319,4 +441,44 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
JavaSparkContext jssc = UtilHelpers.buildSparkContext("delta-streamer-" + cfg.targetTableName, cfg.sparkMaster);
|
||||
new HoodieDeltaStreamer(cfg, jssc).sync();
|
||||
}
|
||||
|
||||
public SourceFormatAdapter getFormatAdapter() {
|
||||
return formatAdapter;
|
||||
}
|
||||
|
||||
public SchemaProvider getSchemaProvider() {
|
||||
return schemaProvider;
|
||||
}
|
||||
|
||||
public Transformer getTransformer() {
|
||||
return transformer;
|
||||
}
|
||||
|
||||
public KeyGenerator getKeyGenerator() {
|
||||
return keyGenerator;
|
||||
}
|
||||
|
||||
public FileSystem getFs() {
|
||||
return fs;
|
||||
}
|
||||
|
||||
public Optional<HoodieTimeline> getCommitTimelineOpt() {
|
||||
return commitTimelineOpt;
|
||||
}
|
||||
|
||||
public JavaSparkContext getJssc() {
|
||||
return jssc;
|
||||
}
|
||||
|
||||
public SparkSession getSparkSession() {
|
||||
return sparkSession;
|
||||
}
|
||||
|
||||
public HiveConf getHiveConf() {
|
||||
return hiveConf;
|
||||
}
|
||||
|
||||
public TypedProperties getProps() {
|
||||
return props;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
package com.uber.hoodie.utilities.deltastreamer;
|
||||
|
||||
import static com.uber.hoodie.metrics.Metrics.registerGauge;
|
||||
|
||||
import com.codahale.metrics.Timer;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.metrics.Metrics;
|
||||
|
||||
public class HoodieDeltaStreamerMetrics {
|
||||
|
||||
private HoodieWriteConfig config = null;
|
||||
private String tableName = null;
|
||||
|
||||
public String overallTimerName = null;
|
||||
public String hiveSyncTimerName = null;
|
||||
private Timer overallTimer = null;
|
||||
public Timer hiveSyncTimer = null;
|
||||
|
||||
public HoodieDeltaStreamerMetrics(HoodieWriteConfig config) {
|
||||
this.config = config;
|
||||
this.tableName = config.getTableName();
|
||||
if (config.isMetricsOn()) {
|
||||
Metrics.init(config);
|
||||
this.overallTimerName = getMetricsName("timer", "deltastreamer");
|
||||
this.hiveSyncTimerName = getMetricsName("timer", "deltastreamerHiveSync");
|
||||
}
|
||||
}
|
||||
|
||||
public Timer.Context getOverallTimerContext() {
|
||||
if (config.isMetricsOn() && overallTimer == null) {
|
||||
overallTimer = createTimer(overallTimerName);
|
||||
}
|
||||
return overallTimer == null ? null : overallTimer.time();
|
||||
}
|
||||
|
||||
public Timer.Context getHiveSyncTimerContext() {
|
||||
if (config.isMetricsOn() && hiveSyncTimer == null) {
|
||||
hiveSyncTimer = createTimer(hiveSyncTimerName);
|
||||
}
|
||||
return hiveSyncTimer == null ? null : hiveSyncTimer.time();
|
||||
}
|
||||
|
||||
private Timer createTimer(String name) {
|
||||
return config.isMetricsOn() ? Metrics.getInstance().getRegistry().timer(name) : null;
|
||||
}
|
||||
|
||||
String getMetricsName(String action, String metric) {
|
||||
return config == null ? null : String.format("%s.%s.%s", tableName, action, metric);
|
||||
}
|
||||
|
||||
public void updateDeltaStreamerMetrics(long durationInNs, long hiveSyncNs) {
|
||||
if (config.isMetricsOn()) {
|
||||
registerGauge(getMetricsName("deltastreamer", "duration"), getDurationInMs(durationInNs));
|
||||
registerGauge(getMetricsName("deltastreamer", "hiveSyncDuration"), getDurationInMs(hiveSyncNs));
|
||||
}
|
||||
}
|
||||
|
||||
public long getDurationInMs(long ctxDuration) {
|
||||
return ctxDuration / 1000000;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,112 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.deltastreamer;
|
||||
|
||||
import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE;
|
||||
import static com.uber.hoodie.utilities.schema.RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME;
|
||||
|
||||
import com.uber.hoodie.AvroConversionUtils;
|
||||
import com.uber.hoodie.utilities.sources.AvroSource;
|
||||
import com.uber.hoodie.utilities.sources.InputBatch;
|
||||
import com.uber.hoodie.utilities.sources.JsonSource;
|
||||
import com.uber.hoodie.utilities.sources.RowSource;
|
||||
import com.uber.hoodie.utilities.sources.Source;
|
||||
import com.uber.hoodie.utilities.sources.helpers.AvroConvertor;
|
||||
import java.util.Optional;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
/**
|
||||
* Adapts data-format provided by the source to the data-format required by the client (DeltaStreamer)
|
||||
*/
|
||||
public final class SourceFormatAdapter {
|
||||
|
||||
private final Source source;
|
||||
|
||||
|
||||
public SourceFormatAdapter(Source source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch new data in avro format. If the source provides data in different format, they are translated
|
||||
* to Avro format
|
||||
* @param lastCkptStr
|
||||
* @param sourceLimit
|
||||
* @return
|
||||
*/
|
||||
public InputBatch<JavaRDD<GenericRecord>> fetchNewDataInAvroFormat(Optional<String> lastCkptStr,
|
||||
long sourceLimit) {
|
||||
switch (source.getSourceType()) {
|
||||
case AVRO:
|
||||
return ((AvroSource)source).fetchNext(lastCkptStr, sourceLimit);
|
||||
case JSON: {
|
||||
InputBatch<JavaRDD<String>> r = ((JsonSource)source).fetchNext(lastCkptStr, sourceLimit);
|
||||
AvroConvertor convertor = new AvroConvertor(r.getSchemaProvider().getSourceSchema());
|
||||
return new InputBatch<>(Optional.ofNullable(
|
||||
r.getBatch().map(rdd -> rdd.map(convertor::fromJson))
|
||||
.orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider());
|
||||
}
|
||||
case ROW: {
|
||||
InputBatch<Dataset<Row>> r = ((RowSource)source).fetchNext(lastCkptStr, sourceLimit);
|
||||
return new InputBatch<>(Optional.ofNullable(r.getBatch().map(
|
||||
rdd -> (AvroConversionUtils.createRdd(rdd, HOODIE_RECORD_STRUCT_NAME, HOODIE_RECORD_NAMESPACE).toJavaRDD()))
|
||||
.orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider());
|
||||
}
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch new data in row format. If the source provides data in different format, they are translated
|
||||
* to Row format
|
||||
* @param lastCkptStr
|
||||
* @param sourceLimit
|
||||
* @return
|
||||
*/
|
||||
public InputBatch<Dataset<Row>> fetchNewDataInRowFormat(Optional<String> lastCkptStr, long sourceLimit) {
|
||||
switch (source.getSourceType()) {
|
||||
case ROW:
|
||||
return ((RowSource)source).fetchNext(lastCkptStr, sourceLimit);
|
||||
case AVRO: {
|
||||
InputBatch<JavaRDD<GenericRecord>> r = ((AvroSource)source).fetchNext(lastCkptStr, sourceLimit);
|
||||
Schema sourceSchema = r.getSchemaProvider().getSourceSchema();
|
||||
return new InputBatch<>(Optional.ofNullable(
|
||||
r.getBatch().map(rdd -> AvroConversionUtils.createDataFrame(JavaRDD.toRDD(rdd),
|
||||
sourceSchema.toString(), source.getSparkSession()))
|
||||
.orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider());
|
||||
}
|
||||
case JSON: {
|
||||
InputBatch<JavaRDD<String>> r = ((JsonSource)source).fetchNext(lastCkptStr, sourceLimit);
|
||||
Schema sourceSchema = r.getSchemaProvider().getSourceSchema();
|
||||
StructType dataType = AvroConversionUtils.convertAvroSchemaToStructType(sourceSchema);
|
||||
return new InputBatch<>(Optional.ofNullable(
|
||||
r.getBatch().map(rdd -> source.getSparkSession().read().schema(dataType).json(rdd))
|
||||
.orElse(null)), r.getCheckpointForNextBatch(), r.getSchemaProvider());
|
||||
}
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown source type (" + source.getSourceType() + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
package com.uber.hoodie.utilities.schema;
|
||||
|
||||
import com.uber.hoodie.AvroConversionUtils;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
public class RowBasedSchemaProvider extends SchemaProvider {
|
||||
|
||||
// Used in GenericRecord conversions
|
||||
public static final String HOODIE_RECORD_NAMESPACE = "hoodie.source";
|
||||
public static final String HOODIE_RECORD_STRUCT_NAME = "hoodie_source";
|
||||
|
||||
private StructType rowStruct;
|
||||
|
||||
public RowBasedSchemaProvider(StructType rowStruct) {
|
||||
super(null, null);
|
||||
this.rowStruct = rowStruct;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Schema getSourceSchema() {
|
||||
return AvroConversionUtils.convertStructTypeToAvroSchema(rowStruct, HOODIE_RECORD_STRUCT_NAME,
|
||||
HOODIE_RECORD_NAMESPACE);
|
||||
}
|
||||
}
|
||||
@@ -42,12 +42,15 @@ public class SchemaRegistryProvider extends SchemaProvider {
|
||||
*/
|
||||
public static class Config {
|
||||
|
||||
private static final String SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url";
|
||||
private static final String SRC_SCHEMA_REGISTRY_URL_PROP = "hoodie.deltastreamer.schemaprovider.registry.url";
|
||||
private static final String TARGET_SCHEMA_REGISTRY_URL_PROP =
|
||||
"hoodie.deltastreamer.schemaprovider.registry.targetUrl";
|
||||
}
|
||||
|
||||
private final Schema schema;
|
||||
private final Schema targetSchema;
|
||||
|
||||
private String fetchSchemaFromRegistry(String registryUrl) throws IOException {
|
||||
private static String fetchSchemaFromRegistry(String registryUrl) throws IOException {
|
||||
URL registry = new URL(registryUrl);
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
JsonNode node = mapper.readTree(registry.openStream());
|
||||
@@ -56,17 +59,32 @@ public class SchemaRegistryProvider extends SchemaProvider {
|
||||
|
||||
public SchemaRegistryProvider(TypedProperties props, JavaSparkContext jssc) {
|
||||
super(props, jssc);
|
||||
DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.SCHEMA_REGISTRY_URL_PROP));
|
||||
String registryUrl = props.getString(Config.SCHEMA_REGISTRY_URL_PROP);
|
||||
DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.SRC_SCHEMA_REGISTRY_URL_PROP));
|
||||
String registryUrl = props.getString(Config.SRC_SCHEMA_REGISTRY_URL_PROP);
|
||||
String targetRegistryUrl = props.getString(Config.TARGET_SCHEMA_REGISTRY_URL_PROP, registryUrl);
|
||||
try {
|
||||
this.schema = new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl));
|
||||
this.schema = getSchema(registryUrl);
|
||||
if (!targetRegistryUrl.equals(registryUrl)) {
|
||||
this.targetSchema = getSchema(targetRegistryUrl);
|
||||
} else {
|
||||
this.targetSchema = schema;
|
||||
}
|
||||
} catch (IOException ioe) {
|
||||
throw new HoodieIOException("Error reading schema from registry :" + registryUrl, ioe);
|
||||
}
|
||||
}
|
||||
|
||||
private static Schema getSchema(String registryUrl) throws IOException {
|
||||
return new Schema.Parser().parse(fetchSchemaFromRegistry(registryUrl));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Schema getSourceSchema() {
|
||||
return schema;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Schema getTargetSchema() {
|
||||
return targetSchema;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,7 +19,10 @@
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.common.util.collection.Pair;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import com.uber.hoodie.utilities.sources.helpers.DFSPathSelector;
|
||||
import java.util.Optional;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.mapred.AvroKey;
|
||||
import org.apache.avro.mapreduce.AvroKeyInputFormat;
|
||||
@@ -27,18 +30,33 @@ import org.apache.hadoop.io.NullWritable;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
/**
|
||||
* DFS Source that reads avro data
|
||||
*/
|
||||
public class AvroDFSSource extends DFSSource {
|
||||
public class AvroDFSSource extends AvroSource {
|
||||
|
||||
public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, schemaProvider);
|
||||
private final DFSPathSelector pathSelector;
|
||||
|
||||
public AvroDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, sparkSession, schemaProvider);
|
||||
this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
|
||||
protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Optional<String> lastCkptStr,
|
||||
long sourceLimit) {
|
||||
Pair<Optional<String>, String> selectPathsWithMaxModificationTime =
|
||||
pathSelector.getNextFilePathsAndMaxModificationTime(lastCkptStr, sourceLimit);
|
||||
return selectPathsWithMaxModificationTime.getLeft().map(pathStr -> new InputBatch<>(
|
||||
Optional.of(fromFiles(pathStr)),
|
||||
selectPathsWithMaxModificationTime.getRight()))
|
||||
.orElseGet(() -> new InputBatch<>(Optional.empty(), selectPathsWithMaxModificationTime.getRight()));
|
||||
}
|
||||
|
||||
private JavaRDD<GenericRecord> fromFiles(String pathStr) {
|
||||
JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
|
||||
AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
|
||||
sparkContext.hadoopConfiguration());
|
||||
|
||||
@@ -20,27 +20,55 @@ package com.uber.hoodie.utilities.sources;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen;
|
||||
import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
|
||||
import io.confluent.kafka.serializers.KafkaAvroDecoder;
|
||||
import java.util.Optional;
|
||||
import kafka.serializer.StringDecoder;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.streaming.kafka.KafkaUtils;
|
||||
import org.apache.spark.streaming.kafka.OffsetRange;
|
||||
|
||||
/**
|
||||
* Reads avro serialized Kafka data, based on the confluent schema-registry
|
||||
*/
|
||||
public class AvroKafkaSource extends KafkaSource {
|
||||
public class AvroKafkaSource extends AvroSource {
|
||||
|
||||
public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, schemaProvider);
|
||||
private static Logger log = LogManager.getLogger(AvroKafkaSource.class);
|
||||
|
||||
private final KafkaOffsetGen offsetGen;
|
||||
|
||||
public AvroKafkaSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, sparkSession, schemaProvider);
|
||||
offsetGen = new KafkaOffsetGen(props);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected JavaRDD<GenericRecord> toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor) {
|
||||
return KafkaUtils
|
||||
.createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class, kafkaParams,
|
||||
offsetRanges).values().map(obj -> (GenericRecord) obj);
|
||||
protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Optional<String> lastCheckpointStr,
|
||||
long sourceLimit) {
|
||||
OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit);
|
||||
long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
|
||||
if (totalNewMsgs <= 0) {
|
||||
return new InputBatch<>(Optional.empty(),
|
||||
lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : "");
|
||||
} else {
|
||||
log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
|
||||
}
|
||||
JavaRDD<GenericRecord> newDataRDD = toRDD(offsetRanges);
|
||||
return new InputBatch<>(Optional.of(newDataRDD),
|
||||
KafkaOffsetGen.CheckpointUtils.offsetsToStr(offsetRanges));
|
||||
}
|
||||
|
||||
private JavaRDD<GenericRecord> toRDD(OffsetRange[] offsetRanges) {
|
||||
JavaRDD<GenericRecord> recordRDD = KafkaUtils
|
||||
.createRDD(sparkContext, String.class, Object.class, StringDecoder.class, KafkaAvroDecoder.class,
|
||||
offsetGen.getKafkaParams(), offsetRanges).values().map(obj -> (GenericRecord) obj);
|
||||
return recordRDD;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
public abstract class AvroSource extends Source<JavaRDD<GenericRecord>> {
|
||||
|
||||
public AvroSource(TypedProperties props,
|
||||
JavaSparkContext sparkContext,
|
||||
SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, sparkSession, schemaProvider, SourceType.AVRO);
|
||||
}
|
||||
}
|
||||
@@ -21,8 +21,6 @@ package com.uber.hoodie.utilities.sources;
|
||||
import com.uber.hoodie.DataSourceUtils;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.common.util.collection.ImmutablePair;
|
||||
import com.uber.hoodie.common.util.collection.Pair;
|
||||
import com.uber.hoodie.exception.HoodieIOException;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import java.io.IOException;
|
||||
@@ -44,19 +42,20 @@ import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
/**
|
||||
* Source to read deltas produced by {@link com.uber.hoodie.utilities.HiveIncrementalPuller}, commit
|
||||
* by commit and apply to the target table
|
||||
* Source to read deltas produced by {@link com.uber.hoodie.utilities.HiveIncrementalPuller}, commit by commit and apply
|
||||
* to the target table
|
||||
* <p>
|
||||
* The general idea here is to have commits sync across the data pipeline.
|
||||
* <p>
|
||||
* [Source Tables(s)] ====> HiveIncrementalScanner ==> incrPullRootPath ==> targetTable
|
||||
* {c1,c2,c3,...} {c1,c2,c3,...} {c1,c2,c3,...}
|
||||
* [Source Tables(s)] ====> HiveIncrementalScanner ==> incrPullRootPath ==> targetTable {c1,c2,c3,...}
|
||||
* {c1,c2,c3,...} {c1,c2,c3,...}
|
||||
* <p>
|
||||
* This produces beautiful causality, that makes data issues in ETLs very easy to debug
|
||||
*/
|
||||
public class HiveIncrPullSource extends Source {
|
||||
public class HiveIncrPullSource extends AvroSource {
|
||||
|
||||
private static volatile Logger log = LogManager.getLogger(HiveIncrPullSource.class);
|
||||
|
||||
@@ -73,9 +72,9 @@ public class HiveIncrPullSource extends Source {
|
||||
private static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.incrpull.root";
|
||||
}
|
||||
|
||||
public HiveIncrPullSource(TypedProperties props, JavaSparkContext sparkContext,
|
||||
public HiveIncrPullSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, schemaProvider);
|
||||
super(props, sparkContext, sparkSession, schemaProvider);
|
||||
DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.ROOT_INPUT_PATH_PROP));
|
||||
this.incrPullRootPath = props.getString(Config.ROOT_INPUT_PATH_PROP);
|
||||
this.fs = FSUtils.getFs(incrPullRootPath, sparkContext.hadoopConfiguration());
|
||||
@@ -113,15 +112,15 @@ public class HiveIncrPullSource extends Source {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
|
||||
protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(
|
||||
Optional<String> lastCheckpointStr, long sourceLimit) {
|
||||
try {
|
||||
// find the source commit to pull
|
||||
Optional<String> commitToPull = findCommitToPull(lastCheckpointStr);
|
||||
|
||||
if (!commitToPull.isPresent()) {
|
||||
return new ImmutablePair<>(Optional.empty(),
|
||||
lastCheckpointStr.orElse(""));
|
||||
return new InputBatch<>(Optional.empty(),
|
||||
lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : "");
|
||||
}
|
||||
|
||||
// read the files out.
|
||||
@@ -132,7 +131,7 @@ public class HiveIncrPullSource extends Source {
|
||||
JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
|
||||
AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
|
||||
sparkContext.hadoopConfiguration());
|
||||
return new ImmutablePair<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
|
||||
return new InputBatch<>(Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
|
||||
String.valueOf(commitToPull.get()));
|
||||
} catch (IOException ioe) {
|
||||
throw new HoodieIOException(
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
|
||||
import com.uber.hoodie.DataSourceReadOptions;
|
||||
import com.uber.hoodie.DataSourceUtils;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.common.util.collection.Pair;
|
||||
import com.uber.hoodie.hive.SlashEncodedDayPartitionValueExtractor;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import com.uber.hoodie.utilities.sources.helpers.IncrSourceHelper;
|
||||
import java.util.Arrays;
|
||||
import java.util.Optional;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.DataFrameReader;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
public class HoodieIncrSource extends RowSource {
|
||||
|
||||
/**
|
||||
* Configs supported
|
||||
*/
|
||||
protected static class Config {
|
||||
|
||||
/**
|
||||
* {@value #HOODIE_SRC_BASE_PATH} is the base-path for the source Hoodie table
|
||||
*/
|
||||
private static final String HOODIE_SRC_BASE_PATH = "hoodie.deltastreamer.source.hoodieincr.path";
|
||||
|
||||
/**
|
||||
* {@value #NUM_INSTANTS_PER_FETCH} allows the max number of instants whose changes can be incrementally fetched
|
||||
*/
|
||||
private static final String NUM_INSTANTS_PER_FETCH = "hoodie.deltastreamer.source.hoodieincr.num_instants";
|
||||
private static final Integer DEFAULT_NUM_INSTANTS_PER_FETCH = 1;
|
||||
|
||||
/**
|
||||
* {@value #HOODIE_SRC_PARTITION_FIELDS} specifies partition fields that needs to be added to source table after
|
||||
* parsing _hoodie_partition_path
|
||||
*/
|
||||
private static final String HOODIE_SRC_PARTITION_FIELDS = "hoodie.deltastreamer.source.hoodieincr.partition.fields";
|
||||
|
||||
/**
|
||||
* {@value #HOODIE_SRC_PARTITION_EXTRACTORCLASS} PartitionValueExtractor class to extract partition fields from
|
||||
* _hoodie_partition_path
|
||||
*/
|
||||
private static final String HOODIE_SRC_PARTITION_EXTRACTORCLASS =
|
||||
"hoodie.deltastreamer.source.hoodieincr.partition.extractor.class";
|
||||
private static final String DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS =
|
||||
SlashEncodedDayPartitionValueExtractor.class.getCanonicalName();
|
||||
|
||||
/**
|
||||
* {@value #READ_LATEST_INSTANT_ON_MISSING_CKPT} allows delta-streamer to incrementally fetch from latest committed
|
||||
* instant when checkpoint is not provided.
|
||||
*/
|
||||
private static final String READ_LATEST_INSTANT_ON_MISSING_CKPT =
|
||||
"hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt";
|
||||
private static final Boolean DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT = false;
|
||||
}
|
||||
|
||||
public HoodieIncrSource(TypedProperties props,
|
||||
JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, sparkSession, schemaProvider);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Pair<Optional<Dataset<Row>>, String> fetchNextBatch(Optional<String> lastCkptStr, long sourceLimit) {
|
||||
|
||||
DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH));
|
||||
|
||||
/**
|
||||
DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH,
|
||||
Config.HOODIE_SRC_PARTITION_FIELDS));
|
||||
List<String> partitionFields = props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",",
|
||||
new ArrayList<>());
|
||||
PartitionValueExtractor extractor = DataSourceUtils.createPartitionExtractor(props.getString(
|
||||
Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS, Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS));
|
||||
**/
|
||||
String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH);
|
||||
int numInstantsPerFetch = props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH);
|
||||
boolean readLatestOnMissingCkpt = props.getBoolean(Config.READ_LATEST_INSTANT_ON_MISSING_CKPT,
|
||||
Config.DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT);
|
||||
|
||||
// Use begin Instant if set and non-empty
|
||||
Optional<String> beginInstant =
|
||||
lastCkptStr.isPresent() ? lastCkptStr.get().isEmpty() ? Optional.empty() : lastCkptStr : Optional.empty();
|
||||
|
||||
Pair<String, String> instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath,
|
||||
numInstantsPerFetch, beginInstant, readLatestOnMissingCkpt);
|
||||
|
||||
if (instantEndpts.getKey().equals(instantEndpts.getValue())) {
|
||||
log.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey());
|
||||
return Pair.of(Optional.empty(), instantEndpts.getKey());
|
||||
}
|
||||
|
||||
// Do Incr pull. Set end instant if available
|
||||
DataFrameReader reader = sparkSession.read().format("com.uber.hoodie")
|
||||
.option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(), DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
|
||||
.option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), instantEndpts.getLeft())
|
||||
.option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), instantEndpts.getRight());
|
||||
|
||||
Dataset<Row> source = reader.load(srcPath);
|
||||
|
||||
/**
|
||||
log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema());
|
||||
|
||||
StructType newSchema = new StructType(source.schema().fields());
|
||||
for (String field : partitionFields) {
|
||||
newSchema = newSchema.add(field, DataTypes.StringType, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if
|
||||
* configured
|
||||
*
|
||||
Dataset<Row> validated = source.map((MapFunction<Row, Row>) (Row row) -> {
|
||||
// _hoodie_instant_time
|
||||
String instantTime = row.getString(0);
|
||||
IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(), instantEndpts.getValue());
|
||||
if (!partitionFields.isEmpty()) {
|
||||
// _hoodie_partition_path
|
||||
String hoodiePartitionPath = row.getString(3);
|
||||
List<Object> partitionVals = extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream()
|
||||
.map(o -> (Object) o).collect(Collectors.toList());
|
||||
Preconditions.checkArgument(partitionVals.size() == partitionFields.size(),
|
||||
"#partition-fields != #partition-values-extracted");
|
||||
List<Object> rowObjs = new ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq()));
|
||||
rowObjs.addAll(partitionVals);
|
||||
return RowFactory.create(rowObjs.toArray());
|
||||
}
|
||||
return row;
|
||||
}, RowEncoder.apply(newSchema));
|
||||
|
||||
log.info("Validated Source Schema :" + validated.schema());
|
||||
**/
|
||||
|
||||
// Remove Hoodie meta columns except partition path from input source
|
||||
final Dataset<Row> src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream()
|
||||
.filter(x -> !x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new));
|
||||
//log.info("Final Schema from Source is :" + src.schema());
|
||||
return Pair.of(Optional.of(src), instantEndpts.getRight());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import java.util.Optional;
|
||||
|
||||
public class InputBatch<T> {
|
||||
|
||||
private final Optional<T> batch;
|
||||
private final String checkpointForNextBatch;
|
||||
private final SchemaProvider schemaProvider;
|
||||
|
||||
public InputBatch(Optional<T> batch, String checkpointForNextBatch,
|
||||
SchemaProvider schemaProvider) {
|
||||
this.batch = batch;
|
||||
this.checkpointForNextBatch = checkpointForNextBatch;
|
||||
this.schemaProvider = schemaProvider;
|
||||
}
|
||||
|
||||
public InputBatch(Optional<T> batch, String checkpointForNextBatch) {
|
||||
this.batch = batch;
|
||||
this.checkpointForNextBatch = checkpointForNextBatch;
|
||||
this.schemaProvider = null;
|
||||
}
|
||||
|
||||
public Optional<T> getBatch() {
|
||||
return batch;
|
||||
}
|
||||
|
||||
public String getCheckpointForNextBatch() {
|
||||
return checkpointForNextBatch;
|
||||
}
|
||||
|
||||
public SchemaProvider getSchemaProvider() {
|
||||
return schemaProvider;
|
||||
}
|
||||
}
|
||||
@@ -19,22 +19,38 @@
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.common.util.collection.Pair;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import com.uber.hoodie.utilities.sources.helpers.DFSPathSelector;
|
||||
import java.util.Optional;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
/**
|
||||
* DFS Source that reads json data
|
||||
*/
|
||||
public class JsonDFSSource extends DFSSource {
|
||||
public class JsonDFSSource extends JsonSource {
|
||||
|
||||
public JsonDFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, schemaProvider);
|
||||
private final DFSPathSelector pathSelector;
|
||||
|
||||
public JsonDFSSource(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, sparkSession, schemaProvider);
|
||||
this.pathSelector = new DFSPathSelector(props, sparkContext.hadoopConfiguration());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
|
||||
return sparkContext.textFile(pathStr).map(convertor::fromJson);
|
||||
protected InputBatch<JavaRDD<String>> fetchNewData(Optional<String> lastCkptStr,
|
||||
long sourceLimit) {
|
||||
Pair<Optional<String>, String> selPathsWithMaxModificationTime =
|
||||
pathSelector.getNextFilePathsAndMaxModificationTime(lastCkptStr, sourceLimit);
|
||||
return selPathsWithMaxModificationTime.getLeft().map(pathStr -> new InputBatch<>(
|
||||
Optional.of(fromFiles(pathStr)), selPathsWithMaxModificationTime.getRight()))
|
||||
.orElse(new InputBatch<>(Optional.empty(), selPathsWithMaxModificationTime.getRight()));
|
||||
}
|
||||
|
||||
private JavaRDD<String> fromFiles(String pathStr) {
|
||||
return sparkContext.textFile(pathStr);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,26 +20,49 @@ package com.uber.hoodie.utilities.sources;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen;
|
||||
import com.uber.hoodie.utilities.sources.helpers.KafkaOffsetGen.CheckpointUtils;
|
||||
import java.util.Optional;
|
||||
import kafka.serializer.StringDecoder;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.streaming.kafka.KafkaUtils;
|
||||
import org.apache.spark.streaming.kafka.OffsetRange;
|
||||
|
||||
/**
|
||||
* Read json kafka data
|
||||
*/
|
||||
public class JsonKafkaSource extends KafkaSource {
|
||||
public class JsonKafkaSource extends JsonSource {
|
||||
|
||||
public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
|
||||
super(properties, sparkContext, schemaProvider);
|
||||
private static Logger log = LogManager.getLogger(JsonKafkaSource.class);
|
||||
|
||||
private final KafkaOffsetGen offsetGen;
|
||||
|
||||
public JsonKafkaSource(TypedProperties properties, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(properties, sparkContext, sparkSession, schemaProvider);
|
||||
offsetGen = new KafkaOffsetGen(properties);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected JavaRDD<GenericRecord> toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor) {
|
||||
protected InputBatch<JavaRDD<String>> fetchNewData(Optional<String> lastCheckpointStr,
|
||||
long sourceLimit) {
|
||||
OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit);
|
||||
long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
|
||||
if (totalNewMsgs <= 0) {
|
||||
return new InputBatch<>(Optional.empty(),
|
||||
lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : "");
|
||||
}
|
||||
log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
|
||||
JavaRDD<String> newDataRDD = toRDD(offsetRanges);
|
||||
return new InputBatch<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges));
|
||||
}
|
||||
|
||||
private JavaRDD<String> toRDD(OffsetRange[] offsetRanges) {
|
||||
return KafkaUtils.createRDD(sparkContext, String.class, String.class, StringDecoder.class, StringDecoder.class,
|
||||
kafkaParams, offsetRanges)
|
||||
.values().map(avroConvertor::fromJson);
|
||||
offsetGen.getKafkaParams(), offsetRanges).values();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
public abstract class JsonSource extends Source<JavaRDD<String>> {
|
||||
|
||||
public JsonSource(TypedProperties props,
|
||||
JavaSparkContext sparkContext,
|
||||
SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, sparkSession, schemaProvider, SourceType.JSON);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,51 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.common.util.collection.Pair;
|
||||
import com.uber.hoodie.utilities.schema.RowBasedSchemaProvider;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import java.util.Optional;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
public abstract class RowSource extends Source<Dataset<Row>> {
|
||||
|
||||
public RowSource(TypedProperties props,
|
||||
JavaSparkContext sparkContext,
|
||||
SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, sparkSession, schemaProvider, SourceType.ROW);
|
||||
}
|
||||
|
||||
protected abstract Pair<Optional<Dataset<Row>>, String> fetchNextBatch(Optional<String> lastCkptStr,
|
||||
long sourceLimit);
|
||||
|
||||
@Override
|
||||
protected final InputBatch<Dataset<Row>> fetchNewData(Optional<String> lastCkptStr, long sourceLimit) {
|
||||
Pair<Optional<Dataset<Row>>, String> res = fetchNextBatch(lastCkptStr, sourceLimit);
|
||||
return res.getKey().map(dsr -> {
|
||||
SchemaProvider rowSchemaProvider = new RowBasedSchemaProvider(dsr.schema());
|
||||
return new InputBatch<>(res.getKey(), res.getValue(), rowSchemaProvider);
|
||||
}).orElseGet(() -> new InputBatch<>(res.getKey(), res.getValue()));
|
||||
}
|
||||
}
|
||||
@@ -19,36 +19,67 @@
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.common.util.collection.Pair;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
/**
|
||||
* Represents a source from which we can tail data. Assumes a constructor that takes properties.
|
||||
*/
|
||||
public abstract class Source implements Serializable {
|
||||
public abstract class Source<T> implements Serializable {
|
||||
protected static volatile Logger log = LogManager.getLogger(Source.class);
|
||||
|
||||
protected transient TypedProperties props;
|
||||
|
||||
protected transient JavaSparkContext sparkContext;
|
||||
|
||||
protected transient SchemaProvider schemaProvider;
|
||||
|
||||
|
||||
protected Source(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
|
||||
this.props = props;
|
||||
this.sparkContext = sparkContext;
|
||||
this.schemaProvider = schemaProvider;
|
||||
public enum SourceType {
|
||||
JSON,
|
||||
AVRO,
|
||||
ROW
|
||||
}
|
||||
|
||||
protected transient TypedProperties props;
|
||||
protected transient JavaSparkContext sparkContext;
|
||||
protected transient SparkSession sparkSession;
|
||||
private transient SchemaProvider overriddenSchemaProvider;
|
||||
|
||||
private final SourceType sourceType;
|
||||
|
||||
protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider) {
|
||||
this(props, sparkContext, sparkSession, schemaProvider, SourceType.AVRO);
|
||||
}
|
||||
|
||||
protected Source(TypedProperties props, JavaSparkContext sparkContext, SparkSession sparkSession,
|
||||
SchemaProvider schemaProvider, SourceType sourceType) {
|
||||
this.props = props;
|
||||
this.sparkContext = sparkContext;
|
||||
this.sparkSession = sparkSession;
|
||||
this.overriddenSchemaProvider = schemaProvider;
|
||||
this.sourceType = sourceType;
|
||||
}
|
||||
|
||||
protected abstract InputBatch<T> fetchNewData(Optional<String> lastCkptStr, long sourceLimit);
|
||||
|
||||
/**
|
||||
* Fetches new data upto sourceLimit, from the provided checkpoint and returns an RDD of the
|
||||
* data, as well as the checkpoint to be written as a result of that.
|
||||
* Main API called by Hoodie Delta Streamer to fetch records
|
||||
* @param lastCkptStr Last Checkpoint
|
||||
* @param sourceLimit Source Limit
|
||||
* @return
|
||||
*/
|
||||
public abstract Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
|
||||
Optional<String> lastCheckpointStr, long sourceLimit);
|
||||
public final InputBatch<T> fetchNext(Optional<String> lastCkptStr, long sourceLimit) {
|
||||
InputBatch<T> batch = fetchNewData(lastCkptStr, sourceLimit);
|
||||
// If overriddenSchemaProvider is passed in CLI, use it
|
||||
return overriddenSchemaProvider == null ? batch : new InputBatch<>(batch.getBatch(),
|
||||
batch.getCheckpointForNextBatch(), overriddenSchemaProvider);
|
||||
}
|
||||
|
||||
public SourceType getSourceType() {
|
||||
return sourceType;
|
||||
}
|
||||
|
||||
public SparkSession getSparkSession() {
|
||||
return sparkSession;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
package com.uber.hoodie.utilities.sources.helpers;
|
||||
|
||||
import com.twitter.bijection.Injection;
|
||||
import com.twitter.bijection.avro.GenericAvroCodecs;
|
||||
@@ -55,6 +55,10 @@ public class AvroConvertor implements Serializable {
|
||||
this.schemaStr = schemaStr;
|
||||
}
|
||||
|
||||
public AvroConvertor(Schema schema) {
|
||||
this.schemaStr = schema.toString();
|
||||
this.schema = schema;
|
||||
}
|
||||
|
||||
private void initSchema() {
|
||||
if (schema == null) {
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@@ -16,7 +16,7 @@
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
package com.uber.hoodie.utilities.sources.helpers;
|
||||
|
||||
import com.uber.hoodie.DataSourceUtils;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
@@ -24,45 +24,38 @@ import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.common.util.collection.ImmutablePair;
|
||||
import com.uber.hoodie.common.util.collection.Pair;
|
||||
import com.uber.hoodie.exception.HoodieIOException;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.RemoteIterator;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
/**
|
||||
* Source to read data from a given DFS directory structure, incrementally
|
||||
*/
|
||||
public abstract class DFSSource extends Source {
|
||||
public class DFSPathSelector {
|
||||
|
||||
/**
|
||||
* Configs supported
|
||||
*/
|
||||
static class Config {
|
||||
|
||||
private static final String ROOT_INPUT_PATH_PROP = "hoodie.deltastreamer.source.dfs.root";
|
||||
}
|
||||
|
||||
private static final List<String> IGNORE_FILEPREFIX_LIST = Arrays.asList(".", "_");
|
||||
|
||||
private final transient FileSystem fs;
|
||||
private final TypedProperties props;
|
||||
|
||||
public DFSSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, schemaProvider);
|
||||
DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.ROOT_INPUT_PATH_PROP));
|
||||
this.fs = FSUtils.getFs(props.getString(Config.ROOT_INPUT_PATH_PROP), sparkContext.hadoopConfiguration());
|
||||
public DFSPathSelector(TypedProperties props, Configuration hadoopConf) {
|
||||
DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.ROOT_INPUT_PATH_PROP));
|
||||
this.props = props;
|
||||
this.fs = FSUtils.getFs(props.getString(Config.ROOT_INPUT_PATH_PROP), hadoopConf);
|
||||
}
|
||||
|
||||
protected abstract JavaRDD<GenericRecord> fromFiles(final AvroConvertor convertor, String pathStr);
|
||||
|
||||
@Override
|
||||
public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
|
||||
public Pair<Optional<String>, String> getNextFilePathsAndMaxModificationTime(
|
||||
Optional<String> lastCheckpointStr, long sourceLimit) {
|
||||
|
||||
try {
|
||||
@@ -111,11 +104,9 @@ public abstract class DFSSource extends Source {
|
||||
// read the files out.
|
||||
String pathStr = filteredFiles.stream().map(f -> f.getPath().toString())
|
||||
.collect(Collectors.joining(","));
|
||||
String schemaStr = schemaProvider.getSourceSchema().toString();
|
||||
final AvroConvertor avroConvertor = new AvroConvertor(schemaStr);
|
||||
|
||||
return new ImmutablePair<>(
|
||||
Optional.of(fromFiles(avroConvertor, pathStr)),
|
||||
Optional.ofNullable(pathStr),
|
||||
String.valueOf(maxModificationTime));
|
||||
} catch (IOException ioe) {
|
||||
throw new HoodieIOException(
|
||||
@@ -0,0 +1,88 @@
|
||||
package com.uber.hoodie.utilities.sources.helpers;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.collection.Pair;
|
||||
import java.util.Optional;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Row;
|
||||
|
||||
|
||||
/**
|
||||
* Helper for Hudi Incremental Source. Has APIs to
|
||||
* (a) calculate begin and end instant time for incrementally pulling from Hudi source
|
||||
* (b) Find max seen instant to be set as checkpoint for next fetch.
|
||||
*/
|
||||
public class IncrSourceHelper {
|
||||
|
||||
/**
|
||||
* Get a timestamp which is the next value in a descending sequence
|
||||
*
|
||||
* @param timestamp Timestamp
|
||||
*/
|
||||
private static String getStrictlyLowerTimestamp(String timestamp) {
|
||||
long ts = Long.parseLong(timestamp);
|
||||
Preconditions.checkArgument(ts > 0, "Timestamp must be positive");
|
||||
Long lower = ts - 1;
|
||||
return "" + lower;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find begin and end instants to be set for the next fetch
|
||||
*
|
||||
* @param jssc Java Spark Context
|
||||
* @param srcBasePath Base path of Hudi source table
|
||||
* @param numInstantsPerFetch Max Instants per fetch
|
||||
* @param beginInstant Last Checkpoint String
|
||||
* @param readLatestOnMissingBeginInstant when begin instant is missing, allow reading from latest committed instant
|
||||
* @return begin and end instants
|
||||
*/
|
||||
public static Pair<String, String> calculateBeginAndEndInstants(
|
||||
JavaSparkContext jssc, String srcBasePath, int numInstantsPerFetch, Optional<String> beginInstant,
|
||||
boolean readLatestOnMissingBeginInstant) {
|
||||
Preconditions.checkArgument(numInstantsPerFetch > 0, "Make sure the config"
|
||||
+ " hoodie.deltastreamer.source.hoodieincr.num_instants is set to a positive value");
|
||||
HoodieTableMetaClient srcMetaClient = new HoodieTableMetaClient(jssc.hadoopConfiguration(),
|
||||
srcBasePath, true);
|
||||
|
||||
final HoodieTimeline activeCommitTimeline =
|
||||
srcMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
|
||||
|
||||
String beginInstantTime = beginInstant.orElseGet(() -> {
|
||||
if (readLatestOnMissingBeginInstant) {
|
||||
Optional<HoodieInstant> lastInstant = activeCommitTimeline.lastInstant();
|
||||
return lastInstant.map(hoodieInstant -> getStrictlyLowerTimestamp(hoodieInstant.getTimestamp())).orElse("000");
|
||||
} else {
|
||||
throw new IllegalArgumentException("Missing begin instant for incremental pull. For reading from latest "
|
||||
+ "committed instant set hoodie.deltastreamer.source.hoodie.read_latest_on_midding_ckpt to true");
|
||||
}
|
||||
});
|
||||
|
||||
Optional<HoodieInstant> nthInstant =
|
||||
activeCommitTimeline.findInstantsAfter(beginInstantTime, numInstantsPerFetch).getInstants().reduce((x, y) -> y);
|
||||
return Pair.of(beginInstantTime, nthInstant.map(instant -> instant.getTimestamp()).orElse(beginInstantTime));
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate instant time seen in the incoming row
|
||||
*
|
||||
* @param row Input Row
|
||||
* @param instantTime Hoodie Instant time of the row
|
||||
* @param sinceInstant begin instant of the batch
|
||||
* @param endInstant end instant of the batch
|
||||
*/
|
||||
public static void validateInstantTime(Row row, String instantTime, String sinceInstant, String endInstant) {
|
||||
Preconditions.checkNotNull(instantTime);
|
||||
Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime,
|
||||
sinceInstant, HoodieTimeline.GREATER),
|
||||
"Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime
|
||||
+ "but expected to be between " + sinceInstant + "(excl) - "
|
||||
+ endInstant + "(incl)");
|
||||
Preconditions.checkArgument(HoodieTimeline.compareTimestamps(instantTime,
|
||||
endInstant, HoodieTimeline.LESSER_OR_EQUAL),
|
||||
"Instant time(_hoodie_commit_time) in row (" + row + ") was : " + instantTime
|
||||
+ "but expected to be between " + sinceInstant + "(excl) - " + endInstant + "(incl)");
|
||||
}
|
||||
}
|
||||
@@ -16,24 +16,22 @@
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.sources;
|
||||
package com.uber.hoodie.utilities.sources.helpers;
|
||||
|
||||
import com.uber.hoodie.DataSourceUtils;
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import com.uber.hoodie.common.util.collection.ImmutablePair;
|
||||
import com.uber.hoodie.common.util.collection.Pair;
|
||||
import com.uber.hoodie.exception.HoodieNotSupportedException;
|
||||
import com.uber.hoodie.utilities.exception.HoodieDeltaStreamerException;
|
||||
import com.uber.hoodie.utilities.schema.SchemaProvider;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import kafka.common.TopicAndPartition;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.streaming.kafka.KafkaCluster;
|
||||
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset;
|
||||
import org.apache.spark.streaming.kafka.OffsetRange;
|
||||
@@ -49,14 +47,13 @@ import scala.util.Either;
|
||||
/**
|
||||
* Source to read data from Kafka, incrementally
|
||||
*/
|
||||
public abstract class KafkaSource extends Source {
|
||||
public class KafkaOffsetGen {
|
||||
|
||||
private static volatile Logger log = LogManager.getLogger(KafkaSource.class);
|
||||
private static volatile Logger log = LogManager.getLogger(KafkaOffsetGen.class);
|
||||
|
||||
private static long DEFAULT_MAX_EVENTS_TO_READ = 1000000; // 1M events max
|
||||
|
||||
|
||||
static class CheckpointUtils {
|
||||
public static class CheckpointUtils {
|
||||
|
||||
/**
|
||||
* Reconstruct checkpoint from string.
|
||||
@@ -90,7 +87,6 @@ public abstract class KafkaSource extends Source {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Compute the offset ranges to read from Kafka, while handling newly added partitions, skews, event limits.
|
||||
*
|
||||
@@ -174,19 +170,18 @@ public abstract class KafkaSource extends Source {
|
||||
* Configs to be passed for this source. All standard Kafka consumer configs are also respected
|
||||
*/
|
||||
static class Config {
|
||||
|
||||
private static final String KAFKA_TOPIC_NAME = "hoodie.deltastreamer.source.kafka.topic";
|
||||
private static final KafkaResetOffsetStrategies DEFAULT_AUTO_RESET_OFFSET = KafkaResetOffsetStrategies.LARGEST;
|
||||
}
|
||||
|
||||
|
||||
protected HashMap<String, String> kafkaParams;
|
||||
|
||||
private final HashMap<String, String> kafkaParams;
|
||||
private final TypedProperties props;
|
||||
protected final String topicName;
|
||||
|
||||
public KafkaSource(TypedProperties props, JavaSparkContext sparkContext, SchemaProvider schemaProvider) {
|
||||
super(props, sparkContext, schemaProvider);
|
||||
|
||||
kafkaParams = new HashMap<>();
|
||||
public KafkaOffsetGen(TypedProperties props) {
|
||||
this.props = props;
|
||||
kafkaParams = new HashMap<String, String>();
|
||||
for (Object prop : props.keySet()) {
|
||||
kafkaParams.put(prop.toString(), props.getString(prop.toString()));
|
||||
}
|
||||
@@ -194,11 +189,7 @@ public abstract class KafkaSource extends Source {
|
||||
topicName = props.getString(Config.KAFKA_TOPIC_NAME);
|
||||
}
|
||||
|
||||
protected abstract JavaRDD<GenericRecord> toAvroRDD(OffsetRange[] offsetRanges, AvroConvertor avroConvertor);
|
||||
|
||||
@Override
|
||||
public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
|
||||
Optional<String> lastCheckpointStr, long sourceLimit) {
|
||||
public OffsetRange[] getNextOffsetRanges(Optional<String> lastCheckpointStr, long sourceLimit) {
|
||||
|
||||
// Obtain current metadata for the topic
|
||||
KafkaCluster cluster = new KafkaCluster(ScalaHelpers.toScalaMap(kafkaParams));
|
||||
@@ -240,16 +231,15 @@ public abstract class KafkaSource extends Source {
|
||||
// Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
|
||||
long numEvents = Math.min(DEFAULT_MAX_EVENTS_TO_READ, sourceLimit);
|
||||
OffsetRange[] offsetRanges = CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents);
|
||||
long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
|
||||
if (totalNewMsgs <= 0) {
|
||||
return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.orElse(""));
|
||||
} else {
|
||||
log.info("About to read " + totalNewMsgs + " from Kafka for topic :" + topicName);
|
||||
}
|
||||
|
||||
// Produce a RDD[GenericRecord]
|
||||
final AvroConvertor avroConvertor = new AvroConvertor(schemaProvider.getSourceSchema().toString());
|
||||
JavaRDD<GenericRecord> newDataRDD = toAvroRDD(offsetRanges, avroConvertor);
|
||||
return new ImmutablePair<>(Optional.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges));
|
||||
return offsetRanges;
|
||||
}
|
||||
|
||||
public String getTopicName() {
|
||||
return topicName;
|
||||
}
|
||||
|
||||
public HashMap<String, String> getKafkaParams() {
|
||||
return kafkaParams;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.transform;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
/**
|
||||
* Identity transformer
|
||||
*/
|
||||
public class IdentityTransformer implements Transformer {
|
||||
|
||||
@Override
|
||||
public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession,
|
||||
Dataset<Row> rowDataset, TypedProperties properties) {
|
||||
return rowDataset;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.transform;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import java.util.UUID;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
/**
|
||||
* A transformer that allows a sql-query template be used to transform the source before writing to Hudi data-set.
|
||||
*
|
||||
* The query should reference the source as a table named "\<SRC\>"
|
||||
*/
|
||||
public class SqlQueryBasedTransformer implements Transformer {
|
||||
|
||||
private static volatile Logger log = LogManager.getLogger(SqlQueryBasedTransformer.class);
|
||||
|
||||
private static final String SRC_PATTERN = "<SRC>";
|
||||
private static final String TMP_TABLE = "HOODIE_SRC_TMP_TABLE_";
|
||||
|
||||
/**
|
||||
* Configs supported
|
||||
*/
|
||||
static class Config {
|
||||
|
||||
private static final String TRANSFORMER_SQL = "hoodie.deltastreamer.transformer.sql";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession,
|
||||
Dataset<Row> rowDataset, TypedProperties properties) {
|
||||
String transformerSQL = properties.getString(Config.TRANSFORMER_SQL);
|
||||
if (null == transformerSQL) {
|
||||
throw new IllegalArgumentException("Missing configuration : (" + Config.TRANSFORMER_SQL + ")");
|
||||
}
|
||||
|
||||
// tmp table name doesn't like dashes
|
||||
String tmpTable = TMP_TABLE.concat(UUID.randomUUID().toString().replace("-", "_"));
|
||||
log.info("Registering tmp table : " + tmpTable);
|
||||
rowDataset.registerTempTable(tmpTable);
|
||||
String sqlStr = transformerSQL.replaceAll(SRC_PATTERN, tmpTable);
|
||||
log.info("SQL Query for transformation : (" + sqlStr + ")");
|
||||
return sparkSession.sql(sqlStr);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Copyright (c) 2018 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
package com.uber.hoodie.utilities.transform;
|
||||
|
||||
import com.uber.hoodie.common.util.TypedProperties;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
/**
|
||||
* Transform source to target dataset before writing
|
||||
*/
|
||||
public interface Transformer {
|
||||
|
||||
/**
|
||||
* Transform source RDD to target RDD
|
||||
*
|
||||
* @param jsc JavaSparkContext
|
||||
* @param rowDataset Source DataSet
|
||||
* @param sparkSession Spark Session
|
||||
* @param properties Config properties
|
||||
* @return Transformed Dataset
|
||||
*/
|
||||
Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession,
|
||||
Dataset<Row> rowDataset, TypedProperties properties);
|
||||
}
|
||||
Reference in New Issue
Block a user