[HUDI-759] Integrate checkpoint provider with delta streamer (#1486)
This commit is contained in:
@@ -32,6 +32,7 @@ import org.apache.hudi.config.HoodieWriteConfig;
|
|||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
import org.apache.hudi.index.HoodieIndex;
|
import org.apache.hudi.index.HoodieIndex;
|
||||||
|
import org.apache.hudi.utilities.checkpointing.InitialCheckPointProvider;
|
||||||
import org.apache.hudi.utilities.schema.SchemaProvider;
|
import org.apache.hudi.utilities.schema.SchemaProvider;
|
||||||
import org.apache.hudi.utilities.sources.Source;
|
import org.apache.hudi.utilities.sources.Source;
|
||||||
import org.apache.hudi.utilities.transform.ChainedTransformer;
|
import org.apache.hudi.utilities.transform.ChainedTransformer;
|
||||||
@@ -85,7 +86,7 @@ public class UtilHelpers {
|
|||||||
private static final Logger LOG = LogManager.getLogger(UtilHelpers.class);
|
private static final Logger LOG = LogManager.getLogger(UtilHelpers.class);
|
||||||
|
|
||||||
public static Source createSource(String sourceClass, TypedProperties cfg, JavaSparkContext jssc,
|
public static Source createSource(String sourceClass, TypedProperties cfg, JavaSparkContext jssc,
|
||||||
SparkSession sparkSession, SchemaProvider schemaProvider) throws IOException {
|
SparkSession sparkSession, SchemaProvider schemaProvider) throws IOException {
|
||||||
try {
|
try {
|
||||||
return (Source) ReflectionUtils.loadClass(sourceClass,
|
return (Source) ReflectionUtils.loadClass(sourceClass,
|
||||||
new Class<?>[] {TypedProperties.class, JavaSparkContext.class, SparkSession.class, SchemaProvider.class}, cfg,
|
new Class<?>[] {TypedProperties.class, JavaSparkContext.class, SparkSession.class, SchemaProvider.class}, cfg,
|
||||||
@@ -96,7 +97,7 @@ public class UtilHelpers {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg,
|
public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg,
|
||||||
JavaSparkContext jssc) throws IOException {
|
JavaSparkContext jssc) throws IOException {
|
||||||
try {
|
try {
|
||||||
return schemaProviderClass == null ? null
|
return schemaProviderClass == null ? null
|
||||||
: (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc);
|
: (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc);
|
||||||
@@ -117,7 +118,17 @@ public class UtilHelpers {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static InitialCheckPointProvider createInitialCheckpointProvider(
|
||||||
|
String className, TypedProperties props) throws IOException {
|
||||||
|
try {
|
||||||
|
return (InitialCheckPointProvider) ReflectionUtils.loadClass(className, new Class<?>[] {TypedProperties.class}, props);
|
||||||
|
} catch (Throwable e) {
|
||||||
|
throw new IOException("Could not load initial checkpoint provider class " + className, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath, List<String> overriddenProps) {
|
public static DFSPropertiesConfiguration readConfig(FileSystem fs, Path cfgPath, List<String> overriddenProps) {
|
||||||
DFSPropertiesConfiguration conf;
|
DFSPropertiesConfiguration conf;
|
||||||
@@ -157,7 +168,7 @@ public class UtilHelpers {
|
|||||||
/**
|
/**
|
||||||
* Parse Schema from file.
|
* Parse Schema from file.
|
||||||
*
|
*
|
||||||
* @param fs File System
|
* @param fs File System
|
||||||
* @param schemaFile Schema File
|
* @param schemaFile Schema File
|
||||||
*/
|
*/
|
||||||
public static String parseSchema(FileSystem fs, String schemaFile) throws Exception {
|
public static String parseSchema(FileSystem fs, String schemaFile) throws Exception {
|
||||||
@@ -219,13 +230,13 @@ public class UtilHelpers {
|
|||||||
/**
|
/**
|
||||||
* Build Hoodie write client.
|
* Build Hoodie write client.
|
||||||
*
|
*
|
||||||
* @param jsc Java Spark Context
|
* @param jsc Java Spark Context
|
||||||
* @param basePath Base Path
|
* @param basePath Base Path
|
||||||
* @param schemaStr Schema
|
* @param schemaStr Schema
|
||||||
* @param parallelism Parallelism
|
* @param parallelism Parallelism
|
||||||
*/
|
*/
|
||||||
public static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, String schemaStr,
|
public static HoodieWriteClient createHoodieClient(JavaSparkContext jsc, String basePath, String schemaStr,
|
||||||
int parallelism, Option<String> compactionStrategyClass, TypedProperties properties) {
|
int parallelism, Option<String> compactionStrategyClass, TypedProperties properties) {
|
||||||
HoodieCompactionConfig compactionConfig = compactionStrategyClass
|
HoodieCompactionConfig compactionConfig = compactionStrategyClass
|
||||||
.map(strategy -> HoodieCompactionConfig.newBuilder().withInlineCompaction(false)
|
.map(strategy -> HoodieCompactionConfig.newBuilder().withInlineCompaction(false)
|
||||||
.withCompactionStrategy(ReflectionUtils.loadClass(strategy)).build())
|
.withCompactionStrategy(ReflectionUtils.loadClass(strategy)).build())
|
||||||
@@ -264,6 +275,7 @@ public class UtilHelpers {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a factory for creating connections to the given JDBC URL.
|
* Returns a factory for creating connections to the given JDBC URL.
|
||||||
|
*
|
||||||
* @param options - JDBC options that contains url, table and other information.
|
* @param options - JDBC options that contains url, table and other information.
|
||||||
* @return
|
* @return
|
||||||
* @throws SQLException if the driver could not open a JDBC connection.
|
* @throws SQLException if the driver could not open a JDBC connection.
|
||||||
@@ -323,7 +335,7 @@ public class UtilHelpers {
|
|||||||
Connection conn = createConnectionFactory(options);
|
Connection conn = createConnectionFactory(options);
|
||||||
String url = options.get(JDBCOptions.JDBC_URL());
|
String url = options.get(JDBCOptions.JDBC_URL());
|
||||||
String table = options.get(JDBCOptions.JDBC_TABLE_NAME());
|
String table = options.get(JDBCOptions.JDBC_TABLE_NAME());
|
||||||
boolean tableExists = tableExists(conn,options);
|
boolean tableExists = tableExists(conn, options);
|
||||||
|
|
||||||
if (tableExists) {
|
if (tableExists) {
|
||||||
JdbcDialect dialect = JdbcDialects.get(url);
|
JdbcDialect dialect = JdbcDialects.get(url);
|
||||||
|
|||||||
@@ -18,14 +18,43 @@
|
|||||||
|
|
||||||
package org.apache.hudi.utilities.checkpointing;
|
package org.apache.hudi.utilities.checkpointing;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.config.TypedProperties;
|
||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provide the initial checkpoint for delta streamer.
|
* Provide the initial checkpoint for delta streamer.
|
||||||
*/
|
*/
|
||||||
public interface InitialCheckPointProvider {
|
public abstract class InitialCheckPointProvider {
|
||||||
|
protected transient Path path;
|
||||||
|
protected transient FileSystem fs;
|
||||||
|
protected transient TypedProperties props;
|
||||||
|
|
||||||
|
static class Config {
|
||||||
|
private static final String CHECKPOINT_PROVIDER_PATH_PROP = "hoodie.deltastreamer.checkpoint.provider.path";
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Construct InitialCheckPointProvider.
|
||||||
|
* @param props All properties passed to Delta Streamer
|
||||||
|
*/
|
||||||
|
public InitialCheckPointProvider(TypedProperties props) {
|
||||||
|
this.props = props;
|
||||||
|
this.path = new Path(props.getString(Config.CHECKPOINT_PROVIDER_PATH_PROP));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize the class with the current filesystem.
|
||||||
|
*
|
||||||
|
* @param config Hadoop configuration
|
||||||
|
*/
|
||||||
|
public abstract void init(Configuration config) throws HoodieException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get checkpoint string recognizable for delta streamer.
|
* Get checkpoint string recognizable for delta streamer.
|
||||||
*/
|
*/
|
||||||
String getCheckpoint() throws HoodieException;
|
public abstract String getCheckpoint() throws HoodieException;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,8 +18,10 @@
|
|||||||
|
|
||||||
package org.apache.hudi.utilities.checkpointing;
|
package org.apache.hudi.utilities.checkpointing;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.config.TypedProperties;
|
||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
@@ -35,15 +37,20 @@ import java.util.regex.Pattern;
|
|||||||
* Generate checkpoint from Kafka-Connect-HDFS managed data set.
|
* Generate checkpoint from Kafka-Connect-HDFS managed data set.
|
||||||
* Documentation: https://docs.confluent.io/current/connect/kafka-connect-hdfs/index.html
|
* Documentation: https://docs.confluent.io/current/connect/kafka-connect-hdfs/index.html
|
||||||
*/
|
*/
|
||||||
public class KafkaConnectHdfsProvider implements InitialCheckPointProvider {
|
public class KafkaConnectHdfsProvider extends InitialCheckPointProvider {
|
||||||
private final Path path;
|
private static final String FILENAME_SEPARATOR = "[\\+\\.]"; // regex: splits file names on '+' or '.'
|
||||||
private final FileSystem fs;
|
|
||||||
|
|
||||||
private static final String FILENAME_SEPARATOR = "[\\+\\.]";
|
public KafkaConnectHdfsProvider(TypedProperties props) {
|
||||||
|
super(props);
|
||||||
|
}
|
||||||
|
|
||||||
public KafkaConnectHdfsProvider(final Path basePath, final FileSystem fileSystem) {
|
@Override
|
||||||
this.path = basePath;
|
public void init(Configuration config) throws HoodieException {
|
||||||
this.fs = fileSystem;
|
try {
|
||||||
|
this.fs = FileSystem.get(config);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new HoodieException("KafkaConnectHdfsProvider initialization failed", e); // keep the IOException as cause
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -72,7 +79,8 @@ public class KafkaConnectHdfsProvider implements InitialCheckPointProvider {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Convert map contains max offset of each partition to string.
|
* Convert map contains max offset of each partition to string.
|
||||||
* @param topic Topic name
|
*
|
||||||
|
* @param topic Topic name
|
||||||
* @param checkpoint Map with partition as key and max offset as value
|
* @param checkpoint Map with partition as key and max offset as value
|
||||||
* @return Checkpoint string
|
* @return Checkpoint string
|
||||||
*/
|
*/
|
||||||
@@ -88,8 +96,9 @@ public class KafkaConnectHdfsProvider implements InitialCheckPointProvider {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* List file status recursively.
|
* List file status recursively.
|
||||||
|
*
|
||||||
* @param curPath Current Path
|
* @param curPath Current Path
|
||||||
* @param filter PathFilter
|
* @param filter PathFilter
|
||||||
* @return All file status match kafka connect naming convention
|
* @return All file status match kafka connect naming convention
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -130,7 +130,7 @@ public class DeltaSync implements Serializable {
|
|||||||
/**
|
/**
|
||||||
* Hive Config.
|
* Hive Config.
|
||||||
*/
|
*/
|
||||||
private transient HiveConf hiveConf;
|
private transient Configuration conf;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Bag of properties with source, hoodie client, key generator etc.
|
* Bag of properties with source, hoodie client, key generator etc.
|
||||||
@@ -153,7 +153,7 @@ public class DeltaSync implements Serializable {
|
|||||||
private transient HoodieWriteClient writeClient;
|
private transient HoodieWriteClient writeClient;
|
||||||
|
|
||||||
public DeltaSync(HoodieDeltaStreamer.Config cfg, SparkSession sparkSession, SchemaProvider schemaProvider,
|
public DeltaSync(HoodieDeltaStreamer.Config cfg, SparkSession sparkSession, SchemaProvider schemaProvider,
|
||||||
TypedProperties props, JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf,
|
TypedProperties props, JavaSparkContext jssc, FileSystem fs, Configuration conf,
|
||||||
Function<HoodieWriteClient, Boolean> onInitializingHoodieWriteClient) throws IOException {
|
Function<HoodieWriteClient, Boolean> onInitializingHoodieWriteClient) throws IOException {
|
||||||
|
|
||||||
this.cfg = cfg;
|
this.cfg = cfg;
|
||||||
@@ -172,7 +172,7 @@ public class DeltaSync implements Serializable {
|
|||||||
this.formatAdapter = new SourceFormatAdapter(
|
this.formatAdapter = new SourceFormatAdapter(
|
||||||
UtilHelpers.createSource(cfg.sourceClassName, props, jssc, sparkSession, schemaProvider));
|
UtilHelpers.createSource(cfg.sourceClassName, props, jssc, sparkSession, schemaProvider));
|
||||||
|
|
||||||
this.hiveConf = hiveConf;
|
this.conf = conf;
|
||||||
|
|
||||||
// If schemaRegistry already resolved, setup write-client
|
// If schemaRegistry already resolved, setup write-client
|
||||||
setupWriteClient();
|
setupWriteClient();
|
||||||
@@ -449,8 +449,7 @@ public class DeltaSync implements Serializable {
|
|||||||
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath);
|
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath);
|
||||||
LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :"
|
LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :"
|
||||||
+ hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath);
|
+ hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath);
|
||||||
|
new HiveSyncTool(hiveSyncConfig, new HiveConf(conf, HiveConf.class), fs).syncHoodieTable();
|
||||||
new HiveSyncTool(hiveSyncConfig, hiveConf, fs).syncHoodieTable();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -35,6 +35,7 @@ import org.apache.hudi.exception.HoodieException;
|
|||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
import org.apache.hudi.utilities.HiveIncrementalPuller;
|
import org.apache.hudi.utilities.HiveIncrementalPuller;
|
||||||
import org.apache.hudi.utilities.UtilHelpers;
|
import org.apache.hudi.utilities.UtilHelpers;
|
||||||
|
import org.apache.hudi.utilities.checkpointing.InitialCheckPointProvider;
|
||||||
import org.apache.hudi.utilities.schema.SchemaProvider;
|
import org.apache.hudi.utilities.schema.SchemaProvider;
|
||||||
import org.apache.hudi.utilities.sources.JsonDFSSource;
|
import org.apache.hudi.utilities.sources.JsonDFSSource;
|
||||||
|
|
||||||
@@ -45,7 +46,6 @@ import com.beust.jcommander.ParameterException;
|
|||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.hive.conf.HiveConf;
|
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
@@ -90,35 +90,34 @@ public class HoodieDeltaStreamer implements Serializable {
|
|||||||
|
|
||||||
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc) throws IOException {
|
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc) throws IOException {
|
||||||
this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()),
|
this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()),
|
||||||
getDefaultHiveConf(jssc.hadoopConfiguration()));
|
jssc.hadoopConfiguration(), null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, TypedProperties props) throws IOException {
|
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, TypedProperties props) throws IOException {
|
||||||
this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()),
|
this(cfg, jssc, FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()),
|
||||||
getDefaultHiveConf(jssc.hadoopConfiguration()), props);
|
jssc.hadoopConfiguration(), props);
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf,
|
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, Configuration conf) throws IOException {
|
||||||
|
this(cfg, jssc, fs, conf, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, Configuration conf,
|
||||||
TypedProperties properties) throws IOException {
|
TypedProperties properties) throws IOException {
|
||||||
|
if (cfg.initialCheckpointProvider != null && cfg.checkpoint == null) {
|
||||||
|
InitialCheckPointProvider checkPointProvider =
|
||||||
|
UtilHelpers.createInitialCheckpointProvider(cfg.initialCheckpointProvider, properties);
|
||||||
|
checkPointProvider.init(conf);
|
||||||
|
cfg.checkpoint = checkPointProvider.getCheckpoint();
|
||||||
|
}
|
||||||
this.cfg = cfg;
|
this.cfg = cfg;
|
||||||
this.deltaSyncService = new DeltaSyncService(cfg, jssc, fs, hiveConf, properties);
|
this.deltaSyncService = new DeltaSyncService(cfg, jssc, fs, conf, properties);
|
||||||
}
|
|
||||||
|
|
||||||
public HoodieDeltaStreamer(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf) throws IOException {
|
|
||||||
this.cfg = cfg;
|
|
||||||
this.deltaSyncService = new DeltaSyncService(cfg, jssc, fs, hiveConf);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void shutdownGracefully() {
|
public void shutdownGracefully() {
|
||||||
deltaSyncService.shutdown(false);
|
deltaSyncService.shutdown(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static HiveConf getDefaultHiveConf(Configuration cfg) {
|
|
||||||
HiveConf hiveConf = new HiveConf();
|
|
||||||
hiveConf.addResource(cfg);
|
|
||||||
return hiveConf;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Main method to start syncing.
|
* Main method to start syncing.
|
||||||
*
|
*
|
||||||
@@ -143,6 +142,10 @@ public class HoodieDeltaStreamer implements Serializable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Config getConfig() {
|
||||||
|
return cfg;
|
||||||
|
}
|
||||||
|
|
||||||
private boolean onDeltaSyncShutdown(boolean error) {
|
private boolean onDeltaSyncShutdown(boolean error) {
|
||||||
LOG.info("DeltaSync shutdown. Closing write client. Error?" + error);
|
LOG.info("DeltaSync shutdown. Closing write client. Error?" + error);
|
||||||
deltaSyncService.close();
|
deltaSyncService.close();
|
||||||
@@ -293,6 +296,12 @@ public class HoodieDeltaStreamer implements Serializable {
|
|||||||
@Parameter(names = {"--checkpoint"}, description = "Resume Delta Streamer from this checkpoint.")
|
@Parameter(names = {"--checkpoint"}, description = "Resume Delta Streamer from this checkpoint.")
|
||||||
public String checkpoint = null;
|
public String checkpoint = null;
|
||||||
|
|
||||||
|
@Parameter(names = {"--initial-checkpoint-provider"}, description = "subclass of "
|
||||||
|
+ "org.apache.hudi.utilities.checkpointing.InitialCheckPointProvider. Generate check point for delta streamer "
|
||||||
|
+ "for the first run. This field will override the checkpoint of last commit using the checkpoint field. "
|
||||||
|
+ "Use this field only when switching source, for example, from DFS source to Kafka Source.")
|
||||||
|
public String initialCheckpointProvider = null;
|
||||||
|
|
||||||
@Parameter(names = {"--help", "-h"}, help = true)
|
@Parameter(names = {"--help", "-h"}, help = true)
|
||||||
public Boolean help = false;
|
public Boolean help = false;
|
||||||
|
|
||||||
@@ -371,7 +380,7 @@ public class HoodieDeltaStreamer implements Serializable {
|
|||||||
*/
|
*/
|
||||||
private transient DeltaSync deltaSync;
|
private transient DeltaSync deltaSync;
|
||||||
|
|
||||||
public DeltaSyncService(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf,
|
public DeltaSyncService(Config cfg, JavaSparkContext jssc, FileSystem fs, Configuration conf,
|
||||||
TypedProperties properties) throws IOException {
|
TypedProperties properties) throws IOException {
|
||||||
this.cfg = cfg;
|
this.cfg = cfg;
|
||||||
this.jssc = jssc;
|
this.jssc = jssc;
|
||||||
@@ -395,13 +404,13 @@ public class HoodieDeltaStreamer implements Serializable {
|
|||||||
LOG.info("Creating delta streamer with configs : " + props.toString());
|
LOG.info("Creating delta streamer with configs : " + props.toString());
|
||||||
this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jssc);
|
this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jssc);
|
||||||
|
|
||||||
deltaSync = new DeltaSync(cfg, sparkSession, schemaProvider, props, jssc, fs, hiveConf,
|
deltaSync = new DeltaSync(cfg, sparkSession, schemaProvider, props, jssc, fs, conf,
|
||||||
this::onInitializingWriteClient);
|
this::onInitializingWriteClient);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DeltaSyncService(HoodieDeltaStreamer.Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf hiveConf)
|
public DeltaSyncService(HoodieDeltaStreamer.Config cfg, JavaSparkContext jssc, FileSystem fs, Configuration conf)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
this(cfg, jssc, fs, hiveConf, null);
|
this(cfg, jssc, fs, conf, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DeltaSync getDeltaSync() {
|
public DeltaSync getDeltaSync() {
|
||||||
|
|||||||
@@ -256,21 +256,21 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, List<String> transformerClassNames,
|
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, List<String> transformerClassNames,
|
||||||
String propsFilename, boolean enableHiveSync) {
|
String propsFilename, boolean enableHiveSync) {
|
||||||
return makeConfig(basePath, op, transformerClassNames, propsFilename, enableHiveSync, true,
|
return makeConfig(basePath, op, transformerClassNames, propsFilename, enableHiveSync, true,
|
||||||
false, null, null);
|
false, null, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, List<String> transformerClassNames,
|
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, List<String> transformerClassNames,
|
||||||
String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass, boolean updatePayloadClass,
|
String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass, boolean updatePayloadClass,
|
||||||
String payloadClassName, String tableType) {
|
String payloadClassName, String tableType) {
|
||||||
return makeConfig(basePath, op, TestDataSource.class.getName(), transformerClassNames, propsFilename, enableHiveSync,
|
return makeConfig(basePath, op, TestDataSource.class.getName(), transformerClassNames, propsFilename, enableHiveSync,
|
||||||
useSchemaProviderClass, 1000, updatePayloadClass, payloadClassName, tableType, "timestamp");
|
useSchemaProviderClass, 1000, updatePayloadClass, payloadClassName, tableType, "timestamp");
|
||||||
}
|
}
|
||||||
|
|
||||||
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String sourceClassName,
|
static HoodieDeltaStreamer.Config makeConfig(String basePath, Operation op, String sourceClassName,
|
||||||
List<String> transformerClassNames, String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass,
|
List<String> transformerClassNames, String propsFilename, boolean enableHiveSync, boolean useSchemaProviderClass,
|
||||||
int sourceLimit, boolean updatePayloadClass, String payloadClassName, String tableType, String sourceOrderingField) {
|
int sourceLimit, boolean updatePayloadClass, String payloadClassName, String tableType, String sourceOrderingField) {
|
||||||
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
|
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
|
||||||
cfg.targetBasePath = basePath;
|
cfg.targetBasePath = basePath;
|
||||||
cfg.targetTableName = "hoodie_trips";
|
cfg.targetTableName = "hoodie_trips";
|
||||||
@@ -394,6 +394,28 @@ public class TestHoodieDeltaStreamer extends UtilitiesTestBase {
|
|||||||
props.getString("hoodie.datasource.write.keygenerator.class"));
|
props.getString("hoodie.datasource.write.keygenerator.class"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testKafkaConnectCheckpointProvider() throws IOException {
|
||||||
|
String tableBasePath = dfsBasePath + "/test_table";
|
||||||
|
String bootstrapPath = dfsBasePath + "/kafka_topic1";
|
||||||
|
String partitionPath = bootstrapPath + "/year=2016/month=05/day=01";
|
||||||
|
String filePath = partitionPath + "/kafka_topic1+0+100+200.parquet";
|
||||||
|
String checkpointProviderClass = "org.apache.hudi.utilities.checkpointing.KafkaConnectHdfsProvider";
|
||||||
|
HoodieDeltaStreamer.Config cfg = TestHelpers.makeDropAllConfig(tableBasePath, Operation.UPSERT);
|
||||||
|
TypedProperties props =
|
||||||
|
new DFSPropertiesConfiguration(dfs, new Path(dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE)).getConfig();
|
||||||
|
props.put("hoodie.deltastreamer.checkpoint.provider.path", bootstrapPath);
|
||||||
|
cfg.initialCheckpointProvider = checkpointProviderClass;
|
||||||
|
// create regular kafka connect hdfs dirs
|
||||||
|
dfs.mkdirs(new Path(bootstrapPath));
|
||||||
|
dfs.mkdirs(new Path(partitionPath));
|
||||||
|
// generate parquet files using kafka connect naming convention
|
||||||
|
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
|
||||||
|
Helpers.saveParquetToDFS(Helpers.toGenericRecords(dataGenerator.generateInserts("000", 100)), new Path(filePath));
|
||||||
|
HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(cfg, jsc, dfs, hdfsTestService.getHadoopConf(), props);
|
||||||
|
assertEquals("kafka_topic1,0:200", deltaStreamer.getConfig().checkpoint); // JUnit order: expected, actual
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testPropsWithInvalidKeyGenerator() throws Exception {
|
public void testPropsWithInvalidKeyGenerator() throws Exception {
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -19,13 +19,11 @@
|
|||||||
package org.apache.hudi.utilities.checkpointing;
|
package org.apache.hudi.utilities.checkpointing;
|
||||||
|
|
||||||
import org.apache.hudi.common.HoodieCommonTestHarness;
|
import org.apache.hudi.common.HoodieCommonTestHarness;
|
||||||
|
import org.apache.hudi.common.config.TypedProperties;
|
||||||
import org.apache.hudi.common.model.HoodieTestUtils;
|
import org.apache.hudi.common.model.HoodieTestUtils;
|
||||||
import org.apache.hudi.common.fs.FSUtils;
|
|
||||||
import org.apache.hudi.exception.HoodieException;
|
import org.apache.hudi.exception.HoodieException;
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
@@ -34,15 +32,14 @@ import java.io.File;
|
|||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
public class TestKafkaConnectHdfsProvider extends HoodieCommonTestHarness {
|
public class TestKafkaConnectHdfsProvider extends HoodieCommonTestHarness {
|
||||||
private FileSystem fs = null;
|
|
||||||
private String topicPath = null;
|
private String topicPath = null;
|
||||||
|
private Configuration hadoopConf = null;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void init() {
|
public void init() {
|
||||||
// Prepare directories
|
// Prepare directories
|
||||||
initPath();
|
initPath();
|
||||||
final Configuration hadoopConf = HoodieTestUtils.getDefaultHadoopConf();
|
hadoopConf = HoodieTestUtils.getDefaultHadoopConf();
|
||||||
fs = FSUtils.getFs(basePath, hadoopConf);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@@ -70,7 +67,10 @@ public class TestKafkaConnectHdfsProvider extends HoodieCommonTestHarness {
|
|||||||
+ "random_snappy_1.parquet").createNewFile();
|
+ "random_snappy_1.parquet").createNewFile();
|
||||||
new File(topicPath + "/year=2016/month=05/day=02/"
|
new File(topicPath + "/year=2016/month=05/day=02/"
|
||||||
+ "random_snappy_2.parquet").createNewFile();
|
+ "random_snappy_2.parquet").createNewFile();
|
||||||
InitialCheckPointProvider provider = new KafkaConnectHdfsProvider(new Path(topicPath), fs);
|
final TypedProperties props = new TypedProperties();
|
||||||
|
props.put("hoodie.deltastreamer.checkpoint.provider.path", topicPath);
|
||||||
|
final InitialCheckPointProvider provider = new KafkaConnectHdfsProvider(props);
|
||||||
|
provider.init(hadoopConf);
|
||||||
assertEquals(provider.getCheckpoint(), "topic1,0:300,1:200");
|
assertEquals(provider.getCheckpoint(), "topic1,0:300,1:200");
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -88,7 +88,10 @@ public class TestKafkaConnectHdfsProvider extends HoodieCommonTestHarness {
|
|||||||
+ "topic1+2+100+200.parquet").createNewFile();
|
+ "topic1+2+100+200.parquet").createNewFile();
|
||||||
new File(topicPath + "/year=2016/month=05/day=02/"
|
new File(topicPath + "/year=2016/month=05/day=02/"
|
||||||
+ "topic1+0+201+300.parquet").createNewFile();
|
+ "topic1+0+201+300.parquet").createNewFile();
|
||||||
InitialCheckPointProvider provider = new KafkaConnectHdfsProvider(new Path(topicPath), fs);
|
final TypedProperties props = new TypedProperties();
|
||||||
|
props.put("hoodie.deltastreamer.checkpoint.provider.path", topicPath);
|
||||||
|
final InitialCheckPointProvider provider = new KafkaConnectHdfsProvider(props);
|
||||||
|
provider.init(hadoopConf);
|
||||||
provider.getCheckpoint();
|
provider.getCheckpoint();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user