[HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync, hudi-dla-sync (#1810)
- Generalize the hive-sync module for syncing to multiple metastores - Added new options for datasource - Added new command line for delta streamer Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
@@ -23,6 +23,7 @@ import org.apache.hudi.DataSourceUtils;
|
||||
import org.apache.hudi.client.HoodieWriteClient;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
@@ -31,6 +32,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||
@@ -39,6 +41,7 @@ import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HiveSyncTool;
|
||||
import org.apache.hudi.keygen.KeyGenerator;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.hudi.utilities.UtilHelpers;
|
||||
import org.apache.hudi.exception.HoodieDeltaStreamerException;
|
||||
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Config;
|
||||
@@ -66,10 +69,15 @@ import org.apache.spark.sql.SparkSession;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.function.Function;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Function;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import scala.collection.JavaConversions;
|
||||
@@ -391,6 +399,7 @@ public class DeltaSync implements Serializable {
|
||||
long totalRecords = writeStatusRDD.mapToDouble(WriteStatus::getTotalRecords).sum().longValue();
|
||||
boolean hasErrors = totalErrorRecords > 0;
|
||||
long hiveSyncTimeMs = 0;
|
||||
long metaSyncTimeMs = 0;
|
||||
if (!hasErrors || cfg.commitOnErrors) {
|
||||
HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
|
||||
checkpointCommitMetadata.put(CHECKPOINT_KEY, checkpointStr);
|
||||
@@ -413,10 +422,7 @@ public class DeltaSync implements Serializable {
|
||||
}
|
||||
|
||||
if (!isEmpty) {
|
||||
// Sync to hive if enabled
|
||||
Timer.Context hiveSyncContext = metrics.getHiveSyncTimerContext();
|
||||
syncHiveIfNeeded();
|
||||
hiveSyncTimeMs = hiveSyncContext != null ? hiveSyncContext.stop() : 0;
|
||||
syncMeta(metrics);
|
||||
}
|
||||
} else {
|
||||
LOG.info("Commit " + instantTime + " failed!");
|
||||
@@ -438,8 +444,7 @@ public class DeltaSync implements Serializable {
|
||||
long overallTimeMs = overallTimerContext != null ? overallTimerContext.stop() : 0;
|
||||
|
||||
// Send DeltaStreamer Metrics
|
||||
metrics.updateDeltaStreamerMetrics(overallTimeMs, hiveSyncTimeMs);
|
||||
|
||||
metrics.updateDeltaStreamerMetrics(overallTimeMs);
|
||||
return Pair.of(scheduledCompactionInstant, writeStatusRDD);
|
||||
}
|
||||
|
||||
@@ -471,12 +476,41 @@ public class DeltaSync implements Serializable {
|
||||
throw lastException;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sync to Hive.
|
||||
*/
|
||||
public void syncHiveIfNeeded() {
|
||||
private String getSyncClassShortName(String syncClassName) {
|
||||
return syncClassName.substring(syncClassName.lastIndexOf(".") + 1);
|
||||
}
|
||||
|
||||
private void syncMeta(HoodieDeltaStreamerMetrics metrics) {
|
||||
Set<String> syncClientToolClasses = new HashSet<>(Arrays.asList(cfg.syncClientToolClass.split(",")));
|
||||
// for backward compatibility
|
||||
if (cfg.enableHiveSync) {
|
||||
syncHive();
|
||||
cfg.enableMetaSync = true;
|
||||
syncClientToolClasses.add(HiveSyncTool.class.getName());
|
||||
LOG.info("When set --enable-hive-sync will use HiveSyncTool for backward compatibility");
|
||||
}
|
||||
if (cfg.enableMetaSync) {
|
||||
for (String impl : syncClientToolClasses) {
|
||||
Timer.Context syncContext = metrics.getMetaSyncTimerContext();
|
||||
impl = impl.trim();
|
||||
AbstractSyncTool syncTool = null;
|
||||
switch (impl) {
|
||||
case "org.apache.hudi.hive.HiveSyncTool":
|
||||
HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat);
|
||||
LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :"
|
||||
+ hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath);
|
||||
syncTool = new HiveSyncTool(hiveSyncConfig, new HiveConf(conf, HiveConf.class), fs);
|
||||
break;
|
||||
default:
|
||||
FileSystem fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration());
|
||||
Properties properties = new Properties();
|
||||
properties.putAll(props);
|
||||
properties.put("basePath", cfg.targetBasePath);
|
||||
syncTool = (AbstractSyncTool) ReflectionUtils.loadClass(impl, new Class[]{Properties.class, FileSystem.class}, properties, fs);
|
||||
}
|
||||
syncTool.syncHoodieTable();
|
||||
long metaSyncTimeMs = syncContext != null ? syncContext.stop() : 0;
|
||||
metrics.updateDeltaStreamerMetaSyncMetrics(getSyncClassShortName(impl), metaSyncTimeMs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -38,6 +38,7 @@ import org.apache.hudi.utilities.IdentitySplitter;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.hive.HiveSyncTool;
|
||||
import org.apache.hudi.utilities.HiveIncrementalPuller;
|
||||
import org.apache.hudi.utilities.UtilHelpers;
|
||||
import org.apache.hudi.utilities.checkpointing.InitialCheckPointProvider;
|
||||
@@ -268,9 +269,16 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
description = "Should duplicate records from source be dropped/filtered out before insert/bulk-insert")
|
||||
public Boolean filterDupes = false;
|
||||
|
||||
//will abandon in the future version, recommended use --enable-sync
|
||||
@Parameter(names = {"--enable-hive-sync"}, description = "Enable syncing to hive")
|
||||
public Boolean enableHiveSync = false;
|
||||
|
||||
@Parameter(names = {"--enable-sync"}, description = "Enable syncing meta")
|
||||
public Boolean enableMetaSync = false;
|
||||
|
||||
@Parameter(names = {"--sync-tool-classes"}, description = "Meta sync client tool, using comma to separate multi tools")
|
||||
public String syncClientToolClass = HiveSyncTool.class.getName();
|
||||
|
||||
@Parameter(names = {"--max-pending-compactions"},
|
||||
description = "Maximum number of outstanding inflight/requested compactions. Delta Sync will not happen unless"
|
||||
+ "outstanding compactions is less than this number")
|
||||
@@ -447,6 +455,11 @@ public class HoodieDeltaStreamer implements Serializable {
|
||||
Map<String, String> additionalSparkConfigs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg);
|
||||
JavaSparkContext jssc =
|
||||
UtilHelpers.buildSparkContext("delta-streamer-" + cfg.targetTableName, cfg.sparkMaster, additionalSparkConfigs);
|
||||
|
||||
if (cfg.enableHiveSync) {
|
||||
LOG.warn("--enable-hive-sync will be deprecated in a future release; please use --enable-sync instead for Hive syncing");
|
||||
}
|
||||
|
||||
try {
|
||||
new HoodieDeltaStreamer(cfg, jssc).sync();
|
||||
} finally {
|
||||
|
||||
@@ -32,8 +32,10 @@ public class HoodieDeltaStreamerMetrics implements Serializable {
|
||||
|
||||
public String overallTimerName = null;
|
||||
public String hiveSyncTimerName = null;
|
||||
private transient Timer overallTimer = null;
|
||||
public transient Timer hiveSyncTimer = null;
|
||||
public String metaSyncTimerName = null;
|
||||
private Timer overallTimer = null;
|
||||
public Timer hiveSyncTimer = null;
|
||||
public Timer metaSyncTimer = null;
|
||||
|
||||
public HoodieDeltaStreamerMetrics(HoodieWriteConfig config) {
|
||||
this.config = config;
|
||||
@@ -42,6 +44,7 @@ public class HoodieDeltaStreamerMetrics implements Serializable {
|
||||
Metrics.init(config);
|
||||
this.overallTimerName = getMetricsName("timer", "deltastreamer");
|
||||
this.hiveSyncTimerName = getMetricsName("timer", "deltastreamerHiveSync");
|
||||
this.metaSyncTimerName = getMetricsName("timer", "deltastreamerMetaSync");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,6 +62,13 @@ public class HoodieDeltaStreamerMetrics implements Serializable {
|
||||
return hiveSyncTimer == null ? null : hiveSyncTimer.time();
|
||||
}
|
||||
|
||||
public Timer.Context getMetaSyncTimerContext() {
|
||||
if (config.isMetricsOn() && metaSyncTimer == null) {
|
||||
metaSyncTimer = createTimer(metaSyncTimerName);
|
||||
}
|
||||
return metaSyncTimer == null ? null : metaSyncTimer.time();
|
||||
}
|
||||
|
||||
private Timer createTimer(String name) {
|
||||
return config.isMetricsOn() ? Metrics.getInstance().getRegistry().timer(name) : null;
|
||||
}
|
||||
@@ -67,10 +77,15 @@ public class HoodieDeltaStreamerMetrics implements Serializable {
|
||||
return config == null ? null : String.format("%s.%s.%s", tableName, action, metric);
|
||||
}
|
||||
|
||||
public void updateDeltaStreamerMetrics(long durationInNs, long hiveSyncNs) {
|
||||
public void updateDeltaStreamerMetrics(long durationInNs) {
|
||||
if (config.isMetricsOn()) {
|
||||
Metrics.registerGauge(getMetricsName("deltastreamer", "duration"), getDurationInMs(durationInNs));
|
||||
Metrics.registerGauge(getMetricsName("deltastreamer", "hiveSyncDuration"), getDurationInMs(hiveSyncNs));
|
||||
}
|
||||
}
|
||||
|
||||
public void updateDeltaStreamerMetaSyncMetrics(String syncClassShortName, long syncNs) {
|
||||
if (config.isMetricsOn()) {
|
||||
Metrics.registerGauge(getMetricsName("deltastreamer", syncClassShortName), getDurationInMs(syncNs));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user