1
0

[HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync, hudi-dla-sync (#1810)

- Generalize the hive-sync module for syncing to multiple metastores
- Added new options for datasource
- Added new command line for delta streamer 

Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
lw0090
2020-08-06 12:34:55 +08:00
committed by GitHub
parent c21209cb58
commit 51ea27d665
44 changed files with 1663 additions and 145 deletions

View File

@@ -23,6 +23,7 @@ import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
@@ -31,6 +32,7 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieCompactionConfig;
@@ -39,6 +41,7 @@ import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.sync.common.AbstractSyncTool;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.exception.HoodieDeltaStreamerException;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Config;
@@ -66,10 +69,15 @@ import org.apache.spark.sql.SparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.function.Function;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.function.Function;
import java.util.Properties;
import java.util.Set;
import java.util.stream.Collectors;
import scala.collection.JavaConversions;
@@ -391,6 +399,7 @@ public class DeltaSync implements Serializable {
long totalRecords = writeStatusRDD.mapToDouble(WriteStatus::getTotalRecords).sum().longValue();
boolean hasErrors = totalErrorRecords > 0;
long hiveSyncTimeMs = 0;
long metaSyncTimeMs = 0;
if (!hasErrors || cfg.commitOnErrors) {
HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
checkpointCommitMetadata.put(CHECKPOINT_KEY, checkpointStr);
@@ -413,10 +422,7 @@ public class DeltaSync implements Serializable {
}
if (!isEmpty) {
// Sync to hive if enabled
Timer.Context hiveSyncContext = metrics.getHiveSyncTimerContext();
syncHiveIfNeeded();
hiveSyncTimeMs = hiveSyncContext != null ? hiveSyncContext.stop() : 0;
syncMeta(metrics);
}
} else {
LOG.info("Commit " + instantTime + " failed!");
@@ -438,8 +444,7 @@ public class DeltaSync implements Serializable {
long overallTimeMs = overallTimerContext != null ? overallTimerContext.stop() : 0;
// Send DeltaStreamer Metrics
metrics.updateDeltaStreamerMetrics(overallTimeMs, hiveSyncTimeMs);
metrics.updateDeltaStreamerMetrics(overallTimeMs);
return Pair.of(scheduledCompactionInstant, writeStatusRDD);
}
@@ -471,12 +476,41 @@ public class DeltaSync implements Serializable {
throw lastException;
}
/**
* Sync to Hive.
*/
public void syncHiveIfNeeded() {
/**
 * Extracts the simple (unqualified) class name from a fully qualified sync tool class name.
 *
 * @param syncClassName fully qualified class name, e.g. {@code org.apache.hudi.hive.HiveSyncTool}
 * @return the text after the last '.', or the whole string when it contains no '.'
 */
private String getSyncClassShortName(String syncClassName) {
  int lastDot = syncClassName.lastIndexOf('.');
  return syncClassName.substring(lastDot + 1);
}
/**
 * Runs every configured meta sync tool against the target table, timing each run.
 *
 * <p>The built-in {@code HiveSyncTool} is constructed from the datasource's Hive sync config;
 * any other class name is loaded reflectively and must extend {@link AbstractSyncTool} with a
 * {@code (Properties, FileSystem)} constructor. When the deprecated {@code --enable-hive-sync}
 * flag is set, Hive sync is added to the tool set for backward compatibility.
 *
 * @param metrics delta streamer metrics used to time and report each sync tool's run
 */
private void syncMeta(HoodieDeltaStreamerMetrics metrics) {
  Set<String> syncClientToolClasses = new HashSet<>(Arrays.asList(cfg.syncClientToolClass.split(",")));
  // for backward compatibility: --enable-hive-sync implies meta sync through HiveSyncTool
  if (cfg.enableHiveSync) {
    cfg.enableMetaSync = true;
    syncClientToolClasses.add(HiveSyncTool.class.getName());
    LOG.info("--enable-hive-sync is set; using HiveSyncTool for backward compatibility");
  }
  if (cfg.enableMetaSync) {
    for (String impl : syncClientToolClasses) {
      Timer.Context syncContext = metrics.getMetaSyncTimerContext();
      impl = impl.trim();
      AbstractSyncTool syncTool;
      switch (impl) {
        case "org.apache.hudi.hive.HiveSyncTool":
          HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat);
          LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :"
              + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath);
          syncTool = new HiveSyncTool(hiveSyncConfig, new HiveConf(conf, HiveConf.class), fs);
          break;
        default:
          // Use a distinct local name so the class-level 'fs' field is not shadowed.
          FileSystem targetFs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration());
          Properties properties = new Properties();
          properties.putAll(props);
          properties.put("basePath", cfg.targetBasePath);
          syncTool = (AbstractSyncTool) ReflectionUtils.loadClass(impl, new Class[]{Properties.class, FileSystem.class}, properties, targetFs);
          break;
      }
      syncTool.syncHoodieTable();
      // syncContext is null when metrics are disabled; report 0 in that case
      long metaSyncTimeMs = syncContext != null ? syncContext.stop() : 0;
      metrics.updateDeltaStreamerMetaSyncMetrics(getSyncClassShortName(impl), metaSyncTimeMs);
    }
  }
}

View File

@@ -38,6 +38,7 @@ import org.apache.hudi.utilities.IdentitySplitter;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.hive.HiveSyncTool;
import org.apache.hudi.utilities.HiveIncrementalPuller;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.checkpointing.InitialCheckPointProvider;
@@ -268,9 +269,16 @@ public class HoodieDeltaStreamer implements Serializable {
description = "Should duplicate records from source be dropped/filtered out before insert/bulk-insert")
public Boolean filterDupes = false;
// Will be removed in a future release; use --enable-sync instead
@Parameter(names = {"--enable-hive-sync"}, description = "Enable syncing to hive")
public Boolean enableHiveSync = false;
@Parameter(names = {"--enable-sync"}, description = "Enable syncing meta")
public Boolean enableMetaSync = false;
@Parameter(names = {"--sync-tool-classes"}, description = "Meta sync client tool, using comma to separate multi tools")
public String syncClientToolClass = HiveSyncTool.class.getName();
@Parameter(names = {"--max-pending-compactions"},
description = "Maximum number of outstanding inflight/requested compactions. Delta Sync will not happen unless"
+ "outstanding compactions is less than this number")
@@ -447,6 +455,11 @@ public class HoodieDeltaStreamer implements Serializable {
Map<String, String> additionalSparkConfigs = SchedulerConfGenerator.getSparkSchedulingConfigs(cfg);
JavaSparkContext jssc =
UtilHelpers.buildSparkContext("delta-streamer-" + cfg.targetTableName, cfg.sparkMaster, additionalSparkConfigs);
if (cfg.enableHiveSync) {
LOG.warn("--enable-hive-sync will be deprecated in a future release; please use --enable-sync instead for Hive syncing");
}
try {
new HoodieDeltaStreamer(cfg, jssc).sync();
} finally {

View File

@@ -32,8 +32,10 @@ public class HoodieDeltaStreamerMetrics implements Serializable {
public String overallTimerName = null;
public String hiveSyncTimerName = null;
private transient Timer overallTimer = null;
public transient Timer hiveSyncTimer = null;
public String metaSyncTimerName = null;
private Timer overallTimer = null;
public Timer hiveSyncTimer = null;
public Timer metaSyncTimer = null;
public HoodieDeltaStreamerMetrics(HoodieWriteConfig config) {
this.config = config;
@@ -42,6 +44,7 @@ public class HoodieDeltaStreamerMetrics implements Serializable {
Metrics.init(config);
this.overallTimerName = getMetricsName("timer", "deltastreamer");
this.hiveSyncTimerName = getMetricsName("timer", "deltastreamerHiveSync");
this.metaSyncTimerName = getMetricsName("timer", "deltastreamerMetaSync");
}
}
@@ -59,6 +62,13 @@ public class HoodieDeltaStreamerMetrics implements Serializable {
return hiveSyncTimer == null ? null : hiveSyncTimer.time();
}
/**
 * Lazily creates the meta sync timer (only when metrics are enabled) and starts a timing context.
 *
 * @return a running timer context, or {@code null} when metrics are disabled
 */
public Timer.Context getMetaSyncTimerContext() {
  if (config.isMetricsOn() && metaSyncTimer == null) {
    metaSyncTimer = createTimer(metaSyncTimerName);
  }
  return metaSyncTimer != null ? metaSyncTimer.time() : null;
}
// Builds a registry-backed timer with the given name; returns null when metrics are disabled.
private Timer createTimer(String name) {
  if (!config.isMetricsOn()) {
    return null;
  }
  return Metrics.getInstance().getRegistry().timer(name);
}
@@ -67,10 +77,15 @@ public class HoodieDeltaStreamerMetrics implements Serializable {
return config == null ? null : String.format("%s.%s.%s", tableName, action, metric);
}
public void updateDeltaStreamerMetrics(long durationInNs, long hiveSyncNs) {
public void updateDeltaStreamerMetrics(long durationInNs) {
if (config.isMetricsOn()) {
Metrics.registerGauge(getMetricsName("deltastreamer", "duration"), getDurationInMs(durationInNs));
Metrics.registerGauge(getMetricsName("deltastreamer", "hiveSyncDuration"), getDurationInMs(hiveSyncNs));
}
}
/**
 * Publishes a gauge recording how long the given meta sync tool took, keyed by its short class name.
 *
 * @param syncClassShortName simple class name of the sync tool that ran
 * @param syncNs             sync duration in nanoseconds
 */
public void updateDeltaStreamerMetaSyncMetrics(String syncClassShortName, long syncNs) {
  if (!config.isMetricsOn()) {
    return;
  }
  Metrics.registerGauge(getMetricsName("deltastreamer", syncClassShortName), getDurationInMs(syncNs));
}