Add ability to provide multi-region (global) data consistency across HMS in different regions (#2542)

[global-hive-sync-tool] Add a global hive sync tool to sync a Hudi table across clusters. Add a way to roll back the replicated timestamp if the sync fails or only partially completes.
Co-authored-by: Jagmeet Bali <jsbali@uber.com>
2021-06-25 08:56:26 +05:30
parent e64fe55054
commit 0fb8556b0d
27 changed files with 1731 additions and 71 deletions
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/InputPathHandler.java
@@ -39,6 +39,14 @@ import static org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.getTableMetaCl
 * InputPathHandler takes in a set of input paths and incremental tables list. Then, classifies the
 * input paths to incremental, snapshot paths and non-hoodie paths. This is then accessed later to
 * mutate the JobConf before processing incremental mode queries and snapshot queries.
+ *
+ * Note: We are adding the jobConf of a MapReduce or Spark job. The properties in the jobConf are
+ * of two types: session properties and table properties from the metastore. While session
+ * properties are common to all the tables in a query, table properties are unique per table, so
+ * there is no need to check whether they belong to the table for which the path handler is
+ * instantiated. The jobConf has all table properties, such as name and last modification time,
+ * which are unique to a table. This class is written in such a way that it can handle multiple
+ * tables and properties unique to a table, but for table-level properties no such check is needed.
 */
 public class InputPathHandler {

@@ -63,7 +71,6 @@ public class InputPathHandler {
  /**
   * Takes in the original InputPaths and classifies each of them into incremental, snapshot and
   * non-hoodie InputPaths. The logic is as follows:
-   *
   * 1. Check if an inputPath starts with the same basepath as any of the metadata basepaths we know
   *    1a. If yes, this belongs to a Hoodie table that we already know about. Simply classify this
   *        as incremental or snapshot - We can get the table name of this inputPath from the
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java
@@ -18,13 +18,14 @@

 package org.apache.hudi.hadoop.utils;

-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.util.CollectionUtils;
 import org.apache.hudi.exception.HoodieIOException;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;

@@ -73,6 +74,7 @@ public class HoodieHiveUtils {
  public static final int MAX_COMMIT_ALL = -1;
  public static final int DEFAULT_LEVELS_TO_BASEPATH = 3;
  public static final Pattern HOODIE_CONSUME_MODE_PATTERN_STRING = Pattern.compile("hoodie\\.(.*)\\.consume\\.mode");
+  public static final String GLOBALLY_CONSISTENT_READ_TIMESTAMP = "last_replication_timestamp";

  public static boolean stopAtCompaction(JobContext job, String tableName) {
    String compactionPropName = String.format(HOODIE_STOP_AT_COMPACTION_PATTERN, tableName);
--- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java
+++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieInputFormatUtils.java
@@ -442,6 +442,7 @@ public class HoodieInputFormatUtils {
        }

        HoodieTimeline timeline = HoodieHiveUtils.getTableTimeline(metaClient.getTableConfig().getTableName(), job, metaClient);
+
        HoodieTableFileSystemView fsView = fsViewCache.computeIfAbsent(metaClient, tableMetaClient ->
            FileSystemViewManager.createInMemoryFileSystemViewWithTimeline(engineContext, tableMetaClient, buildMetadataConfig(job), timeline));
        List<HoodieBaseFile> filteredBaseFiles = new ArrayList<>();