[HUDI-69] Support Spark Datasource for MOR table - RDD approach (#1848)

- This PR implements the Spark Datasource for MOR tables using the RDD approach.
- Implemented SnapshotRelation.
- Implemented HudiMergeOnReadRDD.
- Implemented separate iterators to handle merged and unmerged record reading.
- Added TestMORDataSource to verify this feature.
- Cleaned up test file names and added tests for mixed query types.
- We can now revert the change made in DefaultSource.

Co-authored-by: Vinoth Chandar <vchandar@confluent.io>
2020-08-07 00:28:14 -07:00
parent ab453f2623
commit 4f74a84607
22 changed files with 1317 additions and 409 deletions
--- a/hudi-common/src/main/java/org/apache/hudi/common/table/log/LogReaderUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/table/log/LogReaderUtils.java
@@ -29,9 +29,9 @@ import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;

 import org.apache.avro.Schema;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapred.JobConf;

 import java.io.IOException;
 import java.util.List;
@@ -62,15 +62,15 @@ public class LogReaderUtils {
    return writerSchema;
  }

-  public static Schema readLatestSchemaFromLogFiles(String basePath, List<String> deltaFilePaths, JobConf jobConf)
+  public static Schema readLatestSchemaFromLogFiles(String basePath, List<String> deltaFilePaths, Configuration config)
      throws IOException {
-    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jobConf, basePath);
+    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(config, basePath);
    List<String> deltaPaths = deltaFilePaths.stream().map(s -> new HoodieLogFile(new Path(s)))
        .sorted(HoodieLogFile.getReverseLogFileComparator()).map(s -> s.getPath().toString())
        .collect(Collectors.toList());
    if (deltaPaths.size() > 0) {
      for (String logPath : deltaPaths) {
-        FileSystem fs = FSUtils.getFs(logPath, jobConf);
+        FileSystem fs = FSUtils.getFs(logPath, config);
        Schema schemaFromLogFile =
            readSchemaFromLogFileInReverse(fs, metaClient.getActiveTimeline(), new Path(logPath));
        if (schemaFromLogFile != null) {
--- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestDataGenerator.java
@@ -634,6 +634,10 @@ public class HoodieTestDataGenerator {
    return generateUniqueUpdatesStream(instantTime, n, TRIP_EXAMPLE_SCHEMA).collect(Collectors.toList());
  }

+  public List<HoodieRecord> generateUniqueUpdatesAsPerSchema(String instantTime, Integer n, String schemaStr) {
+    return generateUniqueUpdatesStream(instantTime, n, schemaStr).collect(Collectors.toList());
+  }
+
  /**
   * Generates deduped delete of keys previously inserted, randomly distributed across the keys above.
   *
@@ -745,6 +749,17 @@ public class HoodieTestDataGenerator {
    return result.stream();
  }

+  /**
+   * Generates deduped delete records for keys previously inserted, randomly distributed across the keys above.
+   *
+   * @param instantTime Commit Timestamp
+   * @param n          Number of unique records
+   * @return List of hoodie records for delete
+   */
+  public List<HoodieRecord> generateUniqueDeleteRecords(String instantTime, Integer n) {
+    return generateUniqueDeleteRecordStream(instantTime, n).collect(Collectors.toList());
+  }
+
  public boolean deleteExistingKeyIfPresent(HoodieKey key) {
    Map<Integer, KeyPartition> existingKeys = existingKeysBySchema.get(TRIP_EXAMPLE_SCHEMA);
    Integer numExistingKeys = numKeysBySchema.get(TRIP_EXAMPLE_SCHEMA);