New features in DeltaStreamer:

(1) Apply transformations when using DeltaStreamer to ingest data. (2) Add a Hudi incremental source for DeltaStreamer. (3) Allow DeltaStreamer config properties to be passed on the command line. (4) Add Hive integration to DeltaStreamer and address review comments. (5) Ensure MultiPartKeysValueExtractor handles Hive-style partition descriptions. (6) Reuse the same Spark session for both the source and the transformer. (7) Support extracting partition fields from _hoodie_partition_path for HoodieIncrSource. (8) Reuse binary Avro coders. (9) Add push-down filters for the incremental source. (10) Add Hoodie DeltaStreamer metrics to track the total time taken.
2018-10-10 10:31:34 -07:00
parent c70dbc13e9
commit 3a0044216c
65 changed files with 2752 additions and 911 deletions
--- a/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala
+++ b/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala
@@ -64,21 +64,33 @@ class IncrementalRelation(val sqlContext: SQLContext,
    throw new HoodieException(s"Specify the begin instant time to pull from using " +
      s"option ${DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY}")
  }
+
+  val lastInstant = commitTimeline.lastInstant().get()
+
  val commitsToReturn = commitTimeline.findInstantsInRange(
    optParams(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY),
-    optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY,
-      commitTimeline.lastInstant().get().getTimestamp))
+    optParams.getOrElse(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY, lastInstant.getTimestamp))
    .getInstants.iterator().toList

  // use schema from a file produced in the latest instant
  val latestSchema = {
+    // use last instant if instant range is empty
+    val instant = commitsToReturn.lastOption.getOrElse(lastInstant)
    val latestMeta = HoodieCommitMetadata
-          .fromBytes(commitTimeline.getInstantDetails(commitsToReturn.last).get, classOf[HoodieCommitMetadata])
+          .fromBytes(commitTimeline.getInstantDetails(instant).get, classOf[HoodieCommitMetadata])
    val metaFilePath = latestMeta.getFileIdAndFullPaths(basePath).values().iterator().next()
    AvroConversionUtils.convertAvroSchemaToStructType(ParquetUtils.readAvroSchema(
      sqlContext.sparkContext.hadoopConfiguration, new Path(metaFilePath)))
  }

+  // Extra filter expressions (comma-separated option value) applied to the incremental scan.
+  // The block's last expression is its value, so the parsed filters must be that expression;
+  // previously a trailing empty array discarded them and no filter was ever pushed down.
+  val filters: Array[String] = optParams
+    .get(DataSourceReadOptions.PUSH_DOWN_INCR_FILTERS_OPT_KEY)
+    .map(_.split(",").filter(_.nonEmpty))
+    .getOrElse(Array.empty[String])
+
  override def schema: StructType = latestSchema

  override def buildScan(): RDD[Row] = {
@@ -92,12 +104,17 @@ class IncrementalRelation(val sqlContext: SQLContext,
    // will filter out all the files incorrectly.
    sqlContext.sparkContext.hadoopConfiguration.unset("mapreduce.input.pathFilter.class")
    val sOpts = optParams.filter(p => !p._1.equalsIgnoreCase("path"))
-    sqlContext.read.options(sOpts)
-      .schema(latestSchema) // avoid AnalysisException for empty input
-      .parquet(fileIdToFullPath.values.toList: _*)
-      .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp))
-      .filter(String.format("%s <= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp))
-      .toDF().rdd
-
+    if (fileIdToFullPath.isEmpty) {
+      sqlContext.sparkContext.emptyRDD[Row]
+    } else {
+      log.info("Additional Filters to be applied to incremental source are :" + filters)
+      filters.foldLeft(sqlContext.read.options(sOpts)
+        .schema(latestSchema)
+        .parquet(fileIdToFullPath.values.toList: _*)
+        .filter(String.format("%s >= '%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.head.getTimestamp))
+        .filter(String.format("%s <= '%s'",
+          HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitsToReturn.last.getTimestamp)))((e, f) => e.filter(f))
+        .toDF().rdd
+    }
  }
 }