Reformatting code per Google Code Style all over
committed by vinoth chandar
parent 5a62480a92
commit e45679f5e2
@@ -34,11 +34,11 @@ import scala.collection.mutable._
 /**
   * Spark job to de-duplicate data present in a partition path
   */
-class DedupeSparkJob (basePath: String,
-                      duplicatedPartitionPath: String,
-                      repairOutputPath: String,
-                      sqlContext: SQLContext,
-                      fs: FileSystem) {
+class DedupeSparkJob(basePath: String,
+                     duplicatedPartitionPath: String,
+                     repairOutputPath: String,
+                     sqlContext: SQLContext,
+                     fs: FileSystem) {


   val sparkHelper = new SparkHelper(sqlContext, fs)
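For context, a minimal driver sketch for the job above; the Spark setup, paths, and the commented-out entry point are illustrative assumptions, not part of this commit:

import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object DedupeDriver {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("hoodie-dedupe"))
    val sqlContext = new SQLContext(sc) // Spark 1.x-era API, matching this codebase
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val job = new DedupeSparkJob(
      "hdfs:///tmp/hoodie/table",   // basePath (assumed)
      "2017/01/01",                 // duplicatedPartitionPath (assumed)
      "hdfs:///tmp/hoodie/repair",  // repairOutputPath (assumed)
      sqlContext,
      fs)
    // job.fixDuplicates(...) // hypothetical entry point; not shown in this diff
  }
}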
@@ -50,8 +50,9 @@ class DedupeSparkJob (basePath: String,
    * @param tblName
    * @return
    */
-  def getDupeKeyDF(tblName: String) : DataFrame = {
-    val dupeSql = s"""
+  def getDupeKeyDF(tblName: String): DataFrame = {
+    val dupeSql =
+      s"""
       select `${HoodieRecord.RECORD_KEY_METADATA_FIELD}` as dupe_key,
       count(*) as dupe_cnt
       from ${tblName}
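The hunk cuts off before the query's tail. A duplicate-key query of this shape would normally finish with a group-by and a count filter; a sketch of the full shape (the tail is an assumption, not the file's exact text):

// Hypothetical completion of the duplicate-key query; only the select/from
// portion above appears in the diff. Assumes sqlContext is in scope.
val tblName = "htbl"
val dupeSql =
  s"""
    select `_hoodie_record_key` as dupe_key, count(*) as dupe_cnt
    from ${tblName}
    group by `_hoodie_record_key`
    having count(*) > 1
  """
val dupeKeyDF = sqlContext.sql(dupeSql) // one row per duplicated key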
@@ -69,7 +70,7 @@ class DedupeSparkJob (basePath: String,
    *
    * @return
    */
-  private def planDuplicateFix() : HashMap[String, HashSet[String]] = {
+  private def planDuplicateFix(): HashMap[String, HashSet[String]] = {

     val tmpTableName = s"htbl_${System.currentTimeMillis()}"
     val dedupeTblName = s"${tmpTableName}_dupeKeys"
@@ -78,17 +79,18 @@ class DedupeSparkJob (basePath: String,

     val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}"))
     val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)
-    val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
+    val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
     val filteredStatuses = latestFiles.map(f => f.getPath)
     LOG.info(s" List of files under partition: ${} => ${filteredStatuses.mkString(" ")}")

-    val df = sqlContext.parquetFile(filteredStatuses:_*)
+    val df = sqlContext.parquetFile(filteredStatuses: _*)
     df.registerTempTable(tmpTableName)
     val dupeKeyDF = getDupeKeyDF(tmpTableName)
     dupeKeyDF.registerTempTable(dedupeTblName)

     // Obtain necessary satellite information for duplicate rows
-    val dupeDataSql = s"""
+    val dupeDataSql =
+      s"""
       SELECT `_hoodie_record_key`, `_hoodie_partition_path`, `_hoodie_file_name`, `_hoodie_commit_time`
       FROM ${tmpTableName} h
       JOIN ${dedupeTblName} d
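The join's ON clause falls outside the hunk. The pattern in play — register the raw partition rows and the duplicate keys as temp tables, then join them to pull each duplicate row's file and commit time — looks like this in self-contained form (table names, the path, and the ON clause are assumptions):

import org.apache.spark.sql.SQLContext

def dupeRows(sqlContext: SQLContext): Unit = {
  val df = sqlContext.read.parquet("/tmp/partition") // assumed path
  df.registerTempTable("htbl")
  sqlContext.sql(
    """select `_hoodie_record_key` as dupe_key, count(*) as dupe_cnt
      |from htbl group by `_hoodie_record_key` having count(*) > 1""".stripMargin)
    .registerTempTable("htbl_dupeKeys")
  // Satellite info for every copy of a duplicated key
  sqlContext.sql(
    """SELECT h.`_hoodie_record_key`, h.`_hoodie_file_name`, h.`_hoodie_commit_time`
      |FROM htbl h JOIN htbl_dupeKeys d
      |ON h.`_hoodie_record_key` = d.dupe_key""".stripMargin)
    .show()
}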
@@ -111,9 +113,9 @@ class DedupeSparkJob (basePath: String,

     rows.foreach(r => {
       val c = r(3).asInstanceOf[String].toLong
-      if (c != maxCommit){
+      if (c != maxCommit) {
         val f = r(2).asInstanceOf[String].split("_")(0)
-        if (!fileToDeleteKeyMap.contains(f)){
+        if (!fileToDeleteKeyMap.contains(f)) {
           fileToDeleteKeyMap(f) = HashSet[String]()
         }
         fileToDeleteKeyMap(f).add(key)
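A standalone sketch of the fix-planning policy visible in this hunk: for each duplicated key, keep the copy from the latest commit and mark every older copy's file for key removal. The row layout (key, partition, file, commitTime) mirrors the dupeDataSql projection above; the sample data is invented:

import scala.collection.mutable.{HashMap, HashSet}

// Two copies of key1; only the older one (commit 20170101) gets planned away.
val rows = Seq(
  ("key1", "2017/01/01", "f1_0_20170101.parquet", "20170101"),
  ("key1", "2017/01/01", "f2_0_20170102.parquet", "20170102"))
val fileToDeleteKeyMap = HashMap[String, HashSet[String]]()
val maxCommit = rows.map(_._4.toLong).max
rows.foreach { case (key, _, fileName, commitTime) =>
  if (commitTime.toLong != maxCommit) {
    val fileId = fileName.split("_")(0) // file id is the first "_"-separated token
    fileToDeleteKeyMap.getOrElseUpdate(fileId, HashSet[String]()).add(key)
  }
}
// fileToDeleteKeyMap is now Map(f1 -> Set(key1))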
@@ -130,28 +132,30 @@ class DedupeSparkJob (basePath: String,
     val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}"))
     val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)

-    val latestFiles:java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())
+    val latestFiles: java.util.List[HoodieDataFile] = fsView.getLatestDataFiles().collect(Collectors.toList[HoodieDataFile]())

     val fileNameToPathMap = latestFiles.map(f => (f.getFileId, new Path(f.getPath))).toMap
     val dupeFixPlan = planDuplicateFix()

     // 1. Copy all latest files into the temp fix path
-    fileNameToPathMap.foreach{ case(fileName, filePath) => {
+    fileNameToPathMap.foreach { case (fileName, filePath) => {
       val badSuffix = if (dupeFixPlan.contains(fileName)) ".bad" else ""
       val dstPath = new Path(s"${repairOutputPath}/${filePath.getName}${badSuffix}")
       LOG.info(s"Copying from ${filePath} to ${dstPath}")
       FileUtil.copy(fs, filePath, fs, dstPath, false, true, fs.getConf)
-    }}
+    }
+    }

     // 2. Remove duplicates from the bad files
-    dupeFixPlan.foreach{case(fileName, keysToSkip) => {
+    dupeFixPlan.foreach { case (fileName, keysToSkip) => {
       val commitTime = FSUtils.getCommitTime(fileNameToPathMap(fileName).getName)
       val badFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}.bad")
       val newFilePath = new Path(s"${repairOutputPath}/${fileNameToPathMap(fileName).getName}")
       LOG.info(" Skipping and writing new file for : " + fileName)
       SparkHelpers.skipKeysAndWriteNewFile(commitTime, fs, badFilePath, newFilePath, dupeFixPlan(fileName))
       fs.delete(badFilePath, false)
-    }}
+    }
+    }

     // 3. Check that there are no duplicates anymore.
     val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet")
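Step 3's check amounts to: after the rewrite, no record key may appear more than once in the repair output. A hedged, self-contained version of that verification (the function name and path handling are assumptions):

import org.apache.spark.sql.SQLContext

// After repair, total row count must equal distinct-key count.
def assertNoDupes(sqlContext: SQLContext, repairOutputPath: String): Unit = {
  val df = sqlContext.read.parquet(s"${repairOutputPath}/*.parquet")
  val total = df.count()
  val distinct = df.select("`_hoodie_record_key`").distinct().count()
  require(total == distinct, s"still ${total - distinct} duplicate row(s) after repair")
}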
@@ -186,6 +190,7 @@ class DedupeSparkJob (basePath: String,
         LOG.info(s"[FOR REAL!!!] Copying from ${srcPath} to ${dstPath}")
         FileUtil.copy(fs, srcPath, fs, dstPath, false, true, fs.getConf)
       }
-    }}
+    }
+    }
   }
 }

@@ -17,9 +17,9 @@
 package com.uber.hoodie.cli

 import com.uber.hoodie.avro.HoodieAvroWriteSupport
+import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
 import com.uber.hoodie.common.model.HoodieRecord
 import com.uber.hoodie.common.util.ParquetUtils
-import com.uber.hoodie.common.{BloomFilter, HoodieJsonPayload}
 import com.uber.hoodie.config.{HoodieIndexConfig, HoodieStorageConfig}
 import com.uber.hoodie.io.storage.{HoodieParquetConfig, HoodieParquetWriter}
 import org.apache.avro.Schema
@@ -107,7 +107,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
    * @param file
    * @param sqlContext
    */
-  def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) ={
+  def getKeyCount(file: String, sqlContext: org.apache.spark.sql.SQLContext) = {
     println(getRowKeyDF(file).collect().size)
   }

@@ -122,7 +122,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
    * @param file
    * @return
    */
-  def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String) : Boolean = {
+  def fileKeysAgainstBF(conf: Configuration, sqlContext: SQLContext, file: String): Boolean = {
     val bfStr = SparkHelpers.getBloomFilter(file, conf)
     val bf = new com.uber.hoodie.common.BloomFilter(bfStr)
     val foundCount = sqlContext.parquetFile(file)
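fileKeysAgainstBF checks an invariant of bloom filters: they admit false positives but never false negatives, so every key actually stored in the file must test positive against the file's filter. A sketch of that invariant with a stand-in filter interface (hoodie's BloomFilter API is not reproduced here):

// KeyFilter is an assumed stand-in for the real bloom filter type.
trait KeyFilter { def mightContain(key: String): Boolean }

// True iff every key physically present in the file tests positive;
// mirrors the "totalCount == foundCount" comparison in the next hunk.
def allKeysFound(keysInFile: Seq[String], bf: KeyFilter): Boolean = {
  val foundCount = keysInFile.count(bf.mightContain)
  foundCount == keysInFile.size
}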
@@ -134,7 +134,7 @@ class SparkHelper(sqlContext: SQLContext, fs: FileSystem) {
     totalCount == foundCount
   }

-  def getDistinctKeyDF(paths: List[String]) : DataFrame = {
-    sqlContext.read.parquet(paths:_*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
+  def getDistinctKeyDF(paths: List[String]): DataFrame = {
+    sqlContext.read.parquet(paths: _*).select(s"`${HoodieRecord.RECORD_KEY_METADATA_FIELD}`").distinct()
   }
 }