[HUDI-583] Code Cleanup, remove redundant code, and other changes (#1237)

2020-02-02 11:03:44 +01:00
parent f27c7a16c6
commit 5b7bb142dc
69 changed files with 447 additions and 582 deletions
--- a/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
+++ b/hudi-spark/src/main/java/org/apache/hudi/DataSourceUtils.java
@@ -43,7 +43,7 @@ import org.apache.spark.api.java.JavaSparkContext;

 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.stream.Collectors;
@@ -127,7 +127,7 @@ public class DataSourceUtils {
  }

  public static void checkRequiredProperties(TypedProperties props, List<String> checkPropNames) {
-    checkPropNames.stream().forEach(prop -> {
+    checkPropNames.forEach(prop -> {
      if (!props.containsKey(prop)) {
        throw new HoodieNotSupportedException("Required property " + prop + " is missing");
      }
@@ -182,19 +182,13 @@ public class DataSourceUtils {
  @SuppressWarnings("unchecked")
  public static JavaRDD<HoodieRecord> dropDuplicates(JavaSparkContext jssc, JavaRDD<HoodieRecord> incomingHoodieRecords,
                                                     HoodieWriteConfig writeConfig, Option<EmbeddedTimelineService> timelineService) {
-    HoodieReadClient client = null;
-    try {
-      client = new HoodieReadClient<>(jssc, writeConfig, timelineService);
+    try (HoodieReadClient client = new HoodieReadClient<>(jssc, writeConfig, timelineService)) {
      return client.tagLocation(incomingHoodieRecords)
          .filter(r -> !((HoodieRecord<HoodieRecordPayload>) r).isCurrentLocationKnown());
    } catch (TableNotFoundException e) {
      // this will be executed when there is no hoodie table yet
      // so no dups to drop
      return incomingHoodieRecords;
-    } finally {
-      if (null != client) {
-        client.close();
-      }
    }
  }

@@ -207,12 +201,12 @@ public class DataSourceUtils {
  }

  public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String basePath) {
-    checkRequiredProperties(props, Arrays.asList(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY()));
+    checkRequiredProperties(props, Collections.singletonList(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY()));
    HiveSyncConfig hiveSyncConfig = new HiveSyncConfig();
    hiveSyncConfig.basePath = basePath;
    hiveSyncConfig.usePreApacheInputFormat =
        props.getBoolean(DataSourceWriteOptions.HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY(),
-            Boolean.valueOf(DataSourceWriteOptions.DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL()));
+            Boolean.parseBoolean(DataSourceWriteOptions.DEFAULT_USE_PRE_APACHE_INPUT_FORMAT_OPT_VAL()));
    hiveSyncConfig.databaseName = props.getString(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(),
        DataSourceWriteOptions.DEFAULT_HIVE_DATABASE_OPT_VAL());
    hiveSyncConfig.tableName = props.getString(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY());
--- a/hudi-spark/src/main/scala/org/apache/hudi/AvroConversionUtils.scala
+++ b/hudi-spark/src/main/scala/org/apache/hudi/AvroConversionUtils.scala
@@ -52,7 +52,7 @@ object AvroConversionUtils {
  }

  def createRddForDeletes(df: DataFrame, rowField: String, partitionField: String): RDD[HoodieKey] = {
-    df.rdd.map(row => (new HoodieKey(row.getAs[String](rowField), row.getAs[String](partitionField))))
+    df.rdd.map(row => new HoodieKey(row.getAs[String](rowField), row.getAs[String](partitionField)))
  }

  def createDataFrame(rdd: RDD[GenericRecord], schemaStr: String, ss: SparkSession): Dataset[Row] = {
@@ -67,7 +67,7 @@ object AvroConversionUtils {
          val convertor = AvroConversionHelper.createConverterToRow(schema, dataType)
          records.map { x => convertor(x).asInstanceOf[Row] }
        }
-      }, convertAvroSchemaToStructType(new Schema.Parser().parse(schemaStr))).asInstanceOf[Dataset[Row]]
+      }, convertAvroSchemaToStructType(new Schema.Parser().parse(schemaStr)))
    }
  }

--- a/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala
+++ b/hudi-spark/src/main/scala/org/apache/hudi/DefaultSource.scala
@@ -63,7 +63,7 @@ class DefaultSource extends RelationProvider
      sqlContext.sparkContext.hadoopConfiguration.setClass(
        "mapreduce.input.pathFilter.class",
        classOf[HoodieROTablePathFilter],
-        classOf[org.apache.hadoop.fs.PathFilter]);
+        classOf[org.apache.hadoop.fs.PathFilter])

      log.info("Constructing hoodie (as parquet) data source with options :" + parameters)
      log.warn("Snapshot view not supported yet via data source, for MERGE_ON_READ tables. " +
--- a/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
+++ b/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
@@ -77,7 +77,7 @@ private[hudi] object HoodieSparkSqlWriter {

    val jsc = new JavaSparkContext(sparkContext)
    val basePath = new Path(parameters("path"))
-    val commitTime = HoodieActiveTimeline.createNewInstantTime();
+    val commitTime = HoodieActiveTimeline.createNewInstantTime()
    val fs = basePath.getFileSystem(sparkContext.hadoopConfiguration)
    var exists = fs.exists(new Path(basePath, HoodieTableMetaClient.METAFOLDER_NAME))

@@ -282,7 +282,7 @@ private[hudi] object HoodieSparkSqlWriter {
      client.close()
      commitSuccess && syncHiveSucess
    } else {
-      log.error(s"$operation failed with ${errorCount} errors :");
+      log.error(s"$operation failed with $errorCount errors :")
      if (log.isTraceEnabled) {
        log.trace("Printing out the top 100 errors")
        writeStatuses.rdd.filter(ws => ws.hasErrors)