[HUDI-2225] Add a compaction job in hudi-examples (#3347)
This commit is contained in:
@@ -0,0 +1,113 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.examples.spark
|
||||||
|
|
||||||
|
import org.apache.hudi.DataSourceWriteOptions.{PARTITIONPATH_FIELD_OPT_KEY, PRECOMBINE_FIELD_OPT_KEY, RECORDKEY_FIELD_OPT_KEY, TABLE_TYPE_OPT_KEY}
|
||||||
|
import org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs
|
||||||
|
import org.apache.hudi.client.SparkRDDWriteClient
|
||||||
|
import org.apache.hudi.client.common.HoodieSparkEngineContext
|
||||||
|
import org.apache.hudi.common.model.{HoodieAvroPayload, HoodieRecordPayload, HoodieTableType}
|
||||||
|
import org.apache.hudi.common.util.Option
|
||||||
|
import org.apache.hudi.config.HoodieWriteConfig.TABLE_NAME
|
||||||
|
import org.apache.hudi.config.{HoodieCompactionConfig, HoodieWriteConfig}
|
||||||
|
import org.apache.hudi.examples.common.{HoodieExampleDataGenerator, HoodieExampleSparkUtils}
|
||||||
|
import org.apache.spark.sql.SaveMode.{Append, Overwrite}
|
||||||
|
import org.apache.spark.sql.SparkSession
|
||||||
|
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple example to run a compaction job for MOR table.
|
||||||
|
* To run this example, you should:
|
||||||
|
* 1. For running in IDE, set VM options `-Dspark.master=local[2]`
|
||||||
|
* 2. For running in shell, using `spark-submit`
|
||||||
|
*
|
||||||
|
* Usage: HoodieMorCompactionJob <tablePath> <tableName>.
|
||||||
|
* <tablePath> and <tableName> describe root path of hudi and table name
|
||||||
|
* for example, `HoodieMorCompactionJob file:///tmp/hoodie/hudi_mor_table hudi_mor_table`
|
||||||
|
*/
|
||||||
|
object HoodieMorCompactionJob {

  /**
   * Entry point. Seeds a MERGE_ON_READ table with one insert commit and one
   * update (delta) commit, then schedules and runs an inline compaction on it.
   *
   * Expects two positional arguments: &lt;tablePath&gt; &lt;tableName&gt;.
   */
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println("Usage: HoodieMorCompactionJob <tablePath> <tableName>")
      System.exit(1)
    }

    val spark = HoodieExampleSparkUtils.defaultSparkSession("Hudi MOR table compaction via Spark example")
    val dataGen = new HoodieExampleDataGenerator[HoodieAvroPayload]
    val tablePath = args(0)
    val tableName = args(1)

    // Insert then update so the table has log files for the compactor to merge.
    insertData(spark, tablePath, tableName, dataGen, HoodieTableType.MERGE_ON_READ.name())
    updateData(spark, tablePath, tableName, dataGen, HoodieTableType.MERGE_ON_READ.name())

    val cfg = HoodieWriteConfig.newBuilder()
      .withPath(tablePath)
      .withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA)
      .withParallelism(2, 2)
      .forTable(tableName)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withInlineCompaction(true)
        // Compact after every delta commit so this example always has work to do.
        .withMaxNumDeltaCommitsBeforeCompaction(1).build())
      .build()
    val client = new SparkRDDWriteClient[HoodieRecordPayload[Nothing]](new HoodieSparkEngineContext(spark.sparkContext), cfg)
    try {
      val instant = client.scheduleCompaction(Option.empty())
      // scheduleCompaction may return an empty Option when nothing is eligible
      // to compact; guard before dereferencing instead of calling get() blindly.
      if (instant.isPresent) {
        client.compact(instant.get())
        client.clean()
      } else {
        System.err.println("No compaction was scheduled; nothing to compact.")
      }
    } catch {
      // The original passed two arguments to println, which Scala auto-tuples
      // into printing "(message, exception)" and drops the stack trace.
      // Print a real message and the full trace instead.
      case e: Exception =>
        System.err.println(s"Compaction failed due to: ${e.getMessage}")
        e.printStackTrace(System.err)
    } finally {
      client.close()
      spark.stop()
    }
  }

  /**
   * Bootstraps the table: writes 20 generated trip records with
   * SaveMode.Overwrite, creating (or replacing) the table at `tablePath`.
   */
  def insertData(spark: SparkSession, tablePath: String, tableName: String,
                 dataGen: HoodieExampleDataGenerator[HoodieAvroPayload], tableType: String): Unit = {
    val commitTime: String = System.currentTimeMillis().toString
    val inserts = dataGen.convertToStringList(dataGen.generateInserts(commitTime, 20))
    val df = spark.read.json(spark.sparkContext.parallelize(inserts.asScala, 1))
    writeData(df, tablePath, tableName, tableType, Overwrite)
  }

  /**
   * Produces a delta commit: writes 10 generated updates with SaveMode.Append,
   * giving the compactor log files to merge into base files.
   */
  def updateData(spark: SparkSession, tablePath: String, tableName: String,
                 dataGen: HoodieExampleDataGenerator[HoodieAvroPayload], tableType: String): Unit = {
    val commitTime: String = System.currentTimeMillis().toString
    val updates = dataGen.convertToStringList(dataGen.generateUpdates(commitTime, 10))
    val df = spark.read.json(spark.sparkContext.parallelize(updates.asScala, 1))
    writeData(df, tablePath, tableName, tableType, Append)
  }

  /**
   * Shared write path for insert/update: identical Hudi options, differing
   * only in the SaveMode (Overwrite for the initial insert, Append for updates).
   */
  private def writeData(df: org.apache.spark.sql.DataFrame, tablePath: String,
                        tableName: String, tableType: String,
                        mode: org.apache.spark.sql.SaveMode): Unit = {
    df.write.format("org.apache.hudi").
      options(getQuickstartWriteConfigs).
      option(PRECOMBINE_FIELD_OPT_KEY.key, "ts").
      option(RECORDKEY_FIELD_OPT_KEY.key, "uuid").
      option(PARTITIONPATH_FIELD_OPT_KEY.key, "partitionpath").
      option(TABLE_NAME.key, tableName).
      option(TABLE_TYPE_OPT_KEY.key, tableType).
      mode(mode).
      save(tablePath)
  }
}
|
||||||
Reference in New Issue
Block a user