1
0

[HUDI-1399] support a independent clustering spark job to asynchronously clustering (#2379)

* [HUDI-1481]  add  structured streaming and delta streamer clustering unit test

* [HUDI-1399] support a independent clustering spark job to asynchronously clustering

* [HUDI-1399]  support a  independent clustering spark job to asynchronously clustering

* [HUDI-1498] Read clustering plan from requested file for inflight instant (#2389)

* [HUDI-1399]  support  a independent clustering spark job with schedule generate instant time

Co-authored-by: satishkotha <satishkotha@uber.com>
This commit is contained in:
lw0090
2021-01-10 09:30:16 +08:00
committed by GitHub
parent 65866c45ec
commit 368c1a8f5c
10 changed files with 338 additions and 38 deletions

View File

@@ -204,9 +204,10 @@ class TestStructuredStreaming extends HoodieClientTestBase {
// check have schedule clustering and clustering file group to one
waitTillHasCompletedReplaceInstant(destPath, 120, 5)
metaClient.reloadActiveTimeline()
assertEquals(1, getLatestFileGroupsFileId.size)
assertEquals(1, getLatestFileGroupsFileId(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH).size)
}
structuredStreamingForTestClusteringRunner(sourcePath, destPath, true, checkClusteringResult)
structuredStreamingForTestClusteringRunner(sourcePath, destPath, true,
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, checkClusteringResult)
}
@Test
@@ -219,21 +220,21 @@ class TestStructuredStreaming extends HoodieClientTestBase {
override def execute(): Unit = {
waitTillHasCompletedReplaceInstant(destPath, 120, 5)
}
}
, "Should have replace commit completed")
}, msg)
println(msg)
}
structuredStreamingForTestClusteringRunner(sourcePath, destPath, false, checkClusteringResult)
structuredStreamingForTestClusteringRunner(sourcePath, destPath, false,
HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH, checkClusteringResult)
}
def structuredStreamingForTestClusteringRunner(sourcePath: String, destPath: String,
isInlineClustering: Boolean, checkClusteringResult: String => Unit): Unit = {
isInlineClustering: Boolean, partitionOfRecords: String, checkClusteringResult: String => Unit): Unit = {
// First insert of data
val records1 = recordsToStrings(dataGen.generateInsertsForPartition("000", 100, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).toList
val records1 = recordsToStrings(dataGen.generateInsertsForPartition("000", 100, partitionOfRecords)).toList
val inputDF1 = spark.read.json(spark.sparkContext.parallelize(records1, 2))
// Second insert of data
val records2 = recordsToStrings(dataGen.generateInsertsForPartition("001", 100, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)).toList
val records2 = recordsToStrings(dataGen.generateInsertsForPartition("001", 100, partitionOfRecords)).toList
val inputDF2 = spark.read.json(spark.sparkContext.parallelize(records2, 2))
val hudiOptions = getInlineClusteringOpts(isInlineClustering.toString, "2", 100)
@@ -252,7 +253,7 @@ class TestStructuredStreaming extends HoodieClientTestBase {
// check have more than one file group
this.metaClient = new HoodieTableMetaClient(fs.getConf, destPath, true)
assertTrue(getLatestFileGroupsFileId().size > 1)
assertTrue(getLatestFileGroupsFileId(partitionOfRecords).size > 1)
// check clustering result
checkClusteringResult(destPath)
@@ -265,10 +266,10 @@ class TestStructuredStreaming extends HoodieClientTestBase {
Await.result(Future.sequence(Seq(f1, f2)), Duration.Inf)
}
private def getLatestFileGroupsFileId():Array[String] = {
private def getLatestFileGroupsFileId(partition: String):Array[String] = {
getHoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline,
HoodieTestTable.of(metaClient).listAllBaseFiles())
tableView.getLatestFileSlices(HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH)
tableView.getLatestFileSlices(partition)
.toArray().map(slice => slice.asInstanceOf[FileSlice].getFileGroupId.getFileId)
}
@@ -283,7 +284,7 @@ class TestStructuredStreaming extends HoodieClientTestBase {
this.metaClient.reloadActiveTimeline()
val completeReplaceSize = this.metaClient.getActiveTimeline.getCompletedReplaceTimeline().getInstants.toArray.size
println("completeReplaceSize:" + completeReplaceSize)
if(completeReplaceSize > 0) {
if (completeReplaceSize > 0) {
success = true
}
} catch {
@@ -293,7 +294,7 @@ class TestStructuredStreaming extends HoodieClientTestBase {
Thread.sleep(sleepSecsAfterEachRun * 1000)
currTime = System.currentTimeMillis
}
if (!success) throw new IllegalStateException("Timed-out waiting for " + " have completed replace instant appear in " + tablePath)
if (!success) throw new IllegalStateException("Timed-out waiting for completing replace instant appear in " + tablePath)
}
}