[HUDI-2778] Optimize statistics collection related codes and add some docs for z-order add fix some bugs (#4013)
* [HUDI-2778] Optimize statistics collection related codes and add more docs for z-order. * add test code for multi-thread parquet footer read
This commit is contained in:
@@ -21,9 +21,11 @@ package org.apache.hudi.functional
|
||||
import java.sql.{Date, Timestamp}
|
||||
|
||||
import org.apache.hadoop.fs.Path
|
||||
import org.apache.hudi.common.model.HoodieFileFormat
|
||||
import org.apache.hudi.config.{HoodieClusteringConfig, HoodieWriteConfig}
|
||||
import org.apache.hudi.{DataSourceReadOptions, DataSourceWriteOptions}
|
||||
import org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings
|
||||
import org.apache.hudi.common.util.{BaseFileUtils, ParquetUtils}
|
||||
import org.apache.hudi.testutils.HoodieClientTestBase
|
||||
import org.apache.spark.ZCurveOptimizeHelper
|
||||
import org.apache.spark.sql._
|
||||
@@ -88,8 +90,23 @@ class TestOptimizeTable extends HoodieClientTestBase {
|
||||
.save(basePath)
|
||||
|
||||
assertEquals(1000, spark.read.format("hudi").load(basePath).count())
|
||||
assertEquals(1000,
|
||||
spark.read.option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "true").format("hudi").load(basePath).count())
|
||||
// use unsorted col as filter.
|
||||
assertEquals(spark.read
|
||||
.format("hudi").load(basePath).where("end_lat >= 0 and rider != '1' and weight > 0.0").count(),
|
||||
spark.read.option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "true")
|
||||
.format("hudi").load(basePath).where("end_lat >= 0 and rider != '1' and weight > 0.0").count())
|
||||
// use sorted col as filter.
|
||||
assertEquals(spark.read.format("hudi").load(basePath)
|
||||
.where("begin_lat >= 0.49 and begin_lat < 0.51 and begin_lon >= 0.49 and begin_lon < 0.51").count(),
|
||||
spark.read.option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "true")
|
||||
.format("hudi").load(basePath)
|
||||
.where("begin_lat >= 0.49 and begin_lat < 0.51 and begin_lon >= 0.49 and begin_lon < 0.51").count())
|
||||
// use sorted cols and unsorted cols as filter
|
||||
assertEquals(spark.read.format("hudi").load(basePath)
|
||||
.where("begin_lat >= 0.49 and begin_lat < 0.51 and end_lat > 0.56").count(),
|
||||
spark.read.option(DataSourceReadOptions.ENABLE_DATA_SKIPPING.key(), "true")
|
||||
.format("hudi").load(basePath)
|
||||
.where("begin_lat >= 0.49 and begin_lat < 0.51 and end_lat > 0.56").count())
|
||||
}
|
||||
|
||||
@Test
|
||||
@@ -97,10 +114,13 @@ class TestOptimizeTable extends HoodieClientTestBase {
|
||||
val testPath = new Path(System.getProperty("java.io.tmpdir"), "minMax")
|
||||
val statisticPath = new Path(System.getProperty("java.io.tmpdir"), "stat")
|
||||
val fs = testPath.getFileSystem(spark.sparkContext.hadoopConfiguration)
|
||||
val complexDataFrame = createComplexDataFrame(spark)
|
||||
complexDataFrame.repartition(3).write.mode("overwrite").save(testPath.toString)
|
||||
val df = spark.read.load(testPath.toString)
|
||||
try {
|
||||
val complexDataFrame = createComplexDataFrame(spark)
|
||||
complexDataFrame.repartition(3).write.mode("overwrite").save(testPath.toString)
|
||||
val df = spark.read.load(testPath.toString)
|
||||
// test z-order sort for all primitive type, should not throw exception.
|
||||
ZCurveOptimizeHelper.createZIndexedDataFrameByMapValue(df, "c1,c2,c3,c4,c5,c6,c7,c8", 20).show(1)
|
||||
ZCurveOptimizeHelper.createZIndexedDataFrameBySample(df, "c1,c2,c3,c4,c5,c6,c7,c8", 20).show(1)
|
||||
// do not support TimeStampType, so if we collect statistics for c4, should throw exception
|
||||
val colDf = ZCurveOptimizeHelper.getMinMaxValue(df, "c1,c2,c3,c5,c6,c7,c8")
|
||||
colDf.cache()
|
||||
@@ -115,12 +135,59 @@ class TestOptimizeTable extends HoodieClientTestBase {
|
||||
ZCurveOptimizeHelper.saveStatisticsInfo(df, "c1,c2,c3,c5,c6,c7,c8", statisticPath.toString, "4", Seq("0", "1", "3"))
|
||||
assertEquals(!fs.exists(new Path(statisticPath, "2")), true)
|
||||
assertEquals(fs.exists(new Path(statisticPath, "3")), true)
|
||||
// test to save different index, new index on ("c1,c6,c7,c8") should be successfully saved.
|
||||
ZCurveOptimizeHelper.saveStatisticsInfo(df, "c1,c6,c7,c8", statisticPath.toString, "5", Seq("0", "1", "3", "4"))
|
||||
assertEquals(fs.exists(new Path(statisticPath, "5")), true)
|
||||
} finally {
|
||||
if (fs.exists(testPath)) fs.delete(testPath)
|
||||
if (fs.exists(statisticPath)) fs.delete(statisticPath)
|
||||
}
|
||||
}
|
||||
|
||||
// test collect min-max statistic info for DateType in the case of multithreading.
|
||||
// parquet will give a wrong statistic result for DateType in the case of multithreading.
|
||||
@Test
|
||||
def testMultiThreadParquetFooterReadForDateType(): Unit = {
|
||||
// create parquet file with DateType
|
||||
val rdd = spark.sparkContext.parallelize(0 to 100, 1)
|
||||
.map(item => RowFactory.create(Date.valueOf(s"${2020}-${item % 11 + 1}-${item % 28 + 1}")))
|
||||
val df = spark.createDataFrame(rdd, new StructType().add("id", DateType))
|
||||
val testPath = new Path(System.getProperty("java.io.tmpdir"), "testCollectDateType")
|
||||
val conf = spark.sparkContext.hadoopConfiguration
|
||||
val cols = new java.util.ArrayList[String]
|
||||
cols.add("id")
|
||||
try {
|
||||
df.repartition(3).write.mode("overwrite").save(testPath.toString)
|
||||
val inputFiles = spark.read.load(testPath.toString).inputFiles.sortBy(x => x)
|
||||
|
||||
val realResult = new Array[(String, String)](3)
|
||||
inputFiles.zipWithIndex.foreach { case (f, index) =>
|
||||
val fileUtils = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET).asInstanceOf[ParquetUtils]
|
||||
val res = fileUtils.readRangeFromParquetMetadata(conf, new Path(f), cols).iterator().next()
|
||||
realResult(index) = (res.getMinValueAsString, res.getMaxValueAsString)
|
||||
}
|
||||
|
||||
// multi thread read with no lock
|
||||
val resUseLock = new Array[(String, String)](3)
|
||||
inputFiles.zipWithIndex.par.foreach { case (f, index) =>
|
||||
val fileUtils = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET).asInstanceOf[ParquetUtils]
|
||||
val res = fileUtils.readRangeFromParquetMetadata(conf, new Path(f), cols).iterator().next()
|
||||
resUseLock(index) = (res.getMinValueAsString, res.getMaxValueAsString)
|
||||
}
|
||||
|
||||
// check resUseNoLock,
|
||||
// We can't guarantee that there must be problems in the case of multithreading.
|
||||
// In order to make ut pass smoothly, we will not check resUseNoLock.
|
||||
// check resUseLock
|
||||
// should pass assert
|
||||
realResult.zip(resUseLock).foreach { case (realValue, testValue) =>
|
||||
assert(realValue == testValue, s" expect realValue: ${realValue} but find ${testValue}")
|
||||
}
|
||||
} finally {
|
||||
if (fs.exists(testPath)) fs.delete(testPath)
|
||||
}
|
||||
}
|
||||
|
||||
def createComplexDataFrame(spark: SparkSession): DataFrame = {
|
||||
val schema = new StructType()
|
||||
.add("c1", IntegerType)
|
||||
|
||||
Reference in New Issue
Block a user