[HUDI-3457] Refactored Spark DataSource Relations to avoid code duplication (#4877)
Refactoring Spark DataSource Relations to avoid code duplication. The following Relations were in scope: - BaseFileOnlyViewRelation - MergeOnReadSnapshotRelation - MergeOnReadIncrementalRelation
This commit is contained in:
@@ -88,35 +88,6 @@ class TestHoodieSparkUtils {
|
||||
.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
|
||||
}
|
||||
|
||||
@Test
|
||||
/**
 * Verifies that the index returned by `HoodieSparkUtils.createInMemoryFileIndex`
 * reports exactly the files created under the folders passed to it.
 *
 * @param tempDir directory (annotated with `@TempDir`) under which the folder/file
 *                fixture is created
 */
def testCreateInMemoryIndex(@TempDir tempDir: File): Unit = {
  val spark = SparkSession.builder
    .appName("Hoodie Datasource test")
    .master("local[2]")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate

  // The session must be stopped even when the assertion below fails or the index
  // creation throws; otherwise the local session leaks into subsequent tests.
  try {
    val folders: Seq[Path] = Seq(
      new Path(Paths.get(tempDir.getAbsolutePath, "folder1").toUri),
      new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri)
    )

    val files: Seq[Path] = Seq(
      new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file1").toUri),
      new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file2").toUri),
      new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file3").toUri),
      new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file4").toUri)
    )

    // Materialize the fixture on disk: two folders holding two empty files each
    folders.foreach(folder => new File(folder.toUri).mkdir())
    files.foreach(file => new File(file.toUri).createNewFile())

    val index = HoodieSparkUtils.createInMemoryFileIndex(spark, folders)
    val indexedFilePaths = index.allFiles().map(fs => fs.getPath)

    // Listing order is not relied upon, hence both sides are sorted before comparison
    assertEquals(files.sortWith(_.toString < _.toString), indexedFilePaths.sortWith(_.toString < _.toString))
  } finally {
    spark.stop()
  }
}
|
||||
|
||||
@Test
|
||||
def testCreateRddSchemaEvol(): Unit = {
|
||||
val spark = SparkSession.builder
|
||||
|
||||
@@ -112,9 +112,9 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
|
||||
val fullColumnsReadStats: Array[(String, Long)] =
|
||||
if (HoodieSparkUtils.isSpark3)
|
||||
Array(
|
||||
("rider", 14665),
|
||||
("rider,driver", 14665),
|
||||
("rider,driver,tip_history", 14665))
|
||||
("rider", 14166),
|
||||
("rider,driver", 14166),
|
||||
("rider,driver,tip_history", 14166))
|
||||
else if (HoodieSparkUtils.isSpark2)
|
||||
// TODO re-enable tests (these tests are very unstable currently)
|
||||
Array(
|
||||
@@ -163,11 +163,29 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
|
||||
else
|
||||
fail("Only Spark 3 and Spark 2 are currently supported")
|
||||
|
||||
// Stats for the reads fetching _all_ columns (currently, for MOR to be able to merge
|
||||
// records properly, the full row has to be fetched; note how the amount of bytes read
|
||||
// is independent of the # of columns)
|
||||
val fullColumnsReadStats: Array[(String, Long)] =
|
||||
if (HoodieSparkUtils.isSpark3)
|
||||
Array(
|
||||
("rider", 14166),
|
||||
("rider,driver", 14166),
|
||||
("rider,driver,tip_history", 14166))
|
||||
else if (HoodieSparkUtils.isSpark2)
|
||||
// TODO re-enable tests (these tests are very unstable currently)
|
||||
Array(
|
||||
("rider", -1),
|
||||
("rider,driver", -1),
|
||||
("rider,driver,tip_history", -1))
|
||||
else
|
||||
fail("Only Spark 3 and Spark 2 are currently supported")
|
||||
|
||||
// Test MOR / Snapshot / Skip-merge
|
||||
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL, projectedColumnsReadStats)
|
||||
|
||||
// Test MOR / Snapshot / Payload-combine
|
||||
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL, projectedColumnsReadStats)
|
||||
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL, fullColumnsReadStats)
|
||||
|
||||
// Test MOR / Read Optimized
|
||||
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, "null", projectedColumnsReadStats)
|
||||
@@ -209,9 +227,9 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
|
||||
val fullColumnsReadStats: Array[(String, Long)] =
|
||||
if (HoodieSparkUtils.isSpark3)
|
||||
Array(
|
||||
("rider", 19683),
|
||||
("rider,driver", 19683),
|
||||
("rider,driver,tip_history", 19683))
|
||||
("rider", 19684),
|
||||
("rider,driver", 19684),
|
||||
("rider,driver,tip_history", 19684))
|
||||
else if (HoodieSparkUtils.isSpark2)
|
||||
// TODO re-enable tests (these tests are very unstable currently)
|
||||
Array(
|
||||
|
||||
Reference in New Issue
Block a user