1
0

[HUDI-3457] Refactored Spark DataSource Relations to avoid code duplication (#4877)

Refactoring Spark DataSource Relations to avoid code duplication. 

Following Relations were in scope:

- BaseFileOnlyViewRelation
- MergeOnReadSnapshotRelaation
- MergeOnReadIncrementalRelation
This commit is contained in:
Alexey Kudinkin
2022-03-18 22:32:16 -07:00
committed by GitHub
parent 316e38c71e
commit 099c2c099a
22 changed files with 1051 additions and 572 deletions

View File

@@ -88,35 +88,6 @@ class TestHoodieSparkUtils {
.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
}
@Test
def testCreateInMemoryIndex(@TempDir tempDir: File): Unit = {
val spark = SparkSession.builder
.appName("Hoodie Datasource test")
.master("local[2]")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.getOrCreate
val folders: Seq[Path] = Seq(
new Path(Paths.get(tempDir.getAbsolutePath, "folder1").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri)
)
val files: Seq[Path] = Seq(
new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file1").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file2").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file3").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file4").toUri)
)
folders.foreach(folder => new File(folder.toUri).mkdir())
files.foreach(file => new File(file.toUri).createNewFile())
val index = HoodieSparkUtils.createInMemoryFileIndex(spark, Seq(folders(0), folders(1)))
val indexedFilePaths = index.allFiles().map(fs => fs.getPath)
assertEquals(files.sortWith(_.toString < _.toString), indexedFilePaths.sortWith(_.toString < _.toString))
spark.stop()
}
@Test
def testCreateRddSchemaEvol(): Unit = {
val spark = SparkSession.builder

View File

@@ -112,9 +112,9 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
val fullColumnsReadStats: Array[(String, Long)] =
if (HoodieSparkUtils.isSpark3)
Array(
("rider", 14665),
("rider,driver", 14665),
("rider,driver,tip_history", 14665))
("rider", 14166),
("rider,driver", 14166),
("rider,driver,tip_history", 14166))
else if (HoodieSparkUtils.isSpark2)
// TODO re-enable tests (these tests are very unstable currently)
Array(
@@ -163,11 +163,29 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
else
fail("Only Spark 3 and Spark 2 are currently supported")
// Stats for the reads fetching _all_ columns (currently for MOR to be able to merge
// records properly full row has to be fetched; note, how amount of bytes read
// is invariant of the # of columns)
val fullColumnsReadStats: Array[(String, Long)] =
if (HoodieSparkUtils.isSpark3)
Array(
("rider", 14166),
("rider,driver", 14166),
("rider,driver,tip_history", 14166))
else if (HoodieSparkUtils.isSpark2)
// TODO re-enable tests (these tests are very unstable currently)
Array(
("rider", -1),
("rider,driver", -1),
("rider,driver,tip_history", -1))
else
fail("Only Spark 3 and Spark 2 are currently supported")
// Test MOR / Snapshot / Skip-merge
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, DataSourceReadOptions.REALTIME_SKIP_MERGE_OPT_VAL, projectedColumnsReadStats)
// Test MOR / Snapshot / Payload-combine
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL, projectedColumnsReadStats)
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL, DataSourceReadOptions.REALTIME_PAYLOAD_COMBINE_OPT_VAL, fullColumnsReadStats)
// Test MOR / Read Optimized
runTest(tableState, DataSourceReadOptions.QUERY_TYPE_READ_OPTIMIZED_OPT_VAL, "null", projectedColumnsReadStats)
@@ -209,9 +227,9 @@ class TestParquetColumnProjection extends SparkClientFunctionalTestHarness with
val fullColumnsReadStats: Array[(String, Long)] =
if (HoodieSparkUtils.isSpark3)
Array(
("rider", 19683),
("rider,driver", 19683),
("rider,driver,tip_history", 19683))
("rider", 19684),
("rider,driver", 19684),
("rider,driver,tip_history", 19684))
else if (HoodieSparkUtils.isSpark2)
// TODO re-enable tests (these tests are very unstable currently)
Array(