1
0

[HUDI-2494] Fixing glob pattern to skip all hoodie meta paths (#3768)

This commit is contained in:
Sivabalan Narayanan
2021-10-12 14:06:40 -04:00
committed by GitHub
parent 252c4ed380
commit 8a487eafa7
3 changed files with 37 additions and 15 deletions

View File

@@ -41,14 +41,18 @@ class TestHoodieSparkUtils {
def testGlobPaths(@TempDir tempDir: File): Unit = {
val folders: Seq[Path] = Seq(
new Path(Paths.get(tempDir.getAbsolutePath, "folder1").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri)
new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie", "metadata").toUri)
)
val files: Seq[Path] = Seq(
new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file1").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file2").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file3").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file4").toUri)
new Path(Paths.get(tempDir.getAbsolutePath, "folder2","file4").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie","metadata", "file5").toUri),
new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie","metadata", "file6").toUri)
)
folders.foreach(folder => new File(folder.toUri).mkdir())
@@ -57,12 +61,14 @@ class TestHoodieSparkUtils {
var paths = Seq(tempDir.getAbsolutePath + "/*")
var globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths,
new Path(paths.head).getFileSystem(new Configuration()))
assertEquals(folders.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
assertEquals(folders.filterNot(entry => entry.toString.contains(".hoodie"))
.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
paths = Seq(tempDir.getAbsolutePath + "/*/*")
globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths,
new Path(paths.head).getFileSystem(new Configuration()))
assertEquals(files.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
assertEquals(files.filterNot(entry => entry.toString.contains(".hoodie"))
.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
paths = Seq(tempDir.getAbsolutePath + "/folder1/*")
globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths,
@@ -79,7 +85,8 @@ class TestHoodieSparkUtils {
paths = Seq(tempDir.getAbsolutePath + "/folder1/*", tempDir.getAbsolutePath + "/folder2/*")
globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths,
new Path(paths.head).getFileSystem(new Configuration()))
assertEquals(files.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
assertEquals(files.filterNot(entry => entry.toString.contains(".hoodie"))
.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
}
@Test

View File

@@ -419,7 +419,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
@Test def testSparkPartitonByWithCustomKeyGenerator(): Unit = {
// Without fieldType, the default is SIMPLE
var writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, false)
var writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName)
writer.partitionBy("current_ts")
.mode(SaveMode.Overwrite)
.save(basePath)
@@ -428,7 +428,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("current_ts").cast("string")).count() == 0)
// Specify fieldType as TIMESTAMP
writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, false)
writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName)
writer.partitionBy("current_ts:TIMESTAMP")
.option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS")
.option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd")
@@ -504,7 +504,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
}
@Test def testSparkPartitonByWithTimestampBasedKeyGenerator() {
val writer = getDataFrameWriter(classOf[TimestampBasedKeyGenerator].getName, false)
val writer = getDataFrameWriter(classOf[TimestampBasedKeyGenerator].getName)
writer.partitionBy("current_ts")
.option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS")
.option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd")