[HUDI-2494] Fixing glob pattern to skip all hoodie meta paths (#3768)
This commit is contained in:
committed by
GitHub
parent
252c4ed380
commit
8a487eafa7
@@ -41,14 +41,18 @@ class TestHoodieSparkUtils {
|
||||
def testGlobPaths(@TempDir tempDir: File): Unit = {
|
||||
val folders: Seq[Path] = Seq(
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder1").toUri),
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri)
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder2").toUri),
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie").toUri),
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie", "metadata").toUri)
|
||||
)
|
||||
|
||||
val files: Seq[Path] = Seq(
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file1").toUri),
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder1", "file2").toUri),
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file3").toUri),
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder2", "file4").toUri)
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, "folder2","file4").toUri),
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie","metadata", "file5").toUri),
|
||||
new Path(Paths.get(tempDir.getAbsolutePath, ".hoodie","metadata", "file6").toUri)
|
||||
)
|
||||
|
||||
folders.foreach(folder => new File(folder.toUri).mkdir())
|
||||
@@ -57,12 +61,14 @@ class TestHoodieSparkUtils {
|
||||
var paths = Seq(tempDir.getAbsolutePath + "/*")
|
||||
var globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths,
|
||||
new Path(paths.head).getFileSystem(new Configuration()))
|
||||
assertEquals(folders.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
|
||||
assertEquals(folders.filterNot(entry => entry.toString.contains(".hoodie"))
|
||||
.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
|
||||
|
||||
paths = Seq(tempDir.getAbsolutePath + "/*/*")
|
||||
globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths,
|
||||
new Path(paths.head).getFileSystem(new Configuration()))
|
||||
assertEquals(files.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
|
||||
assertEquals(files.filterNot(entry => entry.toString.contains(".hoodie"))
|
||||
.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
|
||||
|
||||
paths = Seq(tempDir.getAbsolutePath + "/folder1/*")
|
||||
globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths,
|
||||
@@ -79,7 +85,8 @@ class TestHoodieSparkUtils {
|
||||
paths = Seq(tempDir.getAbsolutePath + "/folder1/*", tempDir.getAbsolutePath + "/folder2/*")
|
||||
globbedPaths = HoodieSparkUtils.checkAndGlobPathIfNecessary(paths,
|
||||
new Path(paths.head).getFileSystem(new Configuration()))
|
||||
assertEquals(files.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
|
||||
assertEquals(files.filterNot(entry => entry.toString.contains(".hoodie"))
|
||||
.sortWith(_.toString < _.toString), globbedPaths.sortWith(_.toString < _.toString))
|
||||
}
|
||||
|
||||
@Test
|
||||
|
||||
@@ -419,7 +419,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
|
||||
@Test def testSparkPartitonByWithCustomKeyGenerator(): Unit = {
|
||||
// Without fieldType, the default is SIMPLE
|
||||
var writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, false)
|
||||
var writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName)
|
||||
writer.partitionBy("current_ts")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(basePath)
|
||||
@@ -428,7 +428,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
assertTrue(recordsReadDF.filter(col("_hoodie_partition_path") =!= col("current_ts").cast("string")).count() == 0)
|
||||
|
||||
// Specify fieldType as TIMESTAMP
|
||||
writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName, false)
|
||||
writer = getDataFrameWriter(classOf[CustomKeyGenerator].getName)
|
||||
writer.partitionBy("current_ts:TIMESTAMP")
|
||||
.option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS")
|
||||
.option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd")
|
||||
@@ -504,7 +504,7 @@ class TestCOWDataSource extends HoodieClientTestBase {
|
||||
}
|
||||
|
||||
@Test def testSparkPartitonByWithTimestampBasedKeyGenerator() {
|
||||
val writer = getDataFrameWriter(classOf[TimestampBasedKeyGenerator].getName, false)
|
||||
val writer = getDataFrameWriter(classOf[TimestampBasedKeyGenerator].getName)
|
||||
writer.partitionBy("current_ts")
|
||||
.option(Config.TIMESTAMP_TYPE_FIELD_PROP, "EPOCHMILLISECONDS")
|
||||
.option(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP, "yyyyMMdd")
|
||||
|
||||
Reference in New Issue
Block a user