1
0

[HUDI-1330] handle prefix filtering at directory level (#2157)

The current DFSPathSelector only ignores the prefixes (_, .) at the file level, while files under subdirectories,
e.g. (.checkpoint/*), are still considered, which results in a bad-format exception during reading.
This commit is contained in:
Ho Tien Vu
2020-10-21 14:20:19 +08:00
committed by GitHub
parent fd269ddeb0
commit af5ef4d49d
2 changed files with 40 additions and 22 deletions

View File

@@ -175,5 +175,18 @@ public abstract class AbstractDFSSourceTestBase extends UtilitiesTestBase {
InputBatch<JavaRDD<GenericRecord>> fetch5 = sourceFormatAdapter.fetchNewDataInAvroFormat(
Option.empty(), Long.MAX_VALUE);
assertEquals(10100, fetch5.getBatch().get().count());
// 6. Should skip files/directories whose names start with prefixes ("_", ".")
generateOneFile(".checkpoint/3", "002", 100);
generateOneFile("_checkpoint/3", "002", 100);
generateOneFile(".3", "002", 100);
generateOneFile("_3", "002", 100);
// also works with nested directories
generateOneFile("foo/.bar/3", "002", 1); // not ok
generateOneFile("foo/bar/3", "002", 1); // ok
// fetch everything from the beginning
InputBatch<JavaRDD<GenericRecord>> fetch6 = sourceFormatAdapter.fetchNewDataInAvroFormat(
Option.empty(), Long.MAX_VALUE);
assertEquals(10101, fetch6.getBatch().get().count());
}
}