1
0

Make sure properties set in HoodieWriteConfig are propagated down to the individual configs. Fix a race condition which lets InputFormat think a file's size is 0 when it actually is not

This commit is contained in:
Prasanna Rajaperumal
2017-04-27 10:37:30 -07:00
committed by prazanna
parent 91b088f29f
commit 8974e11161
8 changed files with 64 additions and 5 deletions

View File

@@ -114,6 +114,7 @@ public class HoodieInputFormat extends MapredParquetInputFormat
.collect(Collectors.toList());
for (HoodieDataFile filteredFile : filteredFiles) {
LOG.info("Processing incremental hoodie file - " + filteredFile.getPath());
filteredFile = checkFileStatus(fsView, filteredFile);
returns.add(filteredFile.getFileStatus());
}
LOG.info(
@@ -126,6 +127,7 @@ public class HoodieInputFormat extends MapredParquetInputFormat
if (LOG.isDebugEnabled()) {
LOG.debug("Processing latest hoodie file - " + filteredFile.getPath());
}
filteredFile = checkFileStatus(fsView, filteredFile);
returns.add(filteredFile.getFileStatus());
}
}
@@ -134,6 +136,24 @@ public class HoodieInputFormat extends MapredParquetInputFormat
}
/**
 * Guards against a race condition that can leave a data file reporting a size of 0.
 *
 * <p>The sequence that leads to the stale size:
 * <ol>
 *   <li>HiveInputFormat does super.listStatus() and gets back a FileStatus[].</li>
 *   <li>It then creates the HoodieTableMetaClient for the paths listed.</li>
 *   <li>Split generation looks at the FileStatus size to create splits, so a file
 *       reporting size 0 is skipped.</li>
 * </ol>
 *
 * @param fsView file system view used to look up the file status again by path
 * @param fileStatus the data file whose reported size is inspected
 * @return {@code fileStatus} itself when its reported size is non-zero; otherwise a new
 *     {@link HoodieDataFile} wrapping the status returned by {@code fsView} for the same path
 */
private HoodieDataFile checkFileStatus(TableFileSystemView fsView, HoodieDataFile fileStatus) {
  // Common case: the size is already populated, so the file can be used as-is.
  if (fileStatus.getFileSize() != 0) {
    return fileStatus;
  }

  // A zero size is treated as stale; look the status up again through the view.
  LOG.info("Refreshing file status " + fileStatus.getPath());
  return new HoodieDataFile(fsView.getFileStatus(fileStatus.getPath()));
}
private Map<HoodieTableMetaClient, List<FileStatus>> groupFileStatus(FileStatus[] fileStatuses)
throws IOException {
// This assumes the paths for different tables are grouped together