1
0

[HUDI-3180] Include files from completed commits while bootstrapping metadata table (#4519)

This commit is contained in:
Sivabalan Narayanan
2022-01-10 15:33:15 -05:00
committed by GitHub
parent bc95571caa
commit 7a8b94c82d
3 changed files with 47 additions and 1 deletions

View File

@@ -746,9 +746,16 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
HoodieData<HoodieRecord> partitionRecords = engineContext.parallelize(Arrays.asList(allPartitionRecord), 1);
if (!partitionInfoList.isEmpty()) {
HoodieData<HoodieRecord> fileListRecords = engineContext.parallelize(partitionInfoList, partitionInfoList.size()).map(partitionInfo -> {
Map<String, Long> fileNameToSizeMap = partitionInfo.getFileNameToSizeMap();
// filter for files that are part of the completed commits
Map<String, Long> validFileNameToSizeMap = fileNameToSizeMap.entrySet().stream().filter(fileSizePair -> {
String commitTime = FSUtils.getCommitTime(fileSizePair.getKey());
return HoodieTimeline.compareTimestamps(commitTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, createInstantTime);
}).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
// Record which saves files within a partition
return HoodieMetadataPayload.createPartitionFilesRecord(
partitionInfo.getRelativePath().isEmpty() ? NON_PARTITIONED_NAME : partitionInfo.getRelativePath(), Option.of(partitionInfo.getFileNameToSizeMap()), Option.empty());
partitionInfo.getRelativePath().isEmpty() ? NON_PARTITIONED_NAME : partitionInfo.getRelativePath(), Option.of(validFileNameToSizeMap), Option.empty());
});
partitionRecords = partitionRecords.union(fileListRecords);
}

View File

@@ -22,6 +22,7 @@ import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.common.testutils.FileCreateUtils;
import org.apache.hudi.common.testutils.HoodieMetadataTestTable;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestTable;
@@ -36,7 +37,10 @@ import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.UUID;
import static java.util.Arrays.asList;
import static java.util.Collections.emptyList;
@@ -76,6 +80,36 @@ public class TestHoodieMetadataBootstrap extends TestHoodieMetadataBase {
bootstrapAndVerify();
}
/**
 * Validate that bootstrap considers only files part of completed commit and ignore any extra files.
 *
 * <p>The test runs a series of pre-bootstrap operations, then creates one additional base file in
 * partition "p1" named with instant time "0000006", an instant for which this method performs no
 * write operation. It then initializes the metadata writer, syncs the metadata table, deletes the
 * extra file again and validates the metadata against the test table.
 *
 * @throws Exception if any of the fixture or validation helpers fails
 */
@Test
public void testMetadataBootstrapWithExtraFiles() throws Exception {
HoodieTableType tableType = COPY_ON_WRITE;
init(tableType, false);
// Build up some table history (writes at 0000001, 0000002, 0000005 and a clean at 0000003)
// using the pre-bootstrap helpers.
doPreBootstrapWriteOperation(testTable, INSERT, "0000001");
doPreBootstrapWriteOperation(testTable, "0000002");
doPreBootstrapClean(testTable, "0000003", Arrays.asList("0000001"));
doPreBootstrapWriteOperation(testTable, "0000005");
// Add an extra base file to the table under instant "0000006", which has no matching
// operation above. Bootstrap should ignore this file rather than include it.
String fileName = UUID.randomUUID().toString();
Path baseFilePath = FileCreateUtils.getBaseFilePath(basePath, "p1", "0000006", fileName);
FileCreateUtils.createBaseFile(basePath, "p1", "0000006", fileName, 100);
// NOTE(review): presumably these calls enable the metadata table and trigger its bootstrap
// while the extra file is still on storage - confirm against the helper implementations.
writeConfig = getWriteConfig(true, true);
initWriteConfigAndMetatableWriter(writeConfig, true);
syncTableMetadata(writeConfig);
// Remove the extra file from the table again, and then validate.
// NOTE(review): presumably validation compares the metadata against the files on storage,
// so the extra file must be gone for the comparison to hold - confirm in validateMetadata.
Files.delete(baseFilePath);
// validate
validateMetadata(testTable);
// After bootstrap, do two more writes and validate that the metadata table is still functional.
doWriteInsertAndUpsert(testTable);
validateMetadata(testTable);
}
@ParameterizedTest
@EnumSource(HoodieTableType.class)
public void testMetadataBootstrapInsertUpsertRollback(HoodieTableType tableType) throws Exception {

View File

@@ -304,6 +304,11 @@ public class FileCreateUtils {
Files.setLastModifiedTime(baseFilePath, FileTime.fromMillis(lastModificationTimeMilli));
}
/**
 * Builds the path of a base file inside a partition directory, without touching the file system.
 *
 * @param basePath      root path of the table
 * @param partitionPath partition path, joined onto {@code basePath}
 * @param instantTime   instant time passed to {@code baseFileName}
 * @param fileId        file id passed to {@code baseFileName}
 * @return the partition directory resolved against the name produced by {@code baseFileName}
 */
public static Path getBaseFilePath(String basePath, String partitionPath, String instantTime, String fileId) {
  // The partition directory is built first, then the generated file name is resolved against it.
  return Paths.get(basePath, partitionPath).resolve(baseFileName(instantTime, fileId));
}
public static void createLogFile(String basePath, String partitionPath, String instantTime, String fileId, int version)
throws Exception {
createLogFile(basePath, partitionPath, instantTime, fileId, version, 0);