1
0

[HUDI-3290] Different file formats for the partition metadata file. (#5179)

* [HUDI-3290] Different file formats for the partition metadata file.

Partition metadata files are stored in each partition to help identify the base path of a table. These files are saved in the properties file format. Some query engines do not work when non-Parquet/ORC files are found in a partition.

Added a new table config 'hoodie.partition.metafile.use.data.format' which, when enabled (default false for backward compatibility), ensures that partition metafiles will be saved in the same format as the base files of a dataset.

For new datasets, the config can be set via hudi-cli. Deltastreamer has a new parameter --partition-metafile-use-data-format which will create a table with this setting.

* Code review comments

- Adding a new command to migrate the partition meta file from the text format to the base file format.
- Reimplementing readFromFS() to first read the text format, then base format
- Avoid extra exists() checks in readFromFS()
- Added unit tests, enabled parquet format across hoodie-hadoop-mr
- Code cleanup, restructuring, naming consistency.

* Wiring in all the other Spark code paths to respect this config

 - Turned on parquet meta format for COW data source tests
 - Removed the deltastreamer command line to keep it shorter

* populate HoodiePartitionMetadata#format after readFromFS()

Co-authored-by: Vinoth Chandar <vinoth@apache.org>
Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
This commit is contained in:
Prashant Wason
2022-04-04 08:08:20 -07:00
committed by GitHub
parent 8add740d22
commit b28f0d6ceb
33 changed files with 544 additions and 94 deletions

View File

@@ -173,7 +173,8 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload, I, K, O> extends
try {
// Save hoodie partition meta in the partition path
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, baseInstantTime,
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath),
hoodieTable.getPartitionMetafileFormat());
partitionMetadata.trySave(getPartitionId());
// Since the actual log file written to can be different based on when rollover happens, we use the

View File

@@ -94,7 +94,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload, I, K, O> extends
try {
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime,
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath),
hoodieTable.getPartitionMetafileFormat());
partitionMetadata.trySave(getPartitionId());
createMarkerFile(partitionPath, FSUtils.makeDataFileName(this.instantTime, this.writeToken, this.fileId, hoodieTable.getBaseFileExtension()));
this.fileWriter = HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config,

View File

@@ -166,7 +166,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload, I, K, O> extends H
writeStatus.getStat().setPrevCommit(FSUtils.getCommitTime(latestValidFilePath));
HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime,
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath),
hoodieTable.getPartitionMetafileFormat());
partitionMetadata.trySave(getPartitionId());
String newFileName = FSUtils.makeDataFileName(instantTime, writeToken, fileId, hoodieTable.getBaseFileExtension());

View File

@@ -1074,7 +1074,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
if (!status.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME)) {
this.subDirectories.add(status.getPath());
}
} else if (status.getPath().getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)) {
} else if (status.getPath().getName().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)) {
// Presence of partition meta file implies this is a HUDI partition
this.isHoodiePartition = true;
} else if (FSUtils.isDataFile(status.getPath())) {

View File

@@ -756,6 +756,10 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
return metaClient.getTableConfig().getLogFileFormat();
}
/**
 * Returns the partition metafile format reported by this table's config.
 *
 * <p>Delegates to {@code metaClient.getTableConfig().getPartitionMetafileFormat()}.
 *
 * <p>NOTE(review): the returned Option is presumably empty when partition metafiles
 * should stay in the legacy properties (text) format, and set to the base file format
 * when 'hoodie.partition.metafile.use.data.format' is enabled -- confirm against the
 * table config implementation.
 *
 * @return the optional file format to use for partition metafiles
 */
public Option<HoodieFileFormat> getPartitionMetafileFormat() {
return metaClient.getTableConfig().getPartitionMetafileFormat();
}
/**
 * Returns the file extension of this table's base file format.
 *
 * <p>Delegates to {@code getBaseFileFormat().getFileExtension()}.
 *
 * @return the base file extension string
 */
public String getBaseFileExtension() {
return getBaseFileFormat().getFileExtension();
}