[HUDI-3290] Different file formats for the partition metadata file. (#5179)
* [HUDI-3290] Different file formats for the partition metadata file. Partition metadata files are stored in each partition to help identify the base path of a table. These files are saved in the properties file format. Some query engines do not work when non-Parquet/ORC files are found in a partition. Added a new table config 'hoodie.partition.metafile.use.data.format' which, when enabled (default false for backward compatibility), ensures that partition metafiles are saved in the same format as the base files of a dataset. For new datasets, the config can be set via hudi-cli. Deltastreamer has a new parameter --partition-metafile-use-data-format which will create a table with this setting. * Code review comments - Adding a new command to migrate from text to base file formats for the meta file. - Reimplementing readFromFS() to first read the text format, then the base format - Avoid extra exists() checks in readFromFS() - Added unit tests, enabled parquet format across hoodie-hadoop-mr - Code cleanup, restructuring, naming consistency. * Wiring in all the other Spark code paths to respect this config - Turned on parquet meta format for COW data source tests - Removed the deltastreamer command line option to keep it shorter * populate HoodiePartitionMetadata#format after readFromFS() Co-authored-by: Vinoth Chandar <vinoth@apache.org> Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.model;
|
||||
|
||||
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
public class TestHoodiePartitionMetadata extends HoodieCommonTestHarness {
|
||||
|
||||
FileSystem fs;
|
||||
|
||||
@BeforeEach
|
||||
public void setupTest() throws IOException {
|
||||
initMetaClient();
|
||||
fs = metaClient.getFs();
|
||||
}
|
||||
|
||||
static Stream<Arguments> formatProviderFn() {
|
||||
return Stream.of(
|
||||
Arguments.arguments(Option.empty()),
|
||||
Arguments.arguments(Option.of(HoodieFileFormat.PARQUET)),
|
||||
Arguments.arguments(Option.of(HoodieFileFormat.ORC))
|
||||
);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("formatProviderFn")
|
||||
public void testTextFormatMetaFile(Option<HoodieFileFormat> format) throws IOException {
|
||||
// given
|
||||
final Path partitionPath = new Path(basePath, "a/b/"
|
||||
+ format.map(Enum::name).orElse("text"));
|
||||
fs.mkdirs(partitionPath);
|
||||
final String commitTime = "000000000001";
|
||||
HoodiePartitionMetadata writtenMetadata = new HoodiePartitionMetadata(metaClient.getFs(), commitTime, new Path(basePath), partitionPath, format);
|
||||
writtenMetadata.trySave(0);
|
||||
|
||||
// when
|
||||
HoodiePartitionMetadata readMetadata = new HoodiePartitionMetadata(metaClient.getFs(), new Path(metaClient.getBasePath(), partitionPath));
|
||||
|
||||
// then
|
||||
assertTrue(HoodiePartitionMetadata.hasPartitionMetadata(fs, partitionPath));
|
||||
assertEquals(Option.of(commitTime), readMetadata.readPartitionCreatedCommitTime());
|
||||
assertEquals(3, readMetadata.getPartitionDepth());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testErrorIfAbsent() throws IOException {
|
||||
final Path partitionPath = new Path(basePath, "a/b/not-a-partition");
|
||||
fs.mkdirs(partitionPath);
|
||||
HoodiePartitionMetadata readMetadata = new HoodiePartitionMetadata(metaClient.getFs(), new Path(metaClient.getBasePath(), partitionPath));
|
||||
assertThrows(HoodieException.class, readMetadata::readPartitionCreatedCommitTime);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFileNames() {
|
||||
assertEquals(new Path("/a/b/c/.hoodie_partition_metadata"), HoodiePartitionMetadata.textFormatMetaFilePath(new Path("/a/b/c")));
|
||||
assertEquals(Arrays.asList(new Path("/a/b/c/.hoodie_partition_metadata.parquet"),
|
||||
new Path("/a/b/c/.hoodie_partition_metadata.orc")), HoodiePartitionMetadata.baseFormatMetaFilePaths(new Path("/a/b/c")));
|
||||
}
|
||||
}
|
||||
@@ -280,7 +280,7 @@ public class FileCreateUtils {
|
||||
public static void createPartitionMetaFile(String basePath, String partitionPath) throws IOException {
|
||||
Path parentPath = Paths.get(basePath, partitionPath);
|
||||
Files.createDirectories(parentPath);
|
||||
Path metaFilePath = parentPath.resolve(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
|
||||
Path metaFilePath = parentPath.resolve(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX);
|
||||
if (Files.notExists(metaFilePath)) {
|
||||
Files.createFile(metaFilePath);
|
||||
}
|
||||
@@ -397,7 +397,7 @@ public class FileCreateUtils {
|
||||
}
|
||||
return Files.list(basePath).filter(entry -> (!entry.getFileName().toString().equals(HoodieTableMetaClient.METAFOLDER_NAME)
|
||||
&& !entry.getFileName().toString().contains("parquet") && !entry.getFileName().toString().contains("log"))
|
||||
&& !entry.getFileName().toString().endsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE)).collect(Collectors.toList());
|
||||
&& !entry.getFileName().toString().startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX)).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -205,7 +205,7 @@ public class HoodieTestDataGenerator implements AutoCloseable {
|
||||
*/
|
||||
public void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) {
|
||||
for (String partitionPath : partitionPaths) {
|
||||
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0);
|
||||
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath), Option.empty()).trySave(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -680,7 +680,7 @@ public class HoodieTestTable {
|
||||
boolean toReturn = true;
|
||||
String filePath = entry.getPath().toString();
|
||||
String fileName = entry.getPath().getName();
|
||||
if (fileName.equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE) || (!fileName.contains("log") && !fileName.contains("parquet"))
|
||||
if (fileName.startsWith(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE_PREFIX) || (!fileName.contains("log") && !fileName.contains("parquet"))
|
||||
|| filePath.contains("metadata")) {
|
||||
toReturn = false;
|
||||
} else {
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
package org.apache.hudi.common.util;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.model.HoodiePartitionMetadata;
|
||||
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
@@ -24,9 +25,10 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.EnumSource;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
@@ -41,7 +43,7 @@ public final class TestTablePathUtils {
|
||||
private static final String BASE_FILE_EXTENSION = HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().getFileExtension();
|
||||
|
||||
@TempDir
|
||||
static File tempDir;
|
||||
public File tempDir;
|
||||
private static FileSystem fs;
|
||||
private static Path tablePath;
|
||||
private static Path partitionPath1;
|
||||
@@ -49,9 +51,12 @@ public final class TestTablePathUtils {
|
||||
private static Path filePath1;
|
||||
private static Path filePath2;
|
||||
|
||||
@BeforeAll
|
||||
static void setup() throws IOException {
|
||||
URI tablePathURI = Paths.get(tempDir.getAbsolutePath(),"test_table").toUri();
|
||||
private void setup() throws IOException {
|
||||
setup(Option.empty());
|
||||
}
|
||||
|
||||
private void setup(Option<HoodieFileFormat> partitionMetafileFormat) throws IOException {
|
||||
URI tablePathURI = Paths.get(tempDir.getAbsolutePath(), "test_table").toUri();
|
||||
tablePath = new Path(tablePathURI);
|
||||
fs = tablePath.getFileSystem(new Configuration());
|
||||
|
||||
@@ -69,10 +74,10 @@ public final class TestTablePathUtils {
|
||||
assertTrue(new File(partitionPathURI2).mkdirs());
|
||||
|
||||
HoodiePartitionMetadata partitionMetadata1 = new HoodiePartitionMetadata(fs, Instant.now().toString(), tablePath,
|
||||
partitionPath1);
|
||||
partitionPath1, partitionMetafileFormat);
|
||||
partitionMetadata1.trySave(1);
|
||||
HoodiePartitionMetadata partitionMetadata2 = new HoodiePartitionMetadata(fs, Instant.now().toString(), tablePath,
|
||||
partitionPath2);
|
||||
partitionPath2, partitionMetafileFormat);
|
||||
partitionMetadata2.trySave(2);
|
||||
|
||||
// Create files
|
||||
@@ -87,12 +92,14 @@ public final class TestTablePathUtils {
|
||||
|
||||
@Test
|
||||
void getTablePathFromTablePath() throws IOException {
|
||||
setup();
|
||||
Option<Path> inferredTablePath = TablePathUtils.getTablePath(fs, tablePath);
|
||||
assertEquals(tablePath, inferredTablePath.get());
|
||||
}
|
||||
|
||||
@Test
|
||||
void getTablePathFromMetadataFolderPath() throws IOException {
|
||||
setup();
|
||||
Path metaFolder = new Path(tablePath, HoodieTableMetaClient.METAFOLDER_NAME);
|
||||
Option<Path> inferredTablePath = TablePathUtils.getTablePath(fs, metaFolder);
|
||||
assertEquals(tablePath, inferredTablePath.get());
|
||||
@@ -100,6 +107,7 @@ public final class TestTablePathUtils {
|
||||
|
||||
@Test
|
||||
void getTablePathFromMetadataSubFolderPath() throws IOException {
|
||||
setup();
|
||||
Path auxFolder = new Path(tablePath, HoodieTableMetaClient.AUXILIARYFOLDER_NAME);
|
||||
assertEquals(tablePath, TablePathUtils.getTablePath(fs, auxFolder).get());
|
||||
|
||||
@@ -117,8 +125,10 @@ public final class TestTablePathUtils {
|
||||
assertEquals(metadataTableFolder, TablePathUtils.getTablePath(fs, metadataTablePartitionFolder).get());
|
||||
}
|
||||
|
||||
@Test
|
||||
void getTablePathFromPartitionFolderPath() throws IOException {
|
||||
@ParameterizedTest
|
||||
@EnumSource(value = HoodieFileFormat.class, names = {"PARQUET", "ORC"})
|
||||
void getTablePathFromPartitionFolderPath(HoodieFileFormat partitionMetafileFormat) throws IOException {
|
||||
setup(Option.of(partitionMetafileFormat));
|
||||
Option<Path> inferredTablePath = TablePathUtils.getTablePath(fs, partitionPath1);
|
||||
assertEquals(tablePath, inferredTablePath.get());
|
||||
|
||||
@@ -128,6 +138,7 @@ public final class TestTablePathUtils {
|
||||
|
||||
@Test
|
||||
void getTablePathFromFilePath() throws IOException {
|
||||
setup();
|
||||
Option<Path> inferredTablePath = TablePathUtils.getTablePath(fs, filePath1);
|
||||
assertEquals(tablePath, inferredTablePath.get());
|
||||
|
||||
|
||||
Reference in New Issue
Block a user