[HUDI-1371] [HUDI-1893] Support metadata based listing for Spark DataSource and Spark SQL (#2893)
@@ -52,6 +52,7 @@ import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Objects;
 import java.util.Set;
@@ -277,13 +278,16 @@ public class FSUtils {
     }
   }

   public static FileStatus[] getFilesInPartition(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig,
                                                  String basePathStr, Path partitionPath) {
     try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext,
         metadataConfig, basePathStr, FileSystemViewStorageConfig.FILESYSTEM_VIEW_SPILLABLE_DIR.defaultValue())) {
       return tableMetadata.getAllFilesInPartition(partitionPath);
     } catch (Exception e) {
       throw new HoodieException("Error get files in partition: " + partitionPath, e);
     }
   }

   public static Map<String, FileStatus[]> getFilesInPartitions(HoodieEngineContext engineContext,
                                                                HoodieMetadataConfig metadataConfig,
                                                                String basePathStr,
                                                                String[] partitionPaths,
                                                                String spillableMapPath) {
     try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, metadataConfig, basePathStr,
         spillableMapPath, true)) {
       return tableMetadata.getAllFilesInPartitions(Arrays.asList(partitionPaths));
     } catch (Exception ex) {
       throw new HoodieException("Error get files in partitions: " + String.join(",", partitionPaths), ex);
     }
   }
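For orientation, a usage sketch of the new `FSUtils.getFilesInPartitions` helper. The base path, partition names, spillable-map directory, and the `HoodieMetadataConfig` builder call are illustrative assumptions, not taken from this commit:

```java
import java.util.Arrays;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hudi.common.config.HoodieMetadataConfig; // package location assumed
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;

public class PartitionListingExample {
  public static void main(String[] args) {
    String basePath = "file:///tmp/hudi_table";          // hypothetical table location
    String[] partitions = {"2021/04/01", "2021/04/02"};  // hypothetical partitions

    HoodieEngineContext engineContext = new HoodieLocalEngineContext(new Configuration());
    // Assumption: builder-style config; the exact flag name may differ by Hudi version.
    HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder().enable(true).build();

    String[] fullPartitionPaths = Arrays.stream(partitions)
        .map(p -> basePath + "/" + p).toArray(String[]::new);

    // One call returns a partition-path -> FileStatus[] map for all partitions.
    Map<String, FileStatus[]> files = FSUtils.getFilesInPartitions(
        engineContext, metadataConfig, basePath, fullPartitionPaths, "/tmp/spillable_map");

    files.forEach((partition, statuses) ->
        System.out.println(partition + " -> " + statuses.length + " files"));
  }
}
```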
@@ -44,7 +44,9 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.stream.Collectors;

 public abstract class BaseTableMetadata implements HoodieTableMetadata {
@@ -134,6 +136,26 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
         .getAllFilesInPartition(partitionPath);
   }

+  @Override
+  public Map<String, FileStatus[]> getAllFilesInPartitions(List<String> partitionPaths)
+      throws IOException {
+    if (enabled) {
+      Map<String, FileStatus[]> partitionsFilesMap = new HashMap<>();
+
+      try {
+        for (String partitionPath : partitionPaths) {
+          partitionsFilesMap.put(partitionPath, fetchAllFilesInPartition(new Path(partitionPath)));
+        }
+        return partitionsFilesMap;
+      } catch (Exception e) {
+        throw new HoodieMetadataException("Failed to retrieve files in partition from metadata", e);
+      }
+    }
+
+    return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, datasetBasePath, metadataConfig.shouldAssumeDatePartitioning())
+        .getAllFilesInPartitions(partitionPaths);
+  }
+
   /**
    * Returns a list of all partitions.
    */
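The control flow here is worth noting: with the metadata table enabled, each partition is served by `fetchAllFilesInPartition` against the metadata table, one partition at a time, while the disabled path delegates to `FileSystemBackedTableMetadata`, which lists in parallel (next hunk). A hedged caller-side sketch of the toggle, assuming the 4-argument `HoodieTableMetadata.create` seen above and a builder-style `HoodieMetadataConfig`:

```java
// Hedged sketch: the same call site transparently switches between metadata-table
// reads and direct file-system listing depending on the config flag.
static void listBothWays(HoodieEngineContext engineContext, String basePath,
                         List<String> partitionPaths) throws Exception {
  for (boolean useMetadataTable : new boolean[] {true, false}) {
    HoodieMetadataConfig config = HoodieMetadataConfig.newBuilder()
        .enable(useMetadataTable) // assumption: flag name may vary by version
        .build();
    // 4-arg create(engineContext, config, basePath, spillableMapPath), as in the diff;
    // HoodieTableMetadata is AutoCloseable, hence try-with-resources.
    try (HoodieTableMetadata metadata =
        HoodieTableMetadata.create(engineContext, config, basePath, "/tmp/spillable_map")) {
      Map<String, FileStatus[]> files = metadata.getAllFilesInPartitions(partitionPaths);
      System.out.println("metadataTable=" + useMetadataTable + " -> " + files.size() + " partitions");
    }
  }
}
```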
@@ -33,8 +33,10 @@ import org.apache.hadoop.fs.Path;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Map;
 import java.util.stream.Collectors;

 public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
@@ -105,6 +107,24 @@ public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
     return partitionPaths;
   }

+  @Override
+  public Map<String, FileStatus[]> getAllFilesInPartitions(List<String> partitionPaths)
+      throws IOException {
+    if (partitionPaths == null || partitionPaths.isEmpty()) {
+      return Collections.emptyMap();
+    }
+
+    int parallelism = Math.min(DEFAULT_LISTING_PARALLELISM, partitionPaths.size());
+
+    List<Pair<String, FileStatus[]>> partitionToFiles = engineContext.map(partitionPaths, partitionPathStr -> {
+      Path partitionPath = new Path(partitionPathStr);
+      FileSystem fs = partitionPath.getFileSystem(hadoopConf.get());
+      return Pair.of(partitionPathStr, FSUtils.getAllDataFilesInPartition(fs, partitionPath));
+    }, parallelism);
+
+    return partitionToFiles.stream().collect(Collectors.toMap(Pair::getLeft, Pair::getRight));
+  }
+
   @Override
   public Option<String> getSyncedInstantTime() {
     throw new UnsupportedOperationException();
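The fan-out above is the core of the parallel fallback: `HoodieEngineContext.map` applies a serializable function to each partition path with bounded parallelism (`DEFAULT_LISTING_PARALLELISM` capped by the partition count), so a Spark engine context distributes the listing while the local context keeps it in-process. A minimal standalone sketch of the same pattern; the paths and the parallelism cap are illustrative, and imports match those used in the class above (Hadoop `FileSystem`/`Path`, Hudi `Pair` and engine context):

```java
// Bounded fan-out over partitions via engineContext.map, mirroring the diff.
HoodieEngineContext engineContext = new HoodieLocalEngineContext(new Configuration());
List<String> partitionPaths = Arrays.asList(
    "file:///tmp/hudi_table/2021/04/01",   // hypothetical partitions
    "file:///tmp/hudi_table/2021/04/02");

int parallelism = Math.min(16, partitionPaths.size()); // illustrative cap
List<Pair<String, Integer>> fileCounts = engineContext.map(partitionPaths, pathStr -> {
  Path partitionPath = new Path(pathStr);
  FileSystem fs = partitionPath.getFileSystem(new Configuration());
  return Pair.of(pathStr, fs.listStatus(partitionPath).length);
}, parallelism);
```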
@@ -30,6 +30,7 @@ import org.apache.hadoop.fs.Path;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.List;
+import java.util.Map;

 /**
  * Interface that supports querying various pieces of metadata about a hudi table.
@@ -95,6 +96,11 @@ public interface HoodieTableMetadata extends Serializable, AutoCloseable {
    */
   List<String> getAllPartitionPaths() throws IOException;

+  /**
+   * Fetch all files for given partition paths.
+   */
+  Map<String, FileStatus[]> getAllFilesInPartitions(List<String> partitionPaths) throws IOException;
+
   /**
    * Get the instant time to which the metadata is synced w.r.t data timeline.
    */
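In contract terms, the new method batches what previously took one `getAllFilesInPartition(Path)` call per partition, letting implementations parallelize or answer from a single metadata-table read. A sketch of the equivalence; `metadata` and `partitionPaths` are assumed inputs:

```java
// Equivalence sketch: the batched call vs. a loop over the pre-existing
// per-partition API. Both should agree key-for-key.
static void compareListings(HoodieTableMetadata metadata, List<String> partitionPaths)
    throws IOException {
  Map<String, FileStatus[]> batched = metadata.getAllFilesInPartitions(partitionPaths);

  Map<String, FileStatus[]> perPartition = new HashMap<>();
  for (String p : partitionPaths) {
    perPartition.put(p, metadata.getAllFilesInPartition(new Path(p)));
  }

  for (String p : partitionPaths) {
    assert batched.get(p).length == perPartition.get(p).length;
  }
}
```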
@@ -18,6 +18,7 @@

 package org.apache.hudi.metadata;

+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hudi.common.config.SerializableConfiguration;
 import org.apache.hudi.common.engine.HoodieLocalEngineContext;
@@ -30,7 +31,10 @@ import org.junit.jupiter.api.Test;

 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
 import java.util.stream.IntStream;

 public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness {
@@ -63,8 +67,10 @@ public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness {
     HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
     FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
         new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 0);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath)).length == 10);
+    Assertions.assertEquals(0, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
+    Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath)).length);
+    Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartitions(
+        Collections.singletonList(basePath)).get(basePath).length);
   }

   /**
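The recurring assertTrue-to-assertEquals change in these tests is about failure diagnostics, not behavior: `assertEquals(expected, actual)` reports both values on failure, whereas `assertTrue(a == b)` only reports a false boolean. For example, given some `FileStatus[] files` with `files.length == 7`:

```java
Assertions.assertTrue(files.length == 10);  // fails with: expected: <true> but was: <false>
Assertions.assertEquals(10, files.length);  // fails with: expected: <10> but was: <7>
```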
@@ -86,8 +92,14 @@ public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness {
     HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
     FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
         new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, true);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + DATE_PARTITIONS.get(0))).length == 10);
+    Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
+    Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + DATE_PARTITIONS.get(0))).length);
+
+    List<String> fullPartitionPaths = DATE_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList());
+    Map<String, FileStatus[]> partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths);
+    for (String p : fullPartitionPaths) {
+      Assertions.assertEquals(10, partitionToFilesMap.get(p).length);
+    }
   }

   /**
@@ -101,7 +113,9 @@ public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness {
     // Generate 10 files under each partition
     DATE_PARTITIONS.stream().forEach(p -> {
       try {
-        hoodieTestTable = hoodieTestTable.withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
+        hoodieTestTable = hoodieTestTable
+            .withPartitionMetaFiles(p)
+            .withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
       } catch (Exception e) {
         throw new RuntimeException(e);
       }
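The added `withPartitionMetaFiles(p)` matters for the next hunk: with assume-date-partitioning disabled, file-system based discovery recognizes a directory as a Hudi partition by its `.hoodie_partition_metadata` marker, which is why the partition-count assertion below changes from 0 to 3. A hand-rolled sketch of that rule (the real check lives in Hudi's `HoodiePartitionMetadata`; this helper is illustrative):

```java
// Illustrative helper, not Hudi API: a directory counts as a Hudi partition
// iff the marker file written by withPartitionMetaFiles(p) is present.
static boolean isHudiPartition(FileSystem fs, Path dir) throws IOException {
  return fs.exists(new Path(dir, ".hoodie_partition_metadata"));
}
```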
@@ -109,7 +123,13 @@ public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness {
     HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
     FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
         new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 0);
+    Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
+
+    List<String> fullPartitionPaths = DATE_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList());
+    Map<String, FileStatus[]> partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths);
+    for (String p : fullPartitionPaths) {
+      Assertions.assertEquals(10, partitionToFilesMap.get(p).length);
+    }
   }

   @Test
@@ -128,8 +148,14 @@ public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness {
     HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
     FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
         new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + ONE_LEVEL_PARTITIONS.get(0))).length == 10);
+    Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
+    Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + ONE_LEVEL_PARTITIONS.get(0))).length);
+
+    List<String> fullPartitionPaths = ONE_LEVEL_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList());
+    Map<String, FileStatus[]> partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths);
+    for (String p : fullPartitionPaths) {
+      Assertions.assertEquals(10, partitionToFilesMap.get(p).length);
+    }
   }

   @Test
@@ -148,8 +174,14 @@ public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness {
     HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
     FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
         new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length == 10);
+    Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
+    Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length);
+
+    List<String> fullPartitionPaths = MULTI_LEVEL_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList());
+    Map<String, FileStatus[]> partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths);
+    for (String p : fullPartitionPaths) {
+      Assertions.assertEquals(10, partitionToFilesMap.get(p).length);
+    }
   }

   @Test
@@ -167,8 +199,14 @@ public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness {
     HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
     FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
         new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
-    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length == 0);
+    Assertions.assertEquals(3, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
+    Assertions.assertEquals(0, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length);
+
+    List<String> fullPartitionPaths = MULTI_LEVEL_PARTITIONS.stream().map(p -> basePath + "/" + p).collect(Collectors.toList());
+    Map<String, FileStatus[]> partitionToFilesMap = fileSystemBackedTableMetadata.getAllFilesInPartitions(fullPartitionPaths);
+    for (String p : fullPartitionPaths) {
+      Assertions.assertEquals(0, partitionToFilesMap.get(p).length);
+    }
   }

 }