1
0

[HUDI-1371] [HUDI-1893] Support metadata based listing for Spark DataSource and Spark SQL (#2893)

This commit is contained in:
Udit Mehrotra
2021-08-03 14:47:40 -07:00
committed by GitHub
parent 245e1fd17d
commit 1ff2d3459a
13 changed files with 383 additions and 125 deletions

View File

@@ -52,6 +52,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
@@ -277,13 +278,16 @@ public class FSUtils {
}
}
public static FileStatus[] getFilesInPartition(HoodieEngineContext engineContext, HoodieMetadataConfig metadataConfig,
String basePathStr, Path partitionPath) {
try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext,
metadataConfig, basePathStr, FileSystemViewStorageConfig.FILESYSTEM_VIEW_SPILLABLE_DIR.defaultValue())) {
return tableMetadata.getAllFilesInPartition(partitionPath);
} catch (Exception e) {
throw new HoodieException("Error get files in partition: " + partitionPath, e);
/**
 * Lists the files under each of the given partition paths, optionally backed by the
 * table's metadata (depending on {@code metadataConfig}).
 *
 * @param engineContext   context of the engine executing the listing
 * @param metadataConfig  configuration controlling metadata-table based listing
 * @param basePathStr     base path of the Hudi table
 * @param partitionPaths  partition paths to list files under
 * @param spillableMapPath directory used for spillable map storage by the metadata reader
 * @return map of partition path to the file statuses found in that partition
 * @throws HoodieException wrapping any failure from the underlying metadata/FS listing
 */
public static Map<String, FileStatus[]> getFilesInPartitions(HoodieEngineContext engineContext,
                                                             HoodieMetadataConfig metadataConfig,
                                                             String basePathStr,
                                                             String[] partitionPaths,
                                                             String spillableMapPath) {
  // try-with-resources so the metadata reader is always closed, even on failure
  try (HoodieTableMetadata metadata =
      HoodieTableMetadata.create(engineContext, metadataConfig, basePathStr, spillableMapPath, true)) {
    return metadata.getAllFilesInPartitions(Arrays.asList(partitionPaths));
  } catch (Exception e) {
    throw new HoodieException("Error get files in partitions: " + String.join(",", partitionPaths), e);
  }
}

View File

@@ -44,7 +44,9 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public abstract class BaseTableMetadata implements HoodieTableMetadata {
@@ -134,6 +136,26 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
.getAllFilesInPartition(partitionPath);
}
/**
 * Returns, for each given partition path, the files it contains.
 *
 * <p>When the metadata table is enabled each partition is resolved through
 * {@code fetchAllFilesInPartition}; otherwise the listing falls back to direct
 * file-system access via {@link FileSystemBackedTableMetadata}.
 *
 * @param partitionPaths partition paths to list (used directly as {@link Path}s)
 * @return map of partition path to the file statuses found in that partition
 * @throws HoodieMetadataException if metadata-based listing fails for any partition
 */
@Override
public Map<String, FileStatus[]> getAllFilesInPartitions(List<String> partitionPaths)
    throws IOException {
  if (!enabled) {
    // Metadata table disabled: delegate to plain file-system based listing.
    return new FileSystemBackedTableMetadata(getEngineContext(), hadoopConf, datasetBasePath,
        metadataConfig.shouldAssumeDatePartitioning())
        .getAllFilesInPartitions(partitionPaths);
  }
  try {
    Map<String, FileStatus[]> filesByPartition = new HashMap<>();
    // NOTE(review): partitions are resolved serially here — confirm whether parallel
    // resolution is intentionally avoided for the metadata-backed path.
    for (String partitionPath : partitionPaths) {
      filesByPartition.put(partitionPath, fetchAllFilesInPartition(new Path(partitionPath)));
    }
    return filesByPartition;
  } catch (Exception e) {
    throw new HoodieMetadataException("Failed to retrieve files in partition from metadata", e);
  }
}
/**
* Returns a list of all partitions.
*/

View File

@@ -33,8 +33,10 @@ import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
@@ -105,6 +107,24 @@ public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
return partitionPaths;
}
/**
 * Lists data files for each of the given partitions directly from the file system,
 * fanning the per-partition listing out through the engine context.
 *
 * @param partitionPaths full partition paths to list; may be null or empty
 * @return map of partition path to its data files; empty map for null/empty input
 * @throws IOException if the listing fails
 */
@Override
public Map<String, FileStatus[]> getAllFilesInPartitions(List<String> partitionPaths)
    throws IOException {
  if (partitionPaths != null && !partitionPaths.isEmpty()) {
    // Cap parallelism at the number of partitions to avoid idle tasks.
    int listingParallelism = Math.min(DEFAULT_LISTING_PARALLELISM, partitionPaths.size());
    List<Pair<String, FileStatus[]>> listed = engineContext.map(partitionPaths, pathStr -> {
      Path fullPartitionPath = new Path(pathStr);
      FileSystem fs = fullPartitionPath.getFileSystem(hadoopConf.get());
      return Pair.of(pathStr, FSUtils.getAllDataFilesInPartition(fs, fullPartitionPath));
    }, listingParallelism);
    return listed.stream().collect(Collectors.toMap(Pair::getLeft, Pair::getRight));
  }
  return Collections.emptyMap();
}
@Override
public Option<String> getSyncedInstantTime() {
throw new UnsupportedOperationException();

View File

@@ -30,6 +30,7 @@ import org.apache.hadoop.fs.Path;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
/**
* Interface that supports querying various pieces of metadata about a hudi table.
@@ -95,6 +96,11 @@ public interface HoodieTableMetadata extends Serializable, AutoCloseable {
*/
List<String> getAllPartitionPaths() throws IOException;
/**
 * Fetch all files for the given partition paths.
 *
 * @param partitionPaths partition paths to list; implementations construct {@code Path}s
 *                       from these strings directly, so full paths are expected
 *                       (NOTE(review): confirm against callers)
 * @return map of each partition path to the files found in that partition
 * @throws IOException if listing any of the partitions fails
 */
Map<String, FileStatus[]> getAllFilesInPartitions(List<String> partitionPaths) throws IOException;
/**
* Get the instant time to which the metadata is synced w.r.t data timeline.
*/