
[HUDI-1479] Use HoodieEngineContext to parallelize fetching of partition paths (#2417)

* [HUDI-1479] Use HoodieEngineContext to parallelize fetching of partition paths

* Adding testClass for FileSystemBackedTableMetadata

Co-authored-by: Nishith Agarwal <nagarwal@uber.com>
Author: Udit Mehrotra
Date: 2021-01-10 21:19:52 -08:00
Committed by: GitHub
Parent: 23e93d05c0
Commit: 7ce3ac778e
38 changed files with 509 additions and 100 deletions
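For orientation, here is a minimal caller-side sketch of the reworked listing API introduced by this change. The base path, configuration, and flag values are hypothetical; the signatures of FSUtils.getAllPartitionPaths and HoodieLocalEngineContext are taken from the diffs below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;

import java.util.List;

public class ListPartitionsExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical table location; point this at a real Hudi base path.
    String basePath = "/tmp/hoodie_table";
    Configuration conf = new Configuration();
    // Outside of Spark, the new HoodieLocalEngineContext can stand in as the engine context.
    HoodieEngineContext engineContext = new HoodieLocalEngineContext(conf);
    FileSystem fs = new Path(basePath).getFileSystem(conf);
    // The reworked signature takes the engine context first so the listing can be distributed by the engine.
    List<String> partitions = FSUtils.getAllPartitionPaths(engineContext, fs, basePath,
        false /* useFileListingFromMetadata */, false /* verifyListings */, false /* assumeDatePartitioning */);
    partitions.forEach(System.out::println);
  }
}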


@@ -161,7 +161,9 @@ public class MetadataCommand implements CommandMarker {
   @CliCommand(value = "metadata list-partitions", help = "Print a list of all partitions from the metadata")
   public String listPartitions() throws IOException {
     HoodieCLI.getTableMetaClient();
-    HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(HoodieCLI.conf, HoodieCLI.basePath, "/tmp", true, false, false);
+    initJavaSparkContext();
+    HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(new HoodieSparkEngineContext(jsc),
+        HoodieCLI.basePath, "/tmp", true, false, false, false);
     StringBuffer out = new StringBuffer("\n");
     if (!metadata.enabled()) {


@@ -119,6 +119,10 @@ public abstract class AbstractHoodieClient implements Serializable, AutoCloseabl
     return config;
   }
 
+  public HoodieEngineContext getEngineContext() {
+    return context;
+  }
+
   protected void initWrapperFSMetrics() {
     // no-op.
   }


@@ -221,8 +221,9 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
   protected abstract void initialize(HoodieEngineContext engineContext, HoodieTableMetaClient datasetMetaClient);
 
   private void initTableMetadata() {
-    this.metadata = new HoodieBackedTableMetadata(hadoopConf.get(), datasetWriteConfig.getBasePath(), datasetWriteConfig.getSpillableMapBasePath(),
-        datasetWriteConfig.useFileListingMetadata(), datasetWriteConfig.getFileListingMetadataVerify(), false,
+    this.metadata = new HoodieBackedTableMetadata(engineContext, datasetWriteConfig.getBasePath(),
+        datasetWriteConfig.getSpillableMapBasePath(), datasetWriteConfig.useFileListingMetadata(),
+        datasetWriteConfig.getFileListingMetadataVerify(), false,
         datasetWriteConfig.shouldAssumeDatePartitioning());
     this.metaClient = metadata.getMetaClient();
   }


@@ -31,6 +31,7 @@ import org.apache.hudi.avro.model.HoodieRollbackMetadata;
 import org.apache.hudi.avro.model.HoodieSavepointMetadata;
 import org.apache.hudi.common.config.SerializableConfiguration;
 import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
 import org.apache.hudi.common.engine.TaskContextSupplier;
 import org.apache.hudi.common.fs.ConsistencyGuard;
 import org.apache.hudi.common.fs.ConsistencyGuard.FileVisibility;
@@ -93,6 +94,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
   protected final HoodieWriteConfig config;
   protected final HoodieTableMetaClient metaClient;
+  protected final transient HoodieEngineContext context;
   protected final HoodieIndex<T, I, K, O> index;
   private SerializableConfiguration hadoopConfiguration;
@@ -108,6 +110,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
         config.getViewStorageConfig());
     this.metaClient = metaClient;
     this.index = getIndex(config, context);
+    this.context = context;
     this.taskContextSupplier = context.getTaskContextSupplier();
   }
@@ -660,8 +663,16 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
   public HoodieTableMetadata metadata() {
     if (metadata == null) {
-      metadata = HoodieTableMetadata.create(hadoopConfiguration.get(), config.getBasePath(), config.getSpillableMapBasePath(),
-          config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.isMetricsOn(), config.shouldAssumeDatePartitioning());
+      HoodieEngineContext engineContext = context;
+      if (engineContext == null) {
+        // This is to handle scenarios where this is called at the executor tasks which do not have access
+        // to engine context, and it ends up being null (as its not serializable and marked transient here).
+        engineContext = new HoodieLocalEngineContext(hadoopConfiguration.get());
+      }
+      metadata = HoodieTableMetadata.create(engineContext, config.getBasePath(), config.getSpillableMapBasePath(),
+          config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.isMetricsOn(),
+          config.shouldAssumeDatePartitioning());
     }
     return metadata;
   }


@@ -67,7 +67,7 @@ public abstract class PartitionAwareClusteringPlanStrategy<T extends HoodieRecor
     HoodieTableMetaClient metaClient = getHoodieTable().getMetaClient();
     LOG.info("Scheduling clustering for " + metaClient.getBasePath());
     HoodieWriteConfig config = getWriteConfig();
-    List<String> partitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
+    List<String> partitionPaths = FSUtils.getAllPartitionPaths(getEngineContext(), metaClient.getFs(), metaClient.getBasePath(),
         config.useFileListingMetadata(), config.getFileListingMetadataVerify(),
         config.shouldAssumeDatePartitioning());


@@ -91,9 +91,10 @@ public class RollbackUtils {
    * @param config instance of {@link HoodieWriteConfig} to use.
    * @return {@link List} of {@link ListingBasedRollbackRequest}s thus collected.
    */
-  public static List<ListingBasedRollbackRequest> generateRollbackRequestsByListingCOW(FileSystem fs, String basePath, HoodieWriteConfig config) {
+  public static List<ListingBasedRollbackRequest> generateRollbackRequestsByListingCOW(HoodieEngineContext engineContext,
+      FileSystem fs, String basePath, HoodieWriteConfig config) {
     try {
-      return FSUtils.getAllPartitionPaths(fs, basePath, config.useFileListingMetadata(),
+      return FSUtils.getAllPartitionPaths(engineContext, fs, basePath, config.useFileListingMetadata(),
           config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning()).stream()
           .map(ListingBasedRollbackRequest::createRollbackRequestWithDeleteDataAndLogFilesAction)
           .collect(Collectors.toList());
@@ -113,7 +114,7 @@ public class RollbackUtils {
   public static List<ListingBasedRollbackRequest> generateRollbackRequestsUsingFileListingMOR(HoodieInstant instantToRollback, HoodieTable table, HoodieEngineContext context) throws IOException {
     String commit = instantToRollback.getTimestamp();
     HoodieWriteConfig config = table.getConfig();
-    List<String> partitions = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
+    List<String> partitions = FSUtils.getAllPartitionPaths(context, table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
         config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning());
     int sparkPartitions = Math.max(Math.min(partitions.size(), config.getRollbackParallelism()), 1);
     context.setJobStatus(RollbackUtils.class.getSimpleName(), "Generate all rollback requests");


@@ -90,7 +90,7 @@ public class SavepointActionExecutor<T extends HoodieRecordPayload, I, K, O> ext
"Could not savepoint commit " + instantTime + " as this is beyond the lookup window " + lastCommitRetained); "Could not savepoint commit " + instantTime + " as this is beyond the lookup window " + lastCommitRetained);
context.setJobStatus(this.getClass().getSimpleName(), "Collecting latest files for savepoint " + instantTime); context.setJobStatus(this.getClass().getSimpleName(), "Collecting latest files for savepoint " + instantTime);
List<String> partitions = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), List<String> partitions = FSUtils.getAllPartitionPaths(context, table.getMetaClient().getFs(),
table.getMetaClient().getBasePath(), config.useFileListingMetadata(), config.getFileListingMetadataVerify(), table.getMetaClient().getBasePath(), config.useFileListingMetadata(), config.getFileListingMetadataVerify(),
config.shouldAssumeDatePartitioning() config.shouldAssumeDatePartitioning()
); );


@@ -65,7 +65,7 @@ public class FlinkCopyOnWriteRollbackActionExecutor<T extends HoodieRecordPayloa
   @Override
   protected List<HoodieRollbackStat> executeRollbackUsingFileListing(HoodieInstant instantToRollback) {
     List<ListingBasedRollbackRequest> rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(
-        table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), config);
+        context, table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), config);
     return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackRequests);
   }
 }


@@ -93,8 +93,8 @@ public class ZeroToOneUpgradeHandler implements UpgradeHandler {
     // generate rollback stats
     List<ListingBasedRollbackRequest> rollbackRequests;
     if (table.getMetaClient().getTableType() == HoodieTableType.COPY_ON_WRITE) {
-      rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
-          table.getConfig());
+      rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(context, table.getMetaClient().getFs(),
+          table.getMetaClient().getBasePath(), table.getConfig());
     } else {
       rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(commitInstantOpt.get(), table, context);
     }


@@ -63,7 +63,7 @@ public class SparkHoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends
       final HoodieTable hoodieTable) {
     HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
     try {
-      List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
+      List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(context, metaClient.getFs(), metaClient.getBasePath(),
           config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning());
       return super.loadInvolvedFiles(allPartitionPaths, context, hoodieTable);
     } catch (IOException e) {


@@ -104,7 +104,7 @@ public class SparkHoodieGlobalSimpleIndex<T extends HoodieRecordPayload> extends
       final HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
     HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
     try {
-      List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
+      List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(context, metaClient.getFs(), metaClient.getBasePath(),
           config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning());
       // Obtain the latest data files from all the partitions.
       return getLatestBaseFilesForAllPartitions(allPartitionPaths, context, hoodieTable);


@@ -50,7 +50,7 @@ public class SparkInsertOverwriteTableCommitActionExecutor<T extends HoodieRecor
   protected Map<String, List<String>> getPartitionToReplacedFileIds(JavaRDD<WriteStatus> writeStatuses) {
     Map<String, List<String>> partitionToExistingFileIds = new HashMap<>();
     try {
-      List<String> partitionPaths = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(),
+      List<String> partitionPaths = FSUtils.getAllPartitionPaths(context, table.getMetaClient().getFs(),
           table.getMetaClient().getBasePath(), config.useFileListingMetadata(), config.getFileListingMetadataVerify(),
           false);
       JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);


@@ -195,7 +195,7 @@ public class HoodieSparkMergeOnReadTableCompactor<T extends HoodieRecordPayload>
     // TODO - rollback any compactions in flight
     HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
     LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
-    List<String> partitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
+    List<String> partitionPaths = FSUtils.getAllPartitionPaths(context, metaClient.getFs(), metaClient.getBasePath(),
         config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning());
     // filter the partition paths if needed to reduce list status


@@ -66,8 +66,8 @@ public class SparkCopyOnWriteRollbackActionExecutor<T extends HoodieRecordPayloa
   @Override
   protected List<HoodieRollbackStat> executeRollbackUsingFileListing(HoodieInstant instantToRollback) {
-    List<ListingBasedRollbackRequest> rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(),
-        table.getMetaClient().getBasePath(), config);
+    List<ListingBasedRollbackRequest> rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(context,
+        table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), config);
     return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackRequests);
   }
 }


@@ -92,8 +92,8 @@ public class ZeroToOneUpgradeHandler implements UpgradeHandler {
     // generate rollback stats
     List<ListingBasedRollbackRequest> rollbackRequests;
     if (table.getMetaClient().getTableType() == HoodieTableType.COPY_ON_WRITE) {
-      rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
-          table.getConfig());
+      rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(context, table.getMetaClient().getFs(),
+          table.getMetaClient().getBasePath(), table.getConfig());
     } else {
       rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(commitInstantOpt.get(), table, context);
     }


@@ -101,7 +101,7 @@ public class TestClientRollback extends HoodieClientTestBase {
     assertNoWriteErrors(statuses);
     HoodieWriteConfig config = getConfig();
     List<String> partitionPaths =
-        FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), config.useFileListingMetadata(),
+        FSUtils.getAllPartitionPaths(context, fs, cfg.getBasePath(), config.useFileListingMetadata(),
             config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning());
     metaClient = HoodieTableMetaClient.reload(metaClient);
     HoodieSparkTable table = HoodieSparkTable.create(getConfig(), context, metaClient);


@@ -23,6 +23,7 @@ import org.apache.hudi.client.SparkRDDWriteClient;
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.client.common.HoodieSparkEngineContext;
 import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.config.SerializableConfiguration;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.metrics.Registry;
 import org.apache.hudi.common.model.FileSlice;
@@ -454,7 +455,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     records = dataGen.generateUniqueUpdates(newCommitTime, 5);
     writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
     assertNoWriteErrors(writeStatuses);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
 
     // updates and inserts
     newCommitTime = HoodieActiveTimeline.createNewInstantTime();
@@ -462,21 +463,21 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     records = dataGen.generateUpdates(newCommitTime, 10);
     writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
     assertNoWriteErrors(writeStatuses);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
 
     // Compaction
     if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) {
       newCommitTime = HoodieActiveTimeline.createNewInstantTime();
       client.scheduleCompactionAtInstant(newCommitTime, Option.empty());
       client.compact(newCommitTime);
-      assertFalse(metadata(client).isInSync());
+      assertTrue(metadata(client).isInSync());
     }
 
     // Savepoint
     restoreToInstant = newCommitTime;
     if (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) {
       client.savepoint("hoodie", "metadata test");
-      assertFalse(metadata(client).isInSync());
+      assertTrue(metadata(client).isInSync());
     }
 
     // Deletes
@@ -485,12 +486,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     JavaRDD<HoodieKey> deleteKeys = jsc.parallelize(records, 1).map(r -> r.getKey());
     client.startCommitWithTime(newCommitTime);
     client.delete(deleteKeys, newCommitTime);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
 
     // Clean
     newCommitTime = HoodieActiveTimeline.createNewInstantTime();
     client.clean(newCommitTime);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
 
     // updates
     newCommitTime = HoodieActiveTimeline.createNewInstantTime();
@@ -498,7 +499,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     records = dataGen.generateUniqueUpdates(newCommitTime, 10);
     writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
     assertNoWriteErrors(writeStatuses);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
 
     // insert overwrite to test replacecommit
     newCommitTime = HoodieActiveTimeline.createNewInstantTime();
@@ -507,7 +508,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     HoodieWriteResult replaceResult = client.insertOverwrite(jsc.parallelize(records, 1), newCommitTime);
     writeStatuses = replaceResult.getWriteStatuses().collect();
     assertNoWriteErrors(writeStatuses);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
   }
 
   // Enable metadata table and ensure it is synced
@@ -757,7 +758,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
   private void validateMetadata(SparkRDDWriteClient client) throws IOException {
     HoodieWriteConfig config = client.getConfig();
-    HoodieBackedTableMetadata tableMetadata = metadata(client);
+    HoodieTableMetadata tableMetadata = metadata(client);
     assertNotNull(tableMetadata, "MetadataReader should have been initialized");
     if (!config.useFileListingMetadata()) {
       return;
@@ -767,7 +768,9 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
 
     // Partitions should match
-    List<String> fsPartitions = FSUtils.getAllFoldersWithPartitionMetaFile(fs, basePath);
+    FileSystemBackedTableMetadata fsBackedTableMetadata = new FileSystemBackedTableMetadata(engineContext,
+        new SerializableConfiguration(hadoopConf), config.getBasePath(), config.shouldAssumeDatePartitioning());
+    List<String> fsPartitions = fsBackedTableMetadata.getAllPartitionPaths();
     List<String> metadataPartitions = tableMetadata.getAllPartitionPaths();
 
     Collections.sort(fsPartitions);
@@ -849,7 +852,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     // Metadata table has a fixed number of partitions
     // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory
     // in the .hoodie folder.
-    List<String> metadataTablePartitions = FSUtils.getAllPartitionPaths(fs, HoodieTableMetadata.getMetadataTableBasePath(basePath),
+    List<String> metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, fs, HoodieTableMetadata.getMetadataTableBasePath(basePath),
         false, false, false);
     assertEquals(MetadataPartitionType.values().length, metadataTablePartitions.size());
@@ -873,10 +876,11 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
         .create(hadoopConf, client.getConfig(), new HoodieSparkEngineContext(jsc));
   }
 
-  private HoodieBackedTableMetadata metadata(SparkRDDWriteClient client) {
+  private HoodieTableMetadata metadata(SparkRDDWriteClient client) {
     HoodieWriteConfig clientConfig = client.getConfig();
-    return (HoodieBackedTableMetadata) HoodieTableMetadata.create(hadoopConf, clientConfig.getBasePath(), clientConfig.getSpillableMapBasePath(),
-        clientConfig.useFileListingMetadata(), clientConfig.getFileListingMetadataVerify(), false, clientConfig.shouldAssumeDatePartitioning());
+    return HoodieTableMetadata.create(client.getEngineContext(), clientConfig.getBasePath(),
+        clientConfig.getSpillableMapBasePath(), clientConfig.useFileListingMetadata(),
+        clientConfig.getFileListingMetadataVerify(), false, clientConfig.shouldAssumeDatePartitioning());
   }
 
   // TODO: this can be moved to TestHarness after merge from master


@@ -39,6 +39,7 @@ public final class HoodieMetadataConfig extends DefaultHoodieConfig {
   // Validate contents of Metadata Table on each access against the actual filesystem
   public static final String METADATA_VALIDATE_PROP = METADATA_PREFIX + ".validate";
   public static final boolean DEFAULT_METADATA_VALIDATE = false;
+  public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = false;
 
   // Parallelism for inserts
   public static final String METADATA_INSERT_PARALLELISM_PROP = METADATA_PREFIX + ".insert.parallelism";
@@ -62,10 +63,6 @@ public final class HoodieMetadataConfig extends DefaultHoodieConfig {
   public static final String CLEANER_COMMITS_RETAINED_PROP = METADATA_PREFIX + ".cleaner.commits.retained";
   public static final int DEFAULT_CLEANER_COMMITS_RETAINED = 3;
 
-  // We can set the default to true for readers, as it will internally default to listing from filesystem if metadata
-  // table is not found
-  public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = true;
-
   private HoodieMetadataConfig(Properties props) {
     super(props);
   }


@@ -0,0 +1,90 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.engine;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.function.SerializableConsumer;
import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.function.SerializablePairFunction;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static java.util.stream.Collectors.toList;
import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper;
import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper;
import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper;
import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper;
/**
* A java based engine context, use this implementation on the query engine integrations if needed.
*/
public final class HoodieLocalEngineContext extends HoodieEngineContext {
public HoodieLocalEngineContext(Configuration conf) {
this(conf, new LocalTaskContextSupplier());
}
public HoodieLocalEngineContext(Configuration conf, TaskContextSupplier taskContextSupplier) {
super(new SerializableConfiguration(conf), taskContextSupplier);
}
@Override
public <I, O> List<O> map(List<I> data, SerializableFunction<I, O> func, int parallelism) {
return data.stream().parallel().map(throwingMapWrapper(func)).collect(toList());
}
@Override
public <I, O> List<O> flatMap(List<I> data, SerializableFunction<I, Stream<O>> func, int parallelism) {
return data.stream().parallel().flatMap(throwingFlatMapWrapper(func)).collect(toList());
}
@Override
public <I> void foreach(List<I> data, SerializableConsumer<I> consumer, int parallelism) {
data.stream().forEach(throwingForeachWrapper(consumer));
}
@Override
public <I, K, V> Map<K, V> mapToPair(List<I> data, SerializablePairFunction<I, K, V> func, Integer parallelism) {
return data.stream().map(throwingMapToPairWrapper(func)).collect(
Collectors.toMap(Pair::getLeft, Pair::getRight, (oldVal, newVal) -> newVal)
);
}
@Override
public void setProperty(EngineProperty key, String value) {
// no operation for now
}
@Override
public Option<String> getProperty(EngineProperty key) {
return Option.empty();
}
@Override
public void setJobStatus(String activeModule, String activityDescription) {
// no operation for now
}
}
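As a side note, a minimal sketch of how the new local context's map API behaves, assuming only the class shown above (inputs and parallelism value below are illustrative): it runs the function over a Java parallel stream, so the parallelism argument is only a hint on this implementation.

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;

import java.util.Arrays;
import java.util.List;

public class LocalContextExample {
  public static void main(String[] args) {
    HoodieLocalEngineContext context = new HoodieLocalEngineContext(new Configuration());
    // map() applies the function via a parallel Java stream on the driver-local JVM.
    List<Integer> lengths = context.map(Arrays.asList("2019/01/01", "2020/01/02"), String::length, 2);
    System.out.println(lengths);
  }
}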


@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.engine;
import org.apache.hudi.common.util.Option;
import java.util.function.Supplier;
public final class LocalTaskContextSupplier extends TaskContextSupplier {
@Override
public Supplier<Integer> getPartitionIdSupplier() {
return () -> 0;
}
@Override
public Supplier<Integer> getStageIdSupplier() {
return () -> 0;
}
@Override
public Supplier<Long> getAttemptIdSupplier() {
return () -> 0L;
}
@Override
public Option<String> getProperty(EngineProperty prop) {
return Option.empty();
}
}


@@ -19,6 +19,7 @@
 package org.apache.hudi.common.fs;
 
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -252,13 +253,14 @@ public class FSUtils {
     }
   }
 
-  public static List<String> getAllPartitionPaths(FileSystem fs, String basePathStr, boolean useFileListingFromMetadata, boolean verifyListings,
+  public static List<String> getAllPartitionPaths(HoodieEngineContext engineContext, FileSystem fs, String basePathStr,
+                                                  boolean useFileListingFromMetadata, boolean verifyListings,
                                                   boolean assumeDatePartitioning) throws IOException {
     if (assumeDatePartitioning) {
       return getAllPartitionFoldersThreeLevelsDown(fs, basePathStr);
     } else {
-      HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(fs.getConf(), basePathStr, "/tmp/", useFileListingFromMetadata,
-          verifyListings, false, false);
+      HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, basePathStr, "/tmp/",
+          useFileListingFromMetadata, verifyListings, false, false);
       return tableMetadata.getAllPartitionPaths();
     }
   }


@@ -19,6 +19,7 @@
 package org.apache.hudi.common.table.view;
 
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.util.Functions.Function2;
@@ -159,12 +160,11 @@ public class FileSystemViewManager {
     return new HoodieTableFileSystemView(metaClient, timeline, viewConf.isIncrementalTimelineSyncEnabled());
   }
 
-  public static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieTableMetaClient metaClient,
-                                                                       boolean useFileListingFromMetadata,
-                                                                       boolean verifyListings) {
+  public static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieEngineContext engineContext,
+      HoodieTableMetaClient metaClient, boolean useFileListingFromMetadata, boolean verifyListings) {
     LOG.info("Creating InMemory based view for basePath " + metaClient.getBasePath());
     if (useFileListingFromMetadata) {
-      return new HoodieMetadataFileSystemView(metaClient,
+      return new HoodieMetadataFileSystemView(engineContext, metaClient,
           metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(),
           true,
           verifyListings);


@@ -23,6 +23,7 @@ import org.apache.avro.Schema;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieMetadataRecord;
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.metrics.Registry;
 import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -33,7 +34,6 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.util.HoodieTimer;
 import org.apache.hudi.common.util.Option;
 
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
@@ -54,6 +54,7 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
   static final long MAX_MEMORY_SIZE_IN_BYTES = 1024 * 1024 * 1024;
   static final int BUFFER_SIZE = 10 * 1024 * 1024;
 
+  protected final transient HoodieEngineContext engineContext;
   protected final SerializableConfiguration hadoopConf;
   protected final String datasetBasePath;
   protected boolean enabled;
@@ -66,10 +67,11 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
   protected final String spillableMapDirectory;
   private transient HoodieMetadataMergedInstantRecordScanner timelineRecordScanner;
 
-  protected BaseTableMetadata(Configuration hadoopConf, String datasetBasePath, String spillableMapDirectory,
+  protected BaseTableMetadata(HoodieEngineContext engineContext, String datasetBasePath, String spillableMapDirectory,
                               boolean enabled, boolean validateLookups, boolean enableMetrics,
                               boolean assumeDatePartitioning) {
-    this.hadoopConf = new SerializableConfiguration(hadoopConf);
+    this.engineContext = engineContext;
+    this.hadoopConf = new SerializableConfiguration(engineContext.getHadoopConf());
     this.datasetBasePath = datasetBasePath;
     this.spillableMapDirectory = spillableMapDirectory;
@@ -102,7 +104,8 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
LOG.error("Failed to retrieve list of partition from metadata", e); LOG.error("Failed to retrieve list of partition from metadata", e);
} }
} }
return new FileSystemBackedTableMetadata(hadoopConf, datasetBasePath, assumeDatePartitioning).getAllPartitionPaths(); return new FileSystemBackedTableMetadata(engineContext, hadoopConf, datasetBasePath,
assumeDatePartitioning).getAllPartitionPaths();
} }
/** /**
@@ -155,7 +158,8 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
     if (validateLookups) {
       // Validate the Metadata Table data by listing the partitions from the file system
       timer.startTimer();
-      FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(hadoopConf, datasetBasePath, assumeDatePartitioning);
+      FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(engineContext,
+          hadoopConf, datasetBasePath, assumeDatePartitioning);
       List<String> actualPartitions = fileSystemBackedTableMetadata.getAllPartitionPaths();
       metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.VALIDATE_PARTITIONS_STR, timer.endTimer()));
@@ -290,4 +294,8 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
   protected void closeReaders() {
     timelineRecordScanner = null;
   }
 
+  protected HoodieEngineContext getEngineContext() {
+    return engineContext;
+  }
 }


@@ -19,23 +19,36 @@
 package org.apache.hudi.metadata;
 
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodiePartitionMetadata;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.collection.Pair;
 
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedList;
 import java.util.List;
+import java.util.stream.Collectors;
 
 public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
 
+  private static final int DEFAULT_LISTING_PARALLELISM = 1500;
+
+  private final transient HoodieEngineContext engineContext;
   private final SerializableConfiguration hadoopConf;
   private final String datasetBasePath;
   private final boolean assumeDatePartitioning;
 
-  public FileSystemBackedTableMetadata(SerializableConfiguration conf, String datasetBasePath, boolean assumeDatePartitioning) {
+  public FileSystemBackedTableMetadata(HoodieEngineContext engineContext, SerializableConfiguration conf, String datasetBasePath,
+                                       boolean assumeDatePartitioning) {
+    this.engineContext = engineContext;
     this.hadoopConf = conf;
     this.datasetBasePath = datasetBasePath;
     this.assumeDatePartitioning = assumeDatePartitioning;
@@ -49,12 +62,47 @@ public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
   @Override
   public List<String> getAllPartitionPaths() throws IOException {
-    FileSystem fs = new Path(datasetBasePath).getFileSystem(hadoopConf.get());
     if (assumeDatePartitioning) {
+      FileSystem fs = new Path(datasetBasePath).getFileSystem(hadoopConf.get());
       return FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, datasetBasePath);
-    } else {
-      return FSUtils.getAllFoldersWithPartitionMetaFile(fs, datasetBasePath);
     }
+
+    List<Path> pathsToList = new LinkedList<>();
+    pathsToList.add(new Path(datasetBasePath));
+    List<String> partitionPaths = new ArrayList<>();
+
+    while (!pathsToList.isEmpty()) {
+      // TODO: Get the parallelism from HoodieWriteConfig
+      int listingParallelism = Math.min(DEFAULT_LISTING_PARALLELISM, pathsToList.size());
+      // List all directories in parallel
+      List<Pair<Path, FileStatus[]>> dirToFileListing = engineContext.map(pathsToList, path -> {
+        FileSystem fileSystem = path.getFileSystem(hadoopConf.get());
+        return Pair.of(path, fileSystem.listStatus(path));
+      }, listingParallelism);
+      pathsToList.clear();
+
+      // If the listing reveals a directory, add it to queue. If the listing reveals a hoodie partition, add it to
+      // the results.
+      dirToFileListing.forEach(p -> {
+        Option<FileStatus> partitionMetaFile = Option.fromJavaOptional(Arrays.stream(p.getRight()).parallel()
+            .filter(fs -> fs.getPath().getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE))
+            .findFirst());
+
+        if (partitionMetaFile.isPresent()) {
+          // Is a partition.
+          String partitionName = FSUtils.getRelativePartitionPath(new Path(datasetBasePath), p.getLeft());
+          partitionPaths.add(partitionName);
+        } else {
+          // Add sub-dirs to the queue
+          pathsToList.addAll(Arrays.stream(p.getRight())
+              .filter(fs -> fs.isDirectory() && !fs.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME))
+              .map(fs -> fs.getPath())
+              .collect(Collectors.toList()));
+        }
+      });
+    }
+    return partitionPaths;
   }
 
   @Override
@@ -64,6 +112,6 @@ public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
   @Override
   public boolean isInSync() {
-    throw new UnsupportedOperationException();
+    return true;
   }
 }
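To illustrate the new listing path end to end, here is a hedged sketch of exercising FileSystemBackedTableMetadata directly with the local engine context; the table location is hypothetical, while the constructor and method signatures follow the diff above (the test class later in this commit does essentially the same thing).

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.metadata.FileSystemBackedTableMetadata;

import java.util.List;

public class FsBackedListingExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical table location; any Hudi table whose partitions carry the partition metafile will do.
    String basePath = "/tmp/hoodie_table";
    Configuration conf = new Configuration();
    HoodieLocalEngineContext context = new HoodieLocalEngineContext(conf);
    FileSystemBackedTableMetadata metadata = new FileSystemBackedTableMetadata(
        context, new SerializableConfiguration(conf), basePath, false /* assumeDatePartitioning */);
    // Walks the directory tree breadth-first, listing each level in parallel through the engine context.
    List<String> partitions = metadata.getAllPartitionPaths();
    partitions.forEach(System.out::println);
  }
}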


@@ -21,6 +21,8 @@ package org.apache.hudi.metadata;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieMetadataRecord;
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
 import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieLogFile;
@@ -70,15 +72,15 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata {
   private transient HoodieFileReader<GenericRecord> baseFileReader;
   private transient HoodieMetadataMergedLogRecordScanner logRecordScanner;
 
-  public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory,
-                                   boolean enabled, boolean validateLookups, boolean assumeDatePartitioning) {
-    this(conf, datasetBasePath, spillableMapDirectory, enabled, validateLookups, false, assumeDatePartitioning);
+  public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory, boolean enabled,
+                                   boolean validateLookups, boolean assumeDatePartitioning) {
+    this(new HoodieLocalEngineContext(conf), datasetBasePath, spillableMapDirectory, enabled, validateLookups,
+        false, assumeDatePartitioning);
   }
 
-  public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory,
-                                   boolean enabled, boolean validateLookups, boolean enableMetrics,
-                                   boolean assumeDatePartitioning) {
-    super(conf, datasetBasePath, spillableMapDirectory, enabled, validateLookups, enableMetrics, assumeDatePartitioning);
+  public HoodieBackedTableMetadata(HoodieEngineContext engineContext, String datasetBasePath, String spillableMapDirectory,
+                                   boolean enabled, boolean validateLookups, boolean enableMetrics, boolean assumeDatePartitioning) {
+    super(engineContext, datasetBasePath, spillableMapDirectory, enabled, validateLookups, enableMetrics, assumeDatePartitioning);
     this.metadataBasePath = HoodieTableMetadata.getMetadataTableBasePath(datasetBasePath);
     if (enabled) {
       try {


@@ -22,6 +22,8 @@ import java.io.IOException;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
@@ -40,12 +42,13 @@ public class HoodieMetadataFileSystemView extends HoodieTableFileSystemView {
     this.tableMetadata = tableMetadata;
   }
 
-  public HoodieMetadataFileSystemView(HoodieTableMetaClient metaClient,
+  public HoodieMetadataFileSystemView(HoodieEngineContext engineContext,
+                                      HoodieTableMetaClient metaClient,
                                       HoodieTimeline visibleActiveTimeline,
                                       boolean useFileListingFromMetadata,
                                       boolean verifyListings) {
     super(metaClient, visibleActiveTimeline);
-    this.tableMetadata = HoodieTableMetadata.create(metaClient.getHadoopConf(), metaClient.getBasePath(),
+    this.tableMetadata = HoodieTableMetadata.create(engineContext, metaClient.getBasePath(),
         FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR, useFileListingFromMetadata, verifyListings,
         false, false);
   }


@@ -18,10 +18,11 @@
 package org.apache.hudi.metadata;
 
+import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.util.Option;
 
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
@@ -68,10 +69,16 @@ public interface HoodieTableMetadata extends Serializable {
     return basePath.endsWith(METADATA_TABLE_REL_PATH);
   }
 
-  static HoodieTableMetadata create(Configuration conf, String datasetBasePath, String spillableMapPath, boolean useFileListingFromMetadata,
-                                    boolean verifyListings, boolean enableMetrics, boolean shouldAssumeDatePartitioning) {
-    return new HoodieBackedTableMetadata(conf, datasetBasePath, spillableMapPath, useFileListingFromMetadata, verifyListings,
-        enableMetrics, shouldAssumeDatePartitioning);
+  static HoodieTableMetadata create(HoodieEngineContext engineContext, String datasetBasePath,
+                                    String spillableMapPath, boolean useFileListingFromMetadata, boolean verifyListings,
+                                    boolean enableMetrics, boolean shouldAssumeDatePartitioning) {
+    if (useFileListingFromMetadata) {
+      return new HoodieBackedTableMetadata(engineContext, datasetBasePath, spillableMapPath, useFileListingFromMetadata,
+          verifyListings, enableMetrics, shouldAssumeDatePartitioning);
+    } else {
+      return new FileSystemBackedTableMetadata(engineContext, new SerializableConfiguration(engineContext.getHadoopConf()),
+          datasetBasePath, shouldAssumeDatePartitioning);
+    }
   }
 
   /**


@@ -0,0 +1,174 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.metadata;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
import org.apache.hudi.common.testutils.HoodieTestTable;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.IntStream;
public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness {
private static final String DEFAULT_PARTITION = "";
private static final List<String> DATE_PARTITIONS = Arrays.asList("2019/01/01", "2020/01/02", "2021/03/01");
private static final List<String> ONE_LEVEL_PARTITIONS = Arrays.asList("2019", "2020", "2021");
private static final List<String> MULTI_LEVEL_PARTITIONS = Arrays.asList("2019/01", "2020/01", "2021/01");
private static HoodieTestTable hoodieTestTable;
@BeforeEach
public void setUp() throws IOException {
initMetaClient();
hoodieTestTable = HoodieTestTable.of(metaClient);
}
@AfterEach
public void tearDown() throws IOException {
metaClient.getFs().delete(new Path(metaClient.getBasePath()), true);
}
/**
* Test non partition hoodie table.
* @throws Exception
*/
@Test
public void testNonPartitionedTable() throws Exception {
// Generate 10 files under basepath
hoodieTestTable.addCommit("100").withBaseFilesInPartition(DEFAULT_PARTITION, IntStream.range(0, 10).toArray());
HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 0);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath)).length == 10);
}
/**
* Test partition listing results for date-based partitions.
* @throws Exception
*/
@Test
public void testDatePartitionedTable() throws Exception {
String instant = "100";
hoodieTestTable = hoodieTestTable.addCommit(instant);
// Generate 10 files under each partition
DATE_PARTITIONS.stream().forEach(p -> {
try {
hoodieTestTable = hoodieTestTable.withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
} catch (Exception e) {
throw new RuntimeException(e);
}
});
HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, true);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + DATE_PARTITIONS.get(0))).length == 10);
}
/**
* Test partition listing results for date-based partitions with assumeDatePartitioning = false.
* @throws Exception
*/
@Test
public void testDatePartitionedTableWithAssumeDateIsFalse() throws Exception {
String instant = "100";
hoodieTestTable = hoodieTestTable.addCommit(instant);
// Generate 10 files under each partition
DATE_PARTITIONS.stream().forEach(p -> {
try {
hoodieTestTable = hoodieTestTable.withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
} catch (Exception e) {
throw new RuntimeException(e);
}
});
HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 0);
}
@Test
public void testOneLevelPartitionedTable() throws Exception {
String instant = "100";
hoodieTestTable = hoodieTestTable.addCommit(instant);
// Generate 10 files under each partition
ONE_LEVEL_PARTITIONS.stream().forEach(p -> {
try {
hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p)
.withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
} catch (Exception e) {
throw new RuntimeException(e);
}
});
HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + ONE_LEVEL_PARTITIONS.get(0))).length == 10);
}
@Test
public void testMultiLevelPartitionedTable() throws Exception {
String instant = "100";
hoodieTestTable = hoodieTestTable.addCommit(instant);
// Generate 10 files under each partition
MULTI_LEVEL_PARTITIONS.stream().forEach(p -> {
try {
hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p)
.withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
} catch (Exception e) {
throw new RuntimeException(e);
}
});
HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length == 10);
}
@Test
public void testMultiLevelEmptyPartitionTable() throws Exception {
String instant = "100";
hoodieTestTable = hoodieTestTable.addCommit(instant);
// Generate 10 files under each partition
MULTI_LEVEL_PARTITIONS.stream().forEach(p -> {
try {
hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p);
} catch (Exception e) {
throw new RuntimeException(e);
}
});
HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length == 0);
}
}
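
The tests above exercise FileSystemBackedTableMetadata with a HoodieLocalEngineContext, which runs the listing on the driver. A hedged sketch of the same calls driven by a Spark engine context instead, so the per-partition listings can be fanned out; the JavaSparkContext variable jsc and the basePath value are assumptions for illustration, and the constructor and method signatures are taken from the tests above.

    // Sketch: same API as the tests above, but with a Spark-backed engine context.
    // 'jsc' (a JavaSparkContext) and 'basePath' are assumed to be supplied by the caller.
    static List<String> listPartitionsWithSpark(JavaSparkContext jsc, String basePath) throws Exception {
      HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
      FileSystemBackedTableMetadata metadata = new FileSystemBackedTableMetadata(
          engineContext, new SerializableConfiguration(jsc.hadoopConfiguration()), basePath, false);
      return metadata.getAllPartitionPaths();
    }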

View File

@@ -22,6 +22,7 @@ import java.util.Map;
 import java.util.Set;
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -172,8 +173,9 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
   boolean useFileListingFromMetadata = getConf().getBoolean(METADATA_ENABLE_PROP, DEFAULT_METADATA_ENABLE_FOR_READERS);
   boolean verifyFileListing = getConf().getBoolean(METADATA_VALIDATE_PROP, DEFAULT_METADATA_VALIDATE);
-  HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(metaClient,
-      useFileListingFromMetadata, verifyFileListing);
+  HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(conf.get());
+  HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext,
+      metaClient, useFileListingFromMetadata, verifyFileListing);
   String partition = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), folder);
   List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles(partition).collect(Collectors.toList());
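
Pulled out of the hunk above as a standalone sketch: read paths that used to build an in-memory file system view straight from the meta client now pass an engine context first. The metaClient, partition, and flag variables are assumed to be in scope; the call signature itself comes from the hunk.

    // Sketch of the updated reader-side call from the hunk above; 'hadoopConf',
    // 'metaClient', 'partition' and the two boolean flags are assumed to be in scope.
    HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(hadoopConf);
    HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(
        engineContext, metaClient, useFileListingFromMetadata, verifyFileListing);
    List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles(partition).collect(Collectors.toList());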

View File

@@ -18,6 +18,7 @@
 package org.apache.hudi.hadoop.utils;
+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
@@ -427,8 +428,9 @@ public class HoodieInputFormatUtils {
   boolean useFileListingFromMetadata = job.getBoolean(METADATA_ENABLE_PROP, DEFAULT_METADATA_ENABLE_FOR_READERS);
   boolean verifyFileListing = job.getBoolean(METADATA_VALIDATE_PROP, DEFAULT_METADATA_VALIDATE);
-  HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(metaClient,
-      useFileListingFromMetadata, verifyFileListing);
+  HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(job);
+  HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext,
+      metaClient, useFileListingFromMetadata, verifyFileListing);
   List<HoodieBaseFile> filteredBaseFiles = new ArrayList<>();
   for (Path p : paths) {

View File

@@ -18,6 +18,7 @@
 package org.apache.hudi.hadoop.utils;
+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieBaseFile;
@@ -81,9 +82,9 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
       // for each partition path obtain the data & log file groupings, then map back to inputsplits
       HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath);
       if (!fsCache.containsKey(metaClient)) {
-        HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(metaClient,
-            useFileListingFromMetadata, verifyFileListing);
+        HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(conf);
+        HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext,
+            metaClient, useFileListingFromMetadata, verifyFileListing);
         fsCache.put(metaClient, fsView);
       }
       HoodieTableFileSystemView fsView = fsCache.get(metaClient);

View File

@@ -19,6 +19,7 @@
 package org.apache.hudi.integ.testsuite.reader;
 import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.client.common.HoodieSparkEngineContext;
 import org.apache.hudi.common.config.HoodieMetadataConfig;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.FileSlice;
@@ -86,7 +87,8 @@ public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader {
     // Using FSUtils.getFS here instead of metaClient.getFS() since we dont want to count these listStatus
     // calls in metrics as they are not part of normal HUDI operation.
     FileSystem fs = FSUtils.getFs(metaClient.getBasePath(), metaClient.getHadoopConf());
-    List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, metaClient.getBasePath(),
+    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
+    List<String> partitionPaths = FSUtils.getAllPartitionPaths(engineContext, fs, metaClient.getBasePath(),
         HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false);
     // Sort partition so we can pick last N partitions by default
     Collections.sort(partitionPaths);

View File

@@ -373,7 +373,7 @@ public class TestBootstrap extends HoodieClientTestBase {
     reloadInputFormats();
     List<GenericRecord> records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
         jsc.hadoopConfiguration(),
-        FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+        FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
             HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
             .map(f -> basePath + "/" + f).collect(Collectors.toList()),
         basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>());
@@ -392,7 +392,7 @@ public class TestBootstrap extends HoodieClientTestBase {
     seenKeys = new HashSet<>();
     records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
         jsc.hadoopConfiguration(),
-        FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+        FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
             HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
             .map(f -> basePath + "/" + f).collect(Collectors.toList()),
         basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>());
@@ -409,7 +409,7 @@ public class TestBootstrap extends HoodieClientTestBase {
     reloadInputFormats();
     records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
         jsc.hadoopConfiguration(),
-        FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+        FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
             HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
             .map(f -> basePath + "/" + f).collect(Collectors.toList()),
         basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES,
@@ -427,7 +427,7 @@ public class TestBootstrap extends HoodieClientTestBase {
     seenKeys = new HashSet<>();
     records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
         jsc.hadoopConfiguration(),
-        FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+        FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
             HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
             .map(f -> basePath + "/" + f).collect(Collectors.toList()),
         basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, true,
@@ -443,7 +443,7 @@ public class TestBootstrap extends HoodieClientTestBase {
     reloadInputFormats();
     records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
         jsc.hadoopConfiguration(),
-        FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+        FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
             HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
             .map(f -> basePath + "/" + f).collect(Collectors.toList()),
         basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES, true,
@@ -461,7 +461,7 @@ public class TestBootstrap extends HoodieClientTestBase {
     seenKeys = new HashSet<>();
     records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
         jsc.hadoopConfiguration(),
-        FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+        FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
             HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
             .map(f -> basePath + "/" + f).collect(Collectors.toList()),
         basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, true,

View File

@@ -18,10 +18,7 @@
 package org.apache.hudi.functional
 import java.sql.{Date, Timestamp}
-import java.util.function.Supplier
-import java.util.stream.Stream
-import org.apache.hadoop.fs.Path
 import org.apache.hudi.common.config.HoodieMetadataConfig
 import org.apache.hudi.common.table.HoodieTableMetaClient
 import org.apache.hudi.common.table.timeline.HoodieInstant

View File

@@ -20,6 +20,8 @@ package org.apache.hudi.sync.common;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieTableType;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
@@ -127,7 +129,9 @@ public abstract class AbstractSyncHoodieClient {
     if (!lastCommitTimeSynced.isPresent()) {
       LOG.info("Last commit time synced is not known, listing all partitions in " + basePath + ",FS :" + fs);
       try {
-        return FSUtils.getAllPartitionPaths(fs, basePath, useFileListingFromMetadata, verifyMetadataFileListing, assumeDatePartitioning);
+        HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
+        return FSUtils.getAllPartitionPaths(engineContext, fs, basePath, useFileListingFromMetadata, verifyMetadataFileListing,
+            assumeDatePartitioning);
       } catch (IOException e) {
         throw new HoodieIOException("Failed to list all partitions in " + basePath, e);
       }
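
For call sites like this one that only have a Hadoop configuration, the new FSUtils.getAllPartitionPaths overload simply takes a HoodieLocalEngineContext up front; the remaining arguments keep their old order. A small sketch, with the fs, basePath, and flag variables assumed from the surrounding method:

    // Sketch of the new FSUtils overload used above; 'fs', 'basePath' and the three
    // boolean flags are assumed to be defined by the caller.
    HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
    List<String> partitionPaths = FSUtils.getAllPartitionPaths(
        engineContext, fs, basePath, useFileListingFromMetadata, verifyMetadataFileListing, assumeDatePartitioning);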

View File

@@ -86,6 +86,7 @@ public class HoodieSnapshotCopier implements Serializable {
     final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), baseDir);
     final BaseFileOnlyView fsView = new HoodieTableFileSystemView(tableMetadata,
         tableMetadata.getActiveTimeline().getCommitsAndCompactionTimeline().filterCompletedInstants());
+    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
     // Get the latest commit
     Option<HoodieInstant> latestCommit =
         tableMetadata.getActiveTimeline().getCommitsAndCompactionTimeline().filterCompletedInstants().lastInstant();
@@ -97,7 +98,7 @@ public class HoodieSnapshotCopier implements Serializable {
     LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.",
         latestCommitTimestamp));
-    List<String> partitions = FSUtils.getAllPartitionPaths(fs, baseDir, useFileListingFromMetadata, verifyMetadataFileListing, shouldAssumeDatePartitioning);
+    List<String> partitions = FSUtils.getAllPartitionPaths(context, fs, baseDir, useFileListingFromMetadata, verifyMetadataFileListing, shouldAssumeDatePartitioning);
     if (partitions.size() > 0) {
       LOG.info(String.format("The job needs to copy %d partitions.", partitions.size()));
@@ -108,7 +109,6 @@ public class HoodieSnapshotCopier implements Serializable {
       fs.delete(new Path(outputDir), true);
     }
-    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
     context.setJobStatus(this.getClass().getSimpleName(), "Creating a snapshot");
     List<Tuple2<String, String>> filesToCopy = context.flatMap(partitions, partition -> {

View File

@@ -117,6 +117,7 @@ public class HoodieSnapshotExporter {
   public void export(JavaSparkContext jsc, Config cfg) throws IOException {
     FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
+    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
     if (outputPathExists(fs, cfg)) {
       throw new HoodieSnapshotExporterException("The target output path already exists.");
@@ -128,7 +129,7 @@ public class HoodieSnapshotExporter {
     LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.",
         latestCommitTimestamp));
-    final List<String> partitions = getPartitions(fs, cfg);
+    final List<String> partitions = getPartitions(engineContext, fs, cfg);
     if (partitions.isEmpty()) {
       throw new HoodieSnapshotExporterException("The source dataset has 0 partition to snapshot.");
     }
@@ -153,8 +154,8 @@ public class HoodieSnapshotExporter {
     return latestCommit.isPresent() ? Option.of(latestCommit.get().getTimestamp()) : Option.empty();
   }
-  private List<String> getPartitions(FileSystem fs, Config cfg) throws IOException {
-    return FSUtils.getAllPartitionPaths(fs, cfg.sourceBasePath, true, false, false);
+  private List<String> getPartitions(HoodieEngineContext engineContext, FileSystem fs, Config cfg) throws IOException {
+    return FSUtils.getAllPartitionPaths(engineContext, fs, cfg.sourceBasePath, true, false, false);
   }
   private void createSuccessTag(FileSystem fs, Config cfg) throws IOException {

View File

@@ -85,13 +85,14 @@ public class TimelineServerPerf implements Serializable {
   }
   public void run() throws IOException {
-    List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(timelineServer.getFs(), cfg.basePath, cfg.useFileListingFromMetadata,
-        cfg.verifyMetadataFileListing, true);
+    JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-view-perf-" + cfg.basePath, cfg.sparkMaster);
+    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
+    List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(engineContext, timelineServer.getFs(), cfg.basePath,
+        cfg.useFileListingFromMetadata, cfg.verifyMetadataFileListing, true);
     Collections.shuffle(allPartitionPaths);
     List<String> selected = allPartitionPaths.stream().filter(p -> !p.contains("error")).limit(cfg.maxPartitions)
         .collect(Collectors.toList());
-    JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-view-perf-" + cfg.basePath, cfg.sparkMaster);
     if (!useExternalTimelineServer) {
       this.timelineServer.startService();
       setHostAddrFromSparkConf(jsc.getConf());